; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512F-ONLY,AVX512F-SLOW,FALLBACK0
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512F-ONLY,AVX512F-FAST,FALLBACK1
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ,AVX512DQ-SLOW,FALLBACK2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512DQ,AVX512DQ-FAST,FALLBACK3
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW,AVX512BW-ONLY,AVX512BW-SLOW,FALLBACK4
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW,AVX512BW-ONLY,AVX512BW-FAST,FALLBACK5
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512vbmi | FileCheck %s --check-prefixes=AVX512,AVX512BW,AVX512VBMI-ONLY,AVX512VBMI-SLOW,FALLBACK6
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512vbmi,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW,AVX512VBMI-ONLY,AVX512VBMI-FAST,FALLBACK7

define void @mask_replication_factor2_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor2_vf2:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512F-ONLY-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX512F-ONLY-NEXT: vptestmd %xmm0, %xmm0, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %xmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor2_vf2:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %xmm0
; AVX512DQ-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX512DQ-NEXT: vpmovd2m %xmm0, %k1
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %xmm0 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor2_vf2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovq (%rdi), %k1
; AVX512BW-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512BW-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX512BW-NEXT: vptestmd %xmm0, %xmm0, %k1
; AVX512BW-NEXT: vmovdqa32 (%rsi), %xmm0 {%k1} {z}
; AVX512BW-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512BW-NEXT: retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <2 x i32> <i32 0, i32 1>
  %tgt.mask = shufflevector <2 x i1> %src.mask, <2 x i1> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
  %data = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %in.vec, i32 64, <4 x i1> %tgt.mask, <4 x i32> poison)
  %data.padded = shufflevector <4 x i32> %data, <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  store <4 x i32> %data, ptr %out.vec, align 64
  ret void
}

define void @mask_replication_factor2_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor2_vf4:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512F-ONLY-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX512F-ONLY-NEXT: vptestmd %ymm0, %ymm0, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor2_vf4:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovb (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %ymm0
; AVX512DQ-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX512DQ-NEXT: vpmovd2m %ymm0, %k1
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor2_vf4:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovw (%rdi), %k1
; AVX512BW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512BW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512BW-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX512BW-NEXT: vptestmd %ymm0, %ymm0, %k1
; AVX512BW-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z}
; AVX512BW-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %tgt.mask = shufflevector <4 x i1> %src.mask, <4 x i1> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
  %data = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr %in.vec, i32 64, <8 x i1> %tgt.mask, <8 x i32> poison)
  %data.padded = shufflevector <8 x i32> %data, <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  store <8 x i32> %data, ptr %out.vec, align 64
  ret void
}

define void @mask_replication_factor2_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor2_vf8:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor2_vf8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovb (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor2_vf8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovw (%rdi), %k1
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %tgt.mask = shufflevector <8 x i1> %src.mask, <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
  %data = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr %in.vec, i32 64, <16 x i1> %tgt.mask, <16 x i32> poison)
  store <16 x i32> %data, ptr %out.vec, align 64
  ret void
}

define void @mask_replication_factor2_vf16(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor2_vf16:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k2
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor2_vf16:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z}
; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor2_vf16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovw (%rdi), %k0
; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpmovw2m %zmm0, %k1
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <32 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
  %data = call <32 x i32> @llvm.masked.load.v32i32.p0(ptr %in.vec, i32 64, <32 x i1> %tgt.mask, <32 x i32> poison)
  store <32 x i32> %data, ptr %out.vec, align 64
  ret void
}

define void @mask_replication_factor2_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor2_vf32:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k2
; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2
; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k2
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k3
; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k4
; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor2_vf32:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: kmovw 2(%rdi), %k1
; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2
; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3
; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k4
; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z}
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z}
; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-ONLY-LABEL: mask_replication_factor2_vf32:
; AVX512BW-ONLY: # %bb.0:
; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0
; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,2,3,2,3]
; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1
; AVX512BW-ONLY-NEXT: kshiftrq $16, %k1, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512BW-ONLY-NEXT: kshiftrq $48, %k1, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512BW-ONLY-NEXT: vzeroupper
; AVX512BW-ONLY-NEXT: retq
;
; AVX512VBMI-ONLY-LABEL: mask_replication_factor2_vf32:
; AVX512VBMI-ONLY: # %bb.0:
; AVX512VBMI-ONLY-NEXT: kmovq (%rdi), %k0
; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0
; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1
; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512VBMI-ONLY-NEXT: vzeroupper
; AVX512VBMI-ONLY-NEXT: retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
  %data = call <64 x i32> @llvm.masked.load.v64i32.p0(ptr %in.vec, i32 64, <64 x i1> %tgt.mask, <64 x i32> poison)
  store <64 x i32> %data, ptr %out.vec, align 64
  ret void
}

define void @mask_replication_factor2_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor2_vf64:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k3
; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k5
; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k4
; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2
; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512F-ONLY-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k2
; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k4} {z} = -1
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm3
; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k4
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k6
; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k5} {z} = -1
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm3
; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k5
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k7
; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k3} {z} = -1
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k7} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k5} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k6} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k4} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
; AVX512F-ONLY-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor2_vf64:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: kmovw 2(%rdi), %k5
; AVX512DQ-NEXT: kmovw 4(%rdi), %k3
; AVX512DQ-NEXT: kmovw 6(%rdi), %k1
; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2
; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1
; AVX512DQ-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2
; AVX512DQ-NEXT: vpmovm2d %k3, %zmm0
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm3
; AVX512DQ-NEXT: vpmovd2m %zmm3, %k3
; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k4
; AVX512DQ-NEXT: vpmovm2d %k5, %zmm0
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm3
; AVX512DQ-NEXT: vpmovd2m %zmm3, %k5
; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k6
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k7
; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k7} {z}
; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k6} {z}
; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k5} {z}
; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z}
; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z}
; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm7, 384(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm6, 448(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm5, 256(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm4, 320(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-ONLY-LABEL: mask_replication_factor2_vf64:
; AVX512BW-ONLY: # %bb.0:
; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0
; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,4,5,6,7,6,7]
; AVX512BW-ONLY-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512BW-ONLY-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm1, %zmm1
; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,2,3,2,3]
; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm0, %zmm0
; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k2
; AVX512BW-ONLY-NEXT: kshiftrq $16, %k2, %k3
; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
; AVX512BW-ONLY-NEXT: kshiftrq $48, %k2, %k3
; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z}
; AVX512BW-ONLY-NEXT: kshiftrq $16, %k1, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z}
; AVX512BW-ONLY-NEXT: kshiftrq $48, %k1, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512BW-ONLY-NEXT: vzeroupper
; AVX512BW-ONLY-NEXT: retq
;
; AVX512VBMI-ONLY-LABEL: mask_replication_factor2_vf64:
; AVX512VBMI-ONLY: # %bb.0:
; AVX512VBMI-ONLY-NEXT: kmovq (%rdi), %k0
; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0
; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1
; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k1
; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k2
; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k2, %k3
; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k2, %k3
; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z}
; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z}
; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512VBMI-ONLY-NEXT: vzeroupper
; AVX512VBMI-ONLY-NEXT: retq
  %src.mask = load <64 x i1>, ptr %in.maskvec, align 64
  %tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <128 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31, i32 32, i32 32, i32 33, i32 33, i32 34, i32 34, i32 35, i32 35, i32 36, i32 36, i32 37, i32 37, i32 38, i32 38, i32 39, i32 39, i32 40, i32 40, i32 41, i32 41, i32 42, i32 42, i32 43, i32 43, i32 44, i32 44, i32 45, i32 45, i32 46, i32 46, i32 47, i32 47, i32 48, i32 48, i32 49, i32 49, i32 50, i32 50, i32 51, i32 51, i32 52, i32 52, i32 53, i32 53, i32 54, i32 54, i32 55, i32 55, i32 56, i32 56, i32 57, i32 57, i32 58, i32 58, i32 59, i32 59, i32 60, i32 60, i32 61, i32 61, i32 62, i32 62, i32 63, i32 63>
  %data = call <128 x i32> @llvm.masked.load.v128i32.p0(ptr %in.vec, i32 64, <128 x i1> %tgt.mask, <128 x i32> poison)
  store <128 x i32> %data, ptr %out.vec, align 64
  ret void
}

define void @mask_replication_factor3_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor3_vf2:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512F-ONLY-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,1,1,0,0]
; AVX512F-ONLY-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX512F-ONLY-NEXT: vpslld $31, %ymm0, %ymm0
; AVX512F-ONLY-NEXT: movb $63, %al
; AVX512F-ONLY-NEXT: kmovw %eax, %k1
; AVX512F-ONLY-NEXT: vptestmd %ymm0, %ymm0, %k1 {%k1}
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-ONLY-NEXT: vmovq %xmm1, 16(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor3_vf2:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovb (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %ymm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,1,1,0,0]
; AVX512DQ-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512DQ-NEXT: movb $63, %al
; AVX512DQ-NEXT: kmovw %eax, %k1
; AVX512DQ-NEXT: vpcmpgtd %ymm0, %ymm1, %k1 {%k1}
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z}
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT: vmovq %xmm1, 16(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor3_vf2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovw (%rdi), %k1
; AVX512BW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512BW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,1,1,1,0,0]
; AVX512BW-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX512BW-NEXT: vpslld $31, %ymm0, %ymm0
; AVX512BW-NEXT: movb $63, %al
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vptestmd %ymm0, %ymm0, %k1 {%k1}
; AVX512BW-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z}
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vmovq %xmm1, 16(%rdx)
; AVX512BW-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <2 x i32> <i32 0, i32 1>
  %tgt.mask = shufflevector <2 x i1> %src.mask, <2 x i1> poison, <6 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1>
  %data = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr %in.vec, i32 64, <6 x i1> %tgt.mask, <6 x i32> poison)
  %data.padded = shufflevector <6 x i32> %data, <6 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  store <6 x i32> %data, ptr %out.vec, align 64
  ret void
}

define void @mask_replication_factor3_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor3_vf4:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,0,0,0,0]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-ONLY-NEXT: movw $4095, %ax # imm = 0xFFF
; AVX512F-ONLY-NEXT: kmovw %eax, %k1
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vextracti32x4 $2, %zmm0, 32(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor3_vf4:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,0,0,0,0]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512DQ-NEXT: movw $4095, %ax # imm = 0xFFF
; AVX512DQ-NEXT: kmovw %eax, %k1
; AVX512DQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 {%k1}
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT: vextracti32x4 $2, %zmm0, 32(%rdx)
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor3_vf4:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovw (%rdi), %k1
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,0,0,0,0]
; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512BW-NEXT: movw $4095, %ax # imm = 0xFFF
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, 32(%rdx)
; AVX512BW-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %tgt.mask = shufflevector <4 x i1> %src.mask, <4 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
  %data = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr %in.vec, i32 64, <12 x i1> %tgt.mask, <12 x i32> poison)
  %data.padded = shufflevector <12 x i32> %data, <12 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef, i32 undef, i32 undef>
  store <12 x i32> %data, ptr %out.vec, align 64
  ret void
}

define void @mask_replication_factor3_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor3_vf8:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2
; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm1 {%k2} {z} = -1
; AVX512F-ONLY-NEXT: movw $1, %ax
; AVX512F-ONLY-NEXT: kmovw %eax, %k2
; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2}
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2
; AVX512F-ONLY-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512F-ONLY-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,5,6,6,6,7,7,7]
; AVX512F-ONLY-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX512F-ONLY-NEXT: vptestmd %ymm0, %ymm0, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa %ymm1, 64(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor3_vf8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovb (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT: vpmovm2d %k1, %zmm1
; AVX512DQ-NEXT: movw $1, %ax
; AVX512DQ-NEXT: kmovw %eax, %k1
; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT: vpmovm2d %k0, %ymm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,5,6,6,6,7,7,7]
; AVX512DQ-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX512DQ-NEXT: vpmovd2m %ymm0, %k2
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k2} {z}
; AVX512DQ-NEXT: vmovdqa %ymm1, 64(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor3_vf8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovw (%rdi), %k0
; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7,0,0,0,0,0,0,0,0]
; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: movl $16777215, %eax # imm = 0xFFFFFF
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vpcmpgtw %zmm0, %zmm1, %k1 {%k1}
; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512BW-NEXT: vmovdqa %ymm0, 64(%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %tgt.mask = shufflevector <8 x i1> %src.mask, <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
  %data = call <24 x i32> @llvm.masked.load.v24i32.p0(ptr %in.vec, i32 64, <24 x i1> %tgt.mask, <24 x i32> poison)
  %data.padded = shufflevector <24 x i32> %data, <24 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  store <24 x i32> %data, ptr %out.vec, align 64
  ret void
}

define void @mask_replication_factor3_vf16(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor3_vf16:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1
; AVX512F-ONLY-NEXT: movw $1, %ax
; AVX512F-ONLY-NEXT: kmovw %eax, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k3
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k3} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k2} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 64(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 128(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor3_vf16:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
; AVX512DQ-NEXT: movw $1, %ax
; AVX512DQ-NEXT: kmovw %eax, %k1
; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k3
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k3} {z}
; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k2} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm2, 64(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm1, 128(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor3_vf16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovw (%rdi), %k1
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5]
; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15]
; AVX512BW-NEXT: vpermd %zmm0, %zmm2, %zmm2
; AVX512BW-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10]
; AVX512BW-NEXT: vpermd %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15>
  %data = call <48 x i32> @llvm.masked.load.v48i32.p0(ptr %in.vec, i32 64, <48 x i1> %tgt.mask, <48 x i32> poison)
  store <48 x i32> %data, ptr %out.vec, align 64
  ret void
}

define void @mask_replication_factor3_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor3_vf32:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k2
; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2
; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k2
; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm2 {%k2} {z} = -1
; AVX512F-ONLY-NEXT: movw $1, %ax
; AVX512F-ONLY-NEXT: kmovw %eax, %k2
; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm2 {%k2}
; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k3
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm3
; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k2
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm3 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k4
; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k5
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k6
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k3} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k6} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k5} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm4 {%k4} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm5 {%k2} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 64(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 128(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 256(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 320(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor3_vf32:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k1
; AVX512DQ-NEXT: kmovw 2(%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2
; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1
; AVX512DQ-NEXT: vpmovm2d %k1, %zmm2
; AVX512DQ-NEXT: movw $1, %ax
; AVX512DQ-NEXT: kmovw %eax, %k1
; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1}
; AVX512DQ-NEXT: vpmovd2m %zmm2, %k2
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm3
; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k3
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k4
; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k5
; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k6
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z}
; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k6} {z}
; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k5} {z}
; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k4} {z}
; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm4 {%k3} {z}
; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm5 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm4, 128(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm2, 256(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm1, 320(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor3_vf32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovd (%rdi), %k0
; AVX512BW-NEXT: kshiftrd $1, %k0, %k1
; AVX512BW-NEXT: movw $-3, %ax
; AVX512BW-NEXT: kmovd %eax, %k4
; AVX512BW-NEXT: kmovw (%rdi), %k2
; AVX512BW-NEXT: kandw %k4, %k2, %k3
; AVX512BW-NEXT: kmovq %k4, %k7
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
; AVX512BW-NEXT: kshiftrw $14, %k2, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
; AVX512BW-NEXT: movw $-5, %ax
; AVX512BW-NEXT: kmovd %eax, %k4
; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k4, %k3, %k3
; AVX512BW-NEXT: kshiftrw $13, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k3, %k2
; AVX512BW-NEXT: movw $-9, %ax
; AVX512BW-NEXT: kmovd %eax, %k3
; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k1, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: movw $-17, %ax
; AVX512BW-NEXT: kmovd %eax, %k5
; AVX512BW-NEXT: kandw %k5, %k2, %k2
; AVX512BW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftrw $11, %k1, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: movw $-33, %ax
; AVX512BW-NEXT: kmovd %eax, %k3
; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftrw $10, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k2, %k1
; AVX512BW-NEXT: movw $-65, %ax
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrd $2, %k0, %k2
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
; AVX512BW-NEXT: kshiftrw $9, %k2, %k3
; AVX512BW-NEXT: korw %k3, %k1, %k1
; AVX512BW-NEXT: movw $-129, %ax
; AVX512BW-NEXT: kmovd %eax, %k3
; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k2, %k3
; AVX512BW-NEXT: korw %k3, %k1, %k1
; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF
; AVX512BW-NEXT: kmovd %eax, %k3
; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrd $3, %k0, %k2
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
; AVX512BW-NEXT: kshiftrw $6, %k2, %k3
; AVX512BW-NEXT: korw %k3, %k1, %k1
; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF
; AVX512BW-NEXT: kmovd %eax, %k3
; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k2, %k3
; AVX512BW-NEXT: korw %k3, %k1, %k1
; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF
; AVX512BW-NEXT: kmovd %eax, %k3
; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrd $4, %k0, %k4
; AVX512BW-NEXT: kshiftlw $15, %k4, %k2
; AVX512BW-NEXT: kshiftrw $3, %k2, %k3
; AVX512BW-NEXT: korw %k3, %k1, %k1
; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF
; AVX512BW-NEXT: kmovd %eax, %k6
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftrw $2, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k4, %k4
; AVX512BW-NEXT: korw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrd $5, %k0, %k2
; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT: kshiftrd $27, %k0, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k4
; AVX512BW-NEXT: kshiftrd $26, %k0, %k1
; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kmovq %k7, %k2
; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftrw $14, %k4, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k4, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k4, %k4
; AVX512BW-NEXT: korw %k4, %k1, %k1
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrd $28, %k0, %k4
; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
; AVX512BW-NEXT: kshiftrw $11, %k4, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k4, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k4, %k4
; AVX512BW-NEXT: korw %k4, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrd $29, %k0, %k4
; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
; AVX512BW-NEXT: kshiftrw $8, %k4, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k4, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k4, %k4
; AVX512BW-NEXT: korw %k4, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrd $30, %k0, %k4
; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
; AVX512BW-NEXT: kshiftrw $5, %k4, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k4, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k4, %k4
; AVX512BW-NEXT: korw %k4, %k1, %k1
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrd $31, %k0, %k4
; AVX512BW-NEXT: kshiftlw $15, %k4, %k7
; AVX512BW-NEXT: kshiftrw $2, %k7, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k4, %k4
; AVX512BW-NEXT: korw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k1} {z}
; AVX512BW-NEXT: kshiftrd $21, %k0, %k1
; AVX512BW-NEXT: kandw %k2, %k1, %k6
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftrw $14, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k6, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrd $22, %k0, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $13, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrd $23, %k0, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $10, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrd $24, %k0, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $7, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrd $25, %k0, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
; AVX512BW-NEXT: kshiftlw $14, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k1} {z}
; AVX512BW-NEXT: kshiftrd $16, %k0, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k2
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $14, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k2, %k2
; AVX512BW-NEXT: kshiftrw $13, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k2, %k1
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrd $17, %k0, %k2
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
; AVX512BW-NEXT: kshiftrw $12, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrd $18, %k0, %k2
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
; AVX512BW-NEXT: kshiftrw $9, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrd $19, %k0, %k2
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
; AVX512BW-NEXT: kshiftrw $6, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrd $20, %k0, %k2
; AVX512BW-NEXT: kshiftlw $15, %k2, %k6
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z}
; AVX512BW-NEXT: kshiftrd $11, %k0, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k2
; AVX512BW-NEXT: kshiftrd $10, %k0, %k4
; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k4, %k4
; AVX512BW-NEXT: kshiftrw $14, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k4, %k4
; AVX512BW-NEXT: kshiftrw $13, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k4, %k4
; AVX512BW-NEXT: kshiftrw $12, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k4, %k2
; AVX512BW-NEXT: kandw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftrd $12, %k0, %k4
; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
; AVX512BW-NEXT: kshiftrw $11, %k4, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $10, %k4, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $9, %k4, %k4
; AVX512BW-NEXT: korw %k4, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrd $13, %k0, %k4
; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
; AVX512BW-NEXT: kshiftrw $8, %k4, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kandw %k5, %k2, %k2
; AVX512BW-NEXT: kshiftrw $7, %k4, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftrw $6, %k4, %k4
; AVX512BW-NEXT: korw %k4, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrd $14, %k0, %k4
; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
; AVX512BW-NEXT: kshiftrw $5, %k4, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $4, %k4, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k2, %k2
; AVX512BW-NEXT: kshiftrw $3, %k4, %k4
; AVX512BW-NEXT: korw %k4, %k2, %k2
; AVX512BW-NEXT: kandw %k7, %k2, %k2
; AVX512BW-NEXT: kshiftrd $15, %k0, %k4
; AVX512BW-NEXT: kshiftlw $15, %k4, %k6
; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k2, %k2
; AVX512BW-NEXT: kshiftlw $14, %k4, %k4
; AVX512BW-NEXT: korw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm4 {%k2} {z}
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kshiftrw $14, %k4, %k4
; AVX512BW-NEXT: korw %k4, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftrd $6, %k0, %k4
; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
; AVX512BW-NEXT: kshiftrw $13, %k4, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k2, %k2
; AVX512BW-NEXT: kshiftrw $12, %k4, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k2, %k2
; AVX512BW-NEXT: kshiftrw $11, %k4, %k4
; AVX512BW-NEXT: korw %k4, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftrd $7, %k0, %k4
; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
; AVX512BW-NEXT: kshiftrw $10, %k4, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k2, %k2
; AVX512BW-NEXT: kshiftrw $9, %k4, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k2, %k2
; AVX512BW-NEXT: kshiftrw $8, %k4, %k4
; AVX512BW-NEXT: korw %k4, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftrd $8, %k0, %k4
; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
; AVX512BW-NEXT: kshiftrw $7, %k4, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kandw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftrw $6, %k4, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftrw $5, %k4, %k4
; AVX512BW-NEXT: korw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftrd $9, %k0, %k0
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k0, %k4
; AVX512BW-NEXT: korw %k4, %k2, %k2
; AVX512BW-NEXT: kandw %k5, %k2, %k2
; AVX512BW-NEXT: kshiftrw $3, %k0, %k4
; AVX512BW-NEXT: korw %k4, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
;
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $2, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k2, %k0
; AVX512BW-NEXT: kandw %k7, %k0, %k0
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
; AVX512BW-NEXT: kshiftlw $14, %k1, %k2
; AVX512BW-NEXT: korw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k1
; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm5 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm2, 256(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm1, 320(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31>
  %data = call <96 x i32> @llvm.masked.load.v96i32.p0(ptr %in.vec, i32 64, <96 x i1> %tgt.mask, <96 x i32> poison)
  store <96 x i32> %data, ptr %out.vec, align 64
  ret void
}

define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor3_vf64:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2
; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1
; AVX512F-ONLY-NEXT: movw $1, %ax
; AVX512F-ONLY-NEXT: kmovw %eax, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1}
; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1
; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm4 {%k1} {z} = -1
; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm5 {%k1} {z} = -1
; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm6
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm7 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm0
; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm1, %zmm8
; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm2, %zmm9
; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm7, %zmm3
; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm1, %zmm10
; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm2, %zmm11
; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm7, %zmm4
; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm2, %zmm2
; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm7, %zmm5
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm7 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm5 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm2 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm4 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm11 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm10 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm3 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm9 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm6 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 64(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 128(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 192(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 256(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 320(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 384(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 448(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 512(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 576(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 640(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 704(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, (%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor3_vf64:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2
; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2
; AVX512DQ-NEXT: movw $1, %ax
; AVX512DQ-NEXT: kmovw %eax, %k1
; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1}
; AVX512DQ-NEXT: kmovw 2(%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm3
; AVX512DQ-NEXT: kmovw 4(%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm4
; AVX512DQ-NEXT: kmovw 6(%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm5
; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm6
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm0
; AVX512DQ-NEXT: vpermd %zmm3, %zmm1, %zmm8
; AVX512DQ-NEXT: vpermd %zmm3, %zmm2, %zmm9
; AVX512DQ-NEXT: vpermd %zmm3, %zmm7, %zmm3
; AVX512DQ-NEXT: vpermd %zmm4, %zmm1, %zmm10
; AVX512DQ-NEXT: vpermd %zmm4, %zmm2, %zmm11
; AVX512DQ-NEXT: vpermd %zmm4, %zmm7, %zmm4
; AVX512DQ-NEXT: vpermd %zmm5, %zmm1, %zmm1
; AVX512DQ-NEXT: vpermd %zmm5, %zmm2, %zmm2
; AVX512DQ-NEXT: vpermd %zmm5, %zmm7, %zmm5
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm7 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1
; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm5 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1
; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm2 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm1 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1
; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm4 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1
; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm11 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1
; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm10 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1
; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm3 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1
; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm9 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1
; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1
; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm6 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm6, 64(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, 128(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm8, 192(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm9, 256(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm3, 320(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm10, 384(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm11, 448(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm4, 512(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm1, 576(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm2, 640(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm5, 704(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor3_vf64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovq (%rdi), %k0
; AVX512BW-NEXT: kshiftrq $1, %k0, %k1
; AVX512BW-NEXT: movw $-3, %ax
; AVX512BW-NEXT: kmovd %eax, %k4
; AVX512BW-NEXT: kmovw (%rdi), %k2
; AVX512BW-NEXT: kandw %k4, %k2, %k3
; AVX512BW-NEXT: kmovq %k4, %k7
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
; AVX512BW-NEXT: kshiftrw $14, %k2, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
; AVX512BW-NEXT: movw $-5, %ax
; AVX512BW-NEXT: kmovd %eax, %k4
; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k4, %k3, %k3
; AVX512BW-NEXT: kshiftrw $13, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k3, %k2
; AVX512BW-NEXT: movw $-9, %ax
; AVX512BW-NEXT: kmovd %eax, %k3
; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k1, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: movw $-17, %ax
; AVX512BW-NEXT: kmovd %eax, %k3
; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k3,
%k2, %k2 1474; AVX512BW-NEXT: kshiftrw $11, %k1, %k3 1475; AVX512BW-NEXT: korw %k3, %k2, %k2 1476; AVX512BW-NEXT: movw $-33, %ax 1477; AVX512BW-NEXT: kmovd %eax, %k3 1478; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 1479; AVX512BW-NEXT: kandw %k3, %k2, %k2 1480; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 1481; AVX512BW-NEXT: korw %k1, %k2, %k1 1482; AVX512BW-NEXT: movw $-65, %ax 1483; AVX512BW-NEXT: kmovd %eax, %k2 1484; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 1485; AVX512BW-NEXT: kandw %k2, %k1, %k1 1486; AVX512BW-NEXT: kshiftrq $2, %k0, %k2 1487; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 1488; AVX512BW-NEXT: kshiftrw $9, %k2, %k3 1489; AVX512BW-NEXT: korw %k3, %k1, %k1 1490; AVX512BW-NEXT: movw $-129, %ax 1491; AVX512BW-NEXT: kmovd %eax, %k3 1492; AVX512BW-NEXT: kandw %k3, %k1, %k1 1493; AVX512BW-NEXT: kmovq %k3, %k5 1494; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 1495; AVX512BW-NEXT: kshiftrw $8, %k2, %k3 1496; AVX512BW-NEXT: korw %k3, %k1, %k1 1497; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF 1498; AVX512BW-NEXT: kmovd %eax, %k3 1499; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 1500; AVX512BW-NEXT: kandw %k3, %k1, %k1 1501; AVX512BW-NEXT: kshiftrw $7, %k2, %k2 1502; AVX512BW-NEXT: korw %k2, %k1, %k1 1503; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF 1504; AVX512BW-NEXT: kmovd %eax, %k2 1505; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 1506; AVX512BW-NEXT: kandw %k2, %k1, %k1 1507; AVX512BW-NEXT: kshiftrq $3, %k0, %k2 1508; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 1509; AVX512BW-NEXT: kshiftrw $6, %k2, %k3 1510; AVX512BW-NEXT: korw %k3, %k1, %k1 1511; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF 1512; AVX512BW-NEXT: kmovd %eax, %k3 1513; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 1514; AVX512BW-NEXT: kandw %k3, %k1, %k1 1515; AVX512BW-NEXT: kshiftrw $5, %k2, %k3 1516; AVX512BW-NEXT: korw %k3, %k1, %k1 1517; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF 1518; AVX512BW-NEXT: kmovd %eax, %k3 1519; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 1520; AVX512BW-NEXT: kandw %k3, %k1, %k1 1521; AVX512BW-NEXT: kshiftrw $4, %k2, %k2 1522; AVX512BW-NEXT: korw %k2, %k1, %k1 1523; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF 1524; AVX512BW-NEXT: kmovd %eax, %k2 1525; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 1526; AVX512BW-NEXT: kandw %k2, %k1, %k1 1527; AVX512BW-NEXT: kshiftrq $4, %k0, %k2 1528; AVX512BW-NEXT: kshiftlw $15, %k2, %k3 1529; AVX512BW-NEXT: kshiftrw $3, %k3, %k4 1530; AVX512BW-NEXT: korw %k4, %k1, %k1 1531; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF 1532; AVX512BW-NEXT: kmovd %eax, %k6 1533; AVX512BW-NEXT: kandw %k6, %k1, %k1 1534; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 1535; AVX512BW-NEXT: kshiftrw $2, %k3, %k3 1536; AVX512BW-NEXT: korw %k3, %k1, %k1 1537; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF 1538; AVX512BW-NEXT: kmovd %eax, %k3 1539; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 1540; AVX512BW-NEXT: kandw %k3, %k1, %k1 1541; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 1542; AVX512BW-NEXT: korw %k2, %k1, %k1 1543; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 1544; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 1545; AVX512BW-NEXT: kshiftrq $5, %k0, %k2 1546; AVX512BW-NEXT: kmovq %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 1547; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 1548; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 1549; AVX512BW-NEXT: korw %k2, %k1, 
%k1 1550; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} 1551; AVX512BW-NEXT: kshiftrq $59, %k0, %k1 1552; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 1553; AVX512BW-NEXT: kshiftrq $58, %k0, %k1 1554; AVX512BW-NEXT: kmovq %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 1555; AVX512BW-NEXT: kmovq %k7, %k3 1556; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 1557; AVX512BW-NEXT: kandw %k7, %k1, %k1 1558; AVX512BW-NEXT: kshiftrw $14, %k2, %k7 1559; AVX512BW-NEXT: korw %k7, %k1, %k1 1560; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 1561; AVX512BW-NEXT: kandw %k4, %k1, %k1 1562; AVX512BW-NEXT: kshiftrw $13, %k2, %k7 1563; AVX512BW-NEXT: korw %k7, %k1, %k1 1564; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 1565; AVX512BW-NEXT: kandw %k4, %k1, %k1 1566; AVX512BW-NEXT: kshiftrw $12, %k2, %k2 1567; AVX512BW-NEXT: korw %k2, %k1, %k1 1568; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 1569; AVX512BW-NEXT: kandw %k4, %k1, %k1 1570; AVX512BW-NEXT: kshiftrq $60, %k0, %k2 1571; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 1572; AVX512BW-NEXT: kshiftrw $11, %k2, %k7 1573; AVX512BW-NEXT: korw %k7, %k1, %k1 1574; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 1575; AVX512BW-NEXT: kandw %k7, %k1, %k1 1576; AVX512BW-NEXT: kshiftrw $10, %k2, %k7 1577; AVX512BW-NEXT: korw %k7, %k1, %k1 1578; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 1579; AVX512BW-NEXT: kandw %k7, %k1, %k1 1580; AVX512BW-NEXT: kshiftrw $9, %k2, %k2 1581; AVX512BW-NEXT: korw %k2, %k1, %k1 1582; AVX512BW-NEXT: kandw %k5, %k1, %k1 1583; AVX512BW-NEXT: kshiftrq $61, %k0, %k2 1584; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 1585; AVX512BW-NEXT: kshiftrw $8, %k2, %k7 1586; AVX512BW-NEXT: korw %k7, %k1, %k1 1587; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 1588; AVX512BW-NEXT: kandw %k5, %k1, %k1 1589; AVX512BW-NEXT: kshiftrw $7, %k2, %k7 1590; AVX512BW-NEXT: korw %k7, %k1, %k1 1591; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 1592; AVX512BW-NEXT: kandw %k7, %k1, %k1 1593; AVX512BW-NEXT: kshiftrw $6, %k2, %k2 1594; AVX512BW-NEXT: korw %k2, %k1, %k1 1595; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 1596; AVX512BW-NEXT: kandw %k2, %k1, %k1 1597; AVX512BW-NEXT: kshiftrq $62, %k0, %k2 1598; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 1599; AVX512BW-NEXT: kshiftrw $5, %k2, %k7 1600; AVX512BW-NEXT: korw %k7, %k1, %k1 1601; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 1602; AVX512BW-NEXT: kandw %k7, %k1, %k1 1603; AVX512BW-NEXT: kshiftrw $4, %k2, %k7 1604; AVX512BW-NEXT: korw %k7, %k1, %k1 1605; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 1606; AVX512BW-NEXT: kandw %k7, %k1, %k1 1607; AVX512BW-NEXT: kshiftrw $3, %k2, %k2 1608; AVX512BW-NEXT: korw %k2, %k1, %k1 1609; AVX512BW-NEXT: kandw %k6, %k1, %k1 1610; AVX512BW-NEXT: kshiftrq $63, %k0, %k2 1611; AVX512BW-NEXT: kshiftlw $15, %k2, %k7 1612; AVX512BW-NEXT: kshiftrw $2, %k7, %k6 1613; AVX512BW-NEXT: korw %k6, %k1, %k1 1614; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 1615; AVX512BW-NEXT: kandw %k6, %k1, %k1 1616; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 1617; AVX512BW-NEXT: korw %k2, %k1, %k1 1618; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 1619; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 1620; AVX512BW-NEXT: korw %k7, %k1, %k1 1621; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm1 {%k1} {z} 1622; AVX512BW-NEXT: kshiftrq $53, %k0, %k1 1623; AVX512BW-NEXT: kandw %k3, %k1, %k6 1624; 
AVX512BW-NEXT: kshiftlw $15, %k1, %k1 1625; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 1626; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 1627; AVX512BW-NEXT: korw %k1, %k6, %k1 1628; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 1629; AVX512BW-NEXT: kandw %k2, %k1, %k1 1630; AVX512BW-NEXT: kshiftrq $54, %k0, %k6 1631; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 1632; AVX512BW-NEXT: kshiftrw $13, %k6, %k7 1633; AVX512BW-NEXT: korw %k7, %k1, %k1 1634; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 1635; AVX512BW-NEXT: kandw %k2, %k1, %k1 1636; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 1637; AVX512BW-NEXT: korw %k7, %k1, %k1 1638; AVX512BW-NEXT: kandw %k4, %k1, %k1 1639; AVX512BW-NEXT: kshiftrw $11, %k6, %k6 1640; AVX512BW-NEXT: korw %k6, %k1, %k1 1641; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 1642; AVX512BW-NEXT: kandw %k2, %k1, %k1 1643; AVX512BW-NEXT: kshiftrq $55, %k0, %k6 1644; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 1645; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 1646; AVX512BW-NEXT: korw %k7, %k1, %k1 1647; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 1648; AVX512BW-NEXT: kandw %k2, %k1, %k1 1649; AVX512BW-NEXT: kshiftrw $9, %k6, %k7 1650; AVX512BW-NEXT: korw %k7, %k1, %k1 1651; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 1652; AVX512BW-NEXT: kandw %k2, %k1, %k1 1653; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 1654; AVX512BW-NEXT: korw %k6, %k1, %k1 1655; AVX512BW-NEXT: kandw %k5, %k1, %k1 1656; AVX512BW-NEXT: kshiftrq $56, %k0, %k6 1657; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 1658; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 1659; AVX512BW-NEXT: korw %k7, %k1, %k1 1660; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 1661; AVX512BW-NEXT: kandw %k5, %k1, %k1 1662; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 1663; AVX512BW-NEXT: korw %k7, %k1, %k1 1664; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 1665; AVX512BW-NEXT: kandw %k3, %k1, %k1 1666; AVX512BW-NEXT: kshiftrw $5, %k6, %k6 1667; AVX512BW-NEXT: korw %k6, %k1, %k1 1668; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 1669; AVX512BW-NEXT: kandw %k3, %k1, %k1 1670; AVX512BW-NEXT: kshiftrq $57, %k0, %k6 1671; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 1672; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 1673; AVX512BW-NEXT: korw %k7, %k1, %k1 1674; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 1675; AVX512BW-NEXT: kandw %k3, %k1, %k1 1676; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 1677; AVX512BW-NEXT: korw %k7, %k1, %k1 1678; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 1679; AVX512BW-NEXT: kandw %k3, %k1, %k1 1680; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 1681; AVX512BW-NEXT: korw %k6, %k1, %k1 1682; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 1683; AVX512BW-NEXT: kandw %k4, %k1, %k1 1684; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 8-byte Reload 1685; AVX512BW-NEXT: kshiftlw $14, %k3, %k6 1686; AVX512BW-NEXT: korw %k6, %k1, %k1 1687; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 1688; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 1689; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 1690; AVX512BW-NEXT: korw %k3, %k1, %k1 1691; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm2 {%k1} {z} 1692; AVX512BW-NEXT: kshiftrq $48, %k0, %k1 1693; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 1694; AVX512BW-NEXT: kandw %k3, %k1, %k3 1695; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 1696; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 1697; AVX512BW-NEXT: korw %k6, %k3, %k3 
1698; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 1699; AVX512BW-NEXT: kandw %k6, %k3, %k3 1700; AVX512BW-NEXT: kshiftrw $13, %k1, %k1 1701; AVX512BW-NEXT: korw %k1, %k3, %k1 1702; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 1703; AVX512BW-NEXT: kandw %k3, %k1, %k1 1704; AVX512BW-NEXT: kshiftrq $49, %k0, %k3 1705; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 1706; AVX512BW-NEXT: kshiftrw $12, %k3, %k6 1707; AVX512BW-NEXT: korw %k6, %k1, %k1 1708; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 1709; AVX512BW-NEXT: kandw %k6, %k1, %k1 1710; AVX512BW-NEXT: kshiftrw $11, %k3, %k6 1711; AVX512BW-NEXT: korw %k6, %k1, %k1 1712; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 1713; AVX512BW-NEXT: kandw %k6, %k1, %k1 1714; AVX512BW-NEXT: kshiftrw $10, %k3, %k3 1715; AVX512BW-NEXT: korw %k3, %k1, %k1 1716; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 1717; AVX512BW-NEXT: kandw %k3, %k1, %k1 1718; AVX512BW-NEXT: kshiftrq $50, %k0, %k3 1719; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 1720; AVX512BW-NEXT: kshiftrw $9, %k3, %k6 1721; AVX512BW-NEXT: korw %k6, %k1, %k1 1722; AVX512BW-NEXT: kandw %k2, %k1, %k1 1723; AVX512BW-NEXT: kshiftrw $8, %k3, %k6 1724; AVX512BW-NEXT: korw %k6, %k1, %k1 1725; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 1726; AVX512BW-NEXT: kandw %k2, %k1, %k1 1727; AVX512BW-NEXT: kshiftrw $7, %k3, %k3 1728; AVX512BW-NEXT: korw %k3, %k1, %k1 1729; AVX512BW-NEXT: kandw %k5, %k1, %k1 1730; AVX512BW-NEXT: kshiftrq $51, %k0, %k3 1731; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 1732; AVX512BW-NEXT: kshiftrw $6, %k3, %k6 1733; AVX512BW-NEXT: korw %k6, %k1, %k1 1734; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 1735; AVX512BW-NEXT: kandw %k5, %k1, %k1 1736; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 1737; AVX512BW-NEXT: korw %k6, %k1, %k1 1738; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 1739; AVX512BW-NEXT: kandw %k2, %k1, %k1 1740; AVX512BW-NEXT: kshiftrw $4, %k3, %k3 1741; AVX512BW-NEXT: korw %k3, %k1, %k1 1742; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 1743; AVX512BW-NEXT: kandw %k2, %k1, %k1 1744; AVX512BW-NEXT: kshiftrq $52, %k0, %k3 1745; AVX512BW-NEXT: kshiftlw $15, %k3, %k6 1746; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 1747; AVX512BW-NEXT: korw %k7, %k1, %k1 1748; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 1749; AVX512BW-NEXT: kandw %k7, %k1, %k1 1750; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 1751; AVX512BW-NEXT: korw %k6, %k1, %k1 1752; AVX512BW-NEXT: kandw %k4, %k1, %k1 1753; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 1754; AVX512BW-NEXT: korw %k3, %k1, %k1 1755; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 1756; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 1757; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 1758; AVX512BW-NEXT: korw %k2, %k1, %k1 1759; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm3 {%k1} {z} 1760; AVX512BW-NEXT: kshiftrq $43, %k0, %k1 1761; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 1762; AVX512BW-NEXT: kshiftrq $42, %k0, %k1 1763; AVX512BW-NEXT: kmovq %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 1764; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 1765; AVX512BW-NEXT: kandw %k4, %k1, %k3 1766; AVX512BW-NEXT: kshiftrw $14, %k2, %k6 1767; AVX512BW-NEXT: korw %k6, %k3, %k3 1768; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 1769; AVX512BW-NEXT: kandw %k1, %k3, %k3 1770; AVX512BW-NEXT: kshiftrw $13, %k2, %k6 1771; AVX512BW-NEXT: korw %k6, %k3, 
%k3 1772; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 1773; AVX512BW-NEXT: kandw %k1, %k3, %k3 1774; AVX512BW-NEXT: kshiftrw $12, %k2, %k2 1775; AVX512BW-NEXT: korw %k2, %k3, %k2 1776; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 1777; AVX512BW-NEXT: kandw %k1, %k2, %k2 1778; AVX512BW-NEXT: kshiftrq $44, %k0, %k3 1779; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 1780; AVX512BW-NEXT: kshiftrw $11, %k3, %k6 1781; AVX512BW-NEXT: korw %k6, %k2, %k2 1782; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 1783; AVX512BW-NEXT: kandw %k1, %k2, %k2 1784; AVX512BW-NEXT: kshiftrw $10, %k3, %k6 1785; AVX512BW-NEXT: korw %k6, %k2, %k2 1786; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 1787; AVX512BW-NEXT: kandw %k6, %k2, %k2 1788; AVX512BW-NEXT: kshiftrw $9, %k3, %k3 1789; AVX512BW-NEXT: korw %k3, %k2, %k2 1790; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 1791; AVX512BW-NEXT: kandw %k3, %k2, %k2 1792; AVX512BW-NEXT: kshiftrq $45, %k0, %k3 1793; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 1794; AVX512BW-NEXT: kshiftrw $8, %k3, %k6 1795; AVX512BW-NEXT: korw %k6, %k2, %k2 1796; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 1797; AVX512BW-NEXT: kandw %k6, %k2, %k2 1798; AVX512BW-NEXT: kshiftrw $7, %k3, %k6 1799; AVX512BW-NEXT: korw %k6, %k2, %k2 1800; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 1801; AVX512BW-NEXT: kandw %k6, %k2, %k2 1802; AVX512BW-NEXT: kshiftrw $6, %k3, %k3 1803; AVX512BW-NEXT: korw %k3, %k2, %k2 1804; AVX512BW-NEXT: kandw %k5, %k2, %k2 1805; AVX512BW-NEXT: kshiftrq $46, %k0, %k3 1806; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 1807; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 1808; AVX512BW-NEXT: korw %k6, %k2, %k2 1809; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 1810; AVX512BW-NEXT: kandw %k5, %k2, %k2 1811; AVX512BW-NEXT: kshiftrw $4, %k3, %k6 1812; AVX512BW-NEXT: korw %k6, %k2, %k2 1813; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 1814; AVX512BW-NEXT: kandw %k6, %k2, %k2 1815; AVX512BW-NEXT: kshiftrw $3, %k3, %k3 1816; AVX512BW-NEXT: korw %k3, %k2, %k2 1817; AVX512BW-NEXT: kandw %k7, %k2, %k2 1818; AVX512BW-NEXT: kshiftrq $47, %k0, %k3 1819; AVX512BW-NEXT: kshiftlw $15, %k3, %k6 1820; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 1821; AVX512BW-NEXT: korw %k7, %k2, %k2 1822; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 1823; AVX512BW-NEXT: kandw %k7, %k2, %k2 1824; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 1825; AVX512BW-NEXT: korw %k3, %k2, %k2 1826; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 1827; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 1828; AVX512BW-NEXT: korw %k6, %k2, %k2 1829; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm4 {%k2} {z} 1830; AVX512BW-NEXT: kshiftrq $37, %k0, %k2 1831; AVX512BW-NEXT: kandw %k4, %k2, %k3 1832; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 1833; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 1834; AVX512BW-NEXT: kshiftrw $14, %k2, %k6 1835; AVX512BW-NEXT: korw %k6, %k3, %k3 1836; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 1837; AVX512BW-NEXT: kandw %k2, %k3, %k3 1838; AVX512BW-NEXT: kshiftrq $38, %k0, %k6 1839; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 1840; AVX512BW-NEXT: kshiftrw $13, %k6, %k7 1841; AVX512BW-NEXT: korw %k7, %k3, %k3 1842; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 1843; AVX512BW-NEXT: kandw %k4, %k3, %k3 1844; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 1845; AVX512BW-NEXT: korw %k7, %k3, %k3 1846; AVX512BW-NEXT: 
kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 1847; AVX512BW-NEXT: kandw %k2, %k3, %k3 1848; AVX512BW-NEXT: kshiftrw $11, %k6, %k6 1849; AVX512BW-NEXT: korw %k6, %k3, %k3 1850; AVX512BW-NEXT: kandw %k1, %k3, %k3 1851; AVX512BW-NEXT: kshiftrq $39, %k0, %k6 1852; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 1853; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 1854; AVX512BW-NEXT: korw %k7, %k3, %k3 1855; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 1856; AVX512BW-NEXT: kandw %k2, %k3, %k3 1857; AVX512BW-NEXT: kshiftrw $9, %k6, %k7 1858; AVX512BW-NEXT: korw %k7, %k3, %k3 1859; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 1860; AVX512BW-NEXT: kandw %k1, %k3, %k3 1861; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 1862; AVX512BW-NEXT: korw %k6, %k3, %k3 1863; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 1864; AVX512BW-NEXT: kandw %k1, %k3, %k3 1865; AVX512BW-NEXT: kshiftrq $40, %k0, %k6 1866; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 1867; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 1868; AVX512BW-NEXT: korw %k7, %k3, %k3 1869; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 1870; AVX512BW-NEXT: kandw %k1, %k3, %k3 1871; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 1872; AVX512BW-NEXT: korw %k7, %k3, %k3 1873; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 1874; AVX512BW-NEXT: kandw %k1, %k3, %k3 1875; AVX512BW-NEXT: kshiftrw $5, %k6, %k6 1876; AVX512BW-NEXT: korw %k6, %k3, %k3 1877; AVX512BW-NEXT: kandw %k5, %k3, %k3 1878; AVX512BW-NEXT: kshiftrq $41, %k0, %k6 1879; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 1880; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 1881; AVX512BW-NEXT: korw %k7, %k3, %k3 1882; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 1883; AVX512BW-NEXT: kandw %k5, %k3, %k3 1884; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 1885; AVX512BW-NEXT: korw %k7, %k3, %k3 1886; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 1887; AVX512BW-NEXT: kandw %k1, %k3, %k3 1888; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 1889; AVX512BW-NEXT: korw %k6, %k3, %k3 1890; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 1891; AVX512BW-NEXT: kandw %k1, %k3, %k3 1892; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 8-byte Reload 1893; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 1894; AVX512BW-NEXT: korw %k6, %k3, %k3 1895; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 1896; AVX512BW-NEXT: kshiftrw $1, %k3, %k3 1897; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 1898; AVX512BW-NEXT: korw %k1, %k3, %k1 1899; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm5 {%k1} {z} 1900; AVX512BW-NEXT: kshiftrq $32, %k0, %k1 1901; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 1902; AVX512BW-NEXT: kandw %k3, %k1, %k3 1903; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 1904; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 1905; AVX512BW-NEXT: korw %k6, %k3, %k3 1906; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 1907; AVX512BW-NEXT: kandw %k6, %k3, %k3 1908; AVX512BW-NEXT: kshiftrw $13, %k1, %k1 1909; AVX512BW-NEXT: korw %k1, %k3, %k1 1910; AVX512BW-NEXT: kandw %k4, %k1, %k1 1911; AVX512BW-NEXT: kshiftrq $33, %k0, %k3 1912; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 1913; AVX512BW-NEXT: kshiftrw $12, %k3, %k6 1914; AVX512BW-NEXT: korw %k6, %k1, %k1 1915; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 1916; AVX512BW-NEXT: kandw %k4, %k1, %k1 1917; AVX512BW-NEXT: kshiftrw $11, %k3, %k6 1918; AVX512BW-NEXT: korw %k6, %k1, %k1 1919; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 1920; 
AVX512BW-NEXT: kandw %k6, %k1, %k1 1921; AVX512BW-NEXT: kshiftrw $10, %k3, %k3 1922; AVX512BW-NEXT: korw %k3, %k1, %k1 1923; AVX512BW-NEXT: kandw %k2, %k1, %k1 1924; AVX512BW-NEXT: kshiftrq $34, %k0, %k3 1925; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 1926; AVX512BW-NEXT: kshiftrw $9, %k3, %k6 1927; AVX512BW-NEXT: korw %k6, %k1, %k1 1928; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 1929; AVX512BW-NEXT: kandw %k2, %k1, %k1 1930; AVX512BW-NEXT: kshiftrw $8, %k3, %k6 1931; AVX512BW-NEXT: korw %k6, %k1, %k1 1932; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 1933; AVX512BW-NEXT: kandw %k2, %k1, %k1 1934; AVX512BW-NEXT: kshiftrw $7, %k3, %k3 1935; AVX512BW-NEXT: korw %k3, %k1, %k1 1936; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 1937; AVX512BW-NEXT: kandw %k2, %k1, %k1 1938; AVX512BW-NEXT: kshiftrq $35, %k0, %k3 1939; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 1940; AVX512BW-NEXT: kshiftrw $6, %k3, %k6 1941; AVX512BW-NEXT: korw %k6, %k1, %k1 1942; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 1943; AVX512BW-NEXT: kandw %k2, %k1, %k1 1944; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 1945; AVX512BW-NEXT: korw %k6, %k1, %k1 1946; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 1947; AVX512BW-NEXT: kandw %k2, %k1, %k1 1948; AVX512BW-NEXT: kshiftrw $4, %k3, %k3 1949; AVX512BW-NEXT: korw %k3, %k1, %k1 1950; AVX512BW-NEXT: kandw %k5, %k1, %k1 1951; AVX512BW-NEXT: kshiftrq $36, %k0, %k3 1952; AVX512BW-NEXT: kshiftlw $15, %k3, %k6 1953; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 1954; AVX512BW-NEXT: korw %k7, %k1, %k1 1955; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 1956; AVX512BW-NEXT: kandw %k7, %k1, %k1 1957; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 1958; AVX512BW-NEXT: korw %k6, %k1, %k1 1959; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 1960; AVX512BW-NEXT: kandw %k5, %k1, %k1 1961; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 1962; AVX512BW-NEXT: korw %k3, %k1, %k1 1963; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 1964; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 1965; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 1966; AVX512BW-NEXT: korw %k2, %k1, %k1 1967; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} 1968; AVX512BW-NEXT: kshiftrq $27, %k0, %k1 1969; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 1970; AVX512BW-NEXT: kshiftrq $26, %k0, %k3 1971; AVX512BW-NEXT: kmovq %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 1972; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 1973; AVX512BW-NEXT: kandw %k1, %k3, %k3 1974; AVX512BW-NEXT: kshiftrw $14, %k2, %k6 1975; AVX512BW-NEXT: korw %k6, %k3, %k3 1976; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 1977; AVX512BW-NEXT: kandw %k1, %k3, %k3 1978; AVX512BW-NEXT: kshiftrw $13, %k2, %k6 1979; AVX512BW-NEXT: korw %k6, %k3, %k3 1980; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 1981; AVX512BW-NEXT: kandw %k1, %k3, %k3 1982; AVX512BW-NEXT: kshiftrw $12, %k2, %k2 1983; AVX512BW-NEXT: korw %k2, %k3, %k2 1984; AVX512BW-NEXT: kandw %k4, %k2, %k2 1985; AVX512BW-NEXT: kshiftrq $28, %k0, %k3 1986; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 1987; AVX512BW-NEXT: kshiftrw $11, %k3, %k6 1988; AVX512BW-NEXT: korw %k6, %k2, %k2 1989; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 1990; AVX512BW-NEXT: kandw %k4, %k2, %k2 1991; AVX512BW-NEXT: kshiftrw $10, %k3, %k6 1992; AVX512BW-NEXT: korw %k6, %k2, %k2 1993; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 
1994; AVX512BW-NEXT: kandw %k1, %k2, %k2 1995; AVX512BW-NEXT: kshiftrw $9, %k3, %k3 1996; AVX512BW-NEXT: korw %k3, %k2, %k2 1997; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 1998; AVX512BW-NEXT: kandw %k1, %k2, %k2 1999; AVX512BW-NEXT: kshiftrq $29, %k0, %k3 2000; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 2001; AVX512BW-NEXT: kshiftrw $8, %k3, %k6 2002; AVX512BW-NEXT: korw %k6, %k2, %k2 2003; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 2004; AVX512BW-NEXT: kandw %k1, %k2, %k2 2005; AVX512BW-NEXT: kshiftrw $7, %k3, %k6 2006; AVX512BW-NEXT: korw %k6, %k2, %k2 2007; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 2008; AVX512BW-NEXT: kandw %k1, %k2, %k2 2009; AVX512BW-NEXT: kshiftrw $6, %k3, %k3 2010; AVX512BW-NEXT: korw %k3, %k2, %k2 2011; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 2012; AVX512BW-NEXT: kandw %k1, %k2, %k2 2013; AVX512BW-NEXT: kshiftrq $30, %k0, %k3 2014; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 2015; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 2016; AVX512BW-NEXT: korw %k6, %k2, %k2 2017; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 2018; AVX512BW-NEXT: kandw %k6, %k2, %k2 2019; AVX512BW-NEXT: kshiftrw $4, %k3, %k6 2020; AVX512BW-NEXT: korw %k6, %k2, %k2 2021; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 2022; AVX512BW-NEXT: kandw %k6, %k2, %k2 2023; AVX512BW-NEXT: kshiftrw $3, %k3, %k3 2024; AVX512BW-NEXT: korw %k3, %k2, %k2 2025; AVX512BW-NEXT: kandw %k7, %k2, %k2 2026; AVX512BW-NEXT: kshiftrq $31, %k0, %k3 2027; AVX512BW-NEXT: kshiftlw $15, %k3, %k6 2028; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 2029; AVX512BW-NEXT: korw %k7, %k2, %k2 2030; AVX512BW-NEXT: kandw %k5, %k2, %k2 2031; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 2032; AVX512BW-NEXT: korw %k3, %k2, %k2 2033; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 2034; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 2035; AVX512BW-NEXT: korw %k6, %k2, %k2 2036; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm7 {%k2} {z} 2037; AVX512BW-NEXT: kshiftrq $21, %k0, %k2 2038; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 2039; AVX512BW-NEXT: kandw %k5, %k2, %k3 2040; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 2041; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 2042; AVX512BW-NEXT: kshiftrw $14, %k2, %k6 2043; AVX512BW-NEXT: korw %k6, %k3, %k3 2044; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 2045; AVX512BW-NEXT: kandw %k2, %k3, %k3 2046; AVX512BW-NEXT: kshiftrq $22, %k0, %k6 2047; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 2048; AVX512BW-NEXT: kshiftrw $13, %k6, %k7 2049; AVX512BW-NEXT: korw %k7, %k3, %k3 2050; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 2051; AVX512BW-NEXT: kandw %k2, %k3, %k3 2052; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 2053; AVX512BW-NEXT: korw %k7, %k3, %k3 2054; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 2055; AVX512BW-NEXT: kandw %k2, %k3, %k3 2056; AVX512BW-NEXT: kshiftrw $11, %k6, %k6 2057; AVX512BW-NEXT: korw %k6, %k3, %k3 2058; AVX512BW-NEXT: kandw %k4, %k3, %k3 2059; AVX512BW-NEXT: kshiftrq $23, %k0, %k6 2060; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 2061; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 2062; AVX512BW-NEXT: korw %k7, %k3, %k3 2063; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 2064; AVX512BW-NEXT: kandw %k4, %k3, %k3 2065; AVX512BW-NEXT: kshiftrw $9, %k6, %k7 2066; AVX512BW-NEXT: korw %k7, %k3, %k3 2067; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 2068; AVX512BW-NEXT: kandw 
%k2, %k3, %k3 2069; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 2070; AVX512BW-NEXT: korw %k6, %k3, %k3 2071; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 2072; AVX512BW-NEXT: kandw %k2, %k3, %k3 2073; AVX512BW-NEXT: kshiftrq $24, %k0, %k6 2074; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 2075; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 2076; AVX512BW-NEXT: korw %k7, %k3, %k3 2077; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 2078; AVX512BW-NEXT: kandw %k7, %k3, %k3 2079; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 2080; AVX512BW-NEXT: korw %k7, %k3, %k3 2081; AVX512BW-NEXT: kandw %k1, %k3, %k3 2082; AVX512BW-NEXT: kshiftrw $5, %k6, %k6 2083; AVX512BW-NEXT: korw %k6, %k3, %k3 2084; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 2085; AVX512BW-NEXT: kandw %k1, %k3, %k3 2086; AVX512BW-NEXT: kshiftrq $25, %k0, %k6 2087; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 2088; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 2089; AVX512BW-NEXT: korw %k7, %k3, %k3 2090; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 2091; AVX512BW-NEXT: kandw %k1, %k3, %k3 2092; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 2093; AVX512BW-NEXT: korw %k7, %k3, %k3 2094; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 2095; AVX512BW-NEXT: kandw %k1, %k3, %k3 2096; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 2097; AVX512BW-NEXT: korw %k6, %k3, %k3 2098; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 2099; AVX512BW-NEXT: kandw %k1, %k3, %k3 2100; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 8-byte Reload 2101; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 2102; AVX512BW-NEXT: korw %k6, %k3, %k3 2103; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 2104; AVX512BW-NEXT: kshiftrw $1, %k3, %k3 2105; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 2106; AVX512BW-NEXT: korw %k1, %k3, %k1 2107; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm8 {%k1} {z} 2108; AVX512BW-NEXT: kshiftrq $16, %k0, %k1 2109; AVX512BW-NEXT: kandw %k5, %k1, %k3 2110; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 2111; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 2112; AVX512BW-NEXT: korw %k6, %k3, %k3 2113; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 2114; AVX512BW-NEXT: kandw %k5, %k3, %k3 2115; AVX512BW-NEXT: kshiftrw $13, %k1, %k1 2116; AVX512BW-NEXT: korw %k1, %k3, %k1 2117; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 2118; AVX512BW-NEXT: kandw %k3, %k1, %k1 2119; AVX512BW-NEXT: kshiftrq $17, %k0, %k3 2120; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 2121; AVX512BW-NEXT: kshiftrw $12, %k3, %k6 2122; AVX512BW-NEXT: korw %k6, %k1, %k1 2123; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 2124; AVX512BW-NEXT: kandw %k5, %k1, %k1 2125; AVX512BW-NEXT: kshiftrw $11, %k3, %k6 2126; AVX512BW-NEXT: korw %k6, %k1, %k1 2127; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 2128; AVX512BW-NEXT: kandw %k5, %k1, %k1 2129; AVX512BW-NEXT: kshiftrw $10, %k3, %k3 2130; AVX512BW-NEXT: korw %k3, %k1, %k1 2131; AVX512BW-NEXT: kandw %k4, %k1, %k1 2132; AVX512BW-NEXT: kshiftrq $18, %k0, %k3 2133; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 2134; AVX512BW-NEXT: kshiftrw $9, %k3, %k6 2135; AVX512BW-NEXT: korw %k6, %k1, %k1 2136; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 2137; AVX512BW-NEXT: kandw %k4, %k1, %k1 2138; AVX512BW-NEXT: kshiftrw $8, %k3, %k6 2139; AVX512BW-NEXT: korw %k6, %k1, %k1 2140; AVX512BW-NEXT: kandw %k2, %k1, %k1 2141; AVX512BW-NEXT: kshiftrw $7, %k3, %k3 2142; AVX512BW-NEXT: korw %k3, %k1, %k1 2143; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 2144; AVX512BW-NEXT: kandw %k5, %k1, %k1 2145; AVX512BW-NEXT: kshiftrq $19, %k0, %k3 2146; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 2147; AVX512BW-NEXT: kshiftrw $6, %k3, %k6 2148; AVX512BW-NEXT: korw %k6, %k1, %k1 2149; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 2150; AVX512BW-NEXT: kandw %k2, %k1, %k1 2151; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 2152; AVX512BW-NEXT: korw %k6, %k1, %k1 2153; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 2154; AVX512BW-NEXT: kandw %k2, %k1, %k1 2155; AVX512BW-NEXT: kshiftrw $4, %k3, %k3 2156; AVX512BW-NEXT: korw %k3, %k1, %k1 2157; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 2158; AVX512BW-NEXT: kandw %k2, %k1, %k1 2159; AVX512BW-NEXT: kshiftrq $20, %k0, %k3 2160; AVX512BW-NEXT: kshiftlw $15, %k3, %k6 2161; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 2162; AVX512BW-NEXT: korw %k7, %k1, %k1 2163; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 2164; AVX512BW-NEXT: kandw %k7, %k1, %k1 2165; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 2166; AVX512BW-NEXT: korw %k6, %k1, %k1 2167; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 2168; AVX512BW-NEXT: kandw %k2, %k1, %k1 2169; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 2170; AVX512BW-NEXT: korw %k3, %k1, %k1 2171; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 2172; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 2173; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 2174; AVX512BW-NEXT: korw %k2, %k1, %k1 2175; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm9 {%k1} {z} 2176; AVX512BW-NEXT: kshiftrq $11, %k0, %k1 2177; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 2178; AVX512BW-NEXT: kshiftrq $10, %k0, %k3 2179; AVX512BW-NEXT: kmovq %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 2180; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 2181; AVX512BW-NEXT: kandw %k1, %k3, %k3 2182; AVX512BW-NEXT: kshiftrw $14, %k2, %k6 2183; AVX512BW-NEXT: korw %k6, %k3, %k3 2184; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 2185; AVX512BW-NEXT: kandw %k1, %k3, %k3 2186; AVX512BW-NEXT: kshiftrw $13, %k2, %k6 2187; AVX512BW-NEXT: korw %k6, %k3, %k3 2188; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 2189; AVX512BW-NEXT: kandw %k1, %k3, %k3 2190; AVX512BW-NEXT: kshiftrw $12, %k2, %k2 2191; AVX512BW-NEXT: korw %k2, %k3, %k2 2192; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 2193; AVX512BW-NEXT: kandw %k1, %k2, %k2 2194; AVX512BW-NEXT: kshiftrq $12, %k0, %k3 2195; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 2196; AVX512BW-NEXT: kshiftrw $11, %k3, %k6 2197; AVX512BW-NEXT: korw %k6, %k2, %k2 2198; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 2199; AVX512BW-NEXT: kandw %k1, %k2, %k2 2200; AVX512BW-NEXT: kshiftrw $10, %k3, %k6 2201; AVX512BW-NEXT: korw %k6, %k2, %k2 2202; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 2203; AVX512BW-NEXT: kandw %k1, %k2, %k2 2204; AVX512BW-NEXT: kshiftrw $9, %k3, %k3 2205; AVX512BW-NEXT: korw %k3, %k2, %k2 2206; AVX512BW-NEXT: kandw %k4, %k2, %k2 2207; AVX512BW-NEXT: kshiftrq $13, %k0, %k3 2208; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 2209; AVX512BW-NEXT: kshiftrw $8, %k3, %k6 2210; AVX512BW-NEXT: korw %k6, %k2, %k2 2211; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 2212; AVX512BW-NEXT: kandw %k1, %k2, %k2 2213; AVX512BW-NEXT: kshiftrw $7, %k3, %k6 2214; AVX512BW-NEXT: korw %k6, %k2, %k2 2215; AVX512BW-NEXT: kandw %k5, %k2, %k2 2216; AVX512BW-NEXT: kshiftrw $6, 
%k3, %k3 2217; AVX512BW-NEXT: korw %k3, %k2, %k2 2218; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 2219; AVX512BW-NEXT: kandw %k5, %k2, %k2 2220; AVX512BW-NEXT: kshiftrq $14, %k0, %k3 2221; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 2222; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 2223; AVX512BW-NEXT: korw %k6, %k2, %k2 2224; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 2225; AVX512BW-NEXT: kandw %k1, %k2, %k2 2226; AVX512BW-NEXT: kshiftrw $4, %k3, %k6 2227; AVX512BW-NEXT: korw %k6, %k2, %k2 2228; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 2229; AVX512BW-NEXT: kandw %k4, %k2, %k2 2230; AVX512BW-NEXT: kshiftrw $3, %k3, %k3 2231; AVX512BW-NEXT: korw %k3, %k2, %k2 2232; AVX512BW-NEXT: kandw %k7, %k2, %k2 2233; AVX512BW-NEXT: kshiftrq $15, %k0, %k3 2234; AVX512BW-NEXT: kshiftlw $15, %k3, %k6 2235; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 2236; AVX512BW-NEXT: korw %k7, %k2, %k2 2237; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 2238; AVX512BW-NEXT: kandw %k7, %k2, %k2 2239; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 2240; AVX512BW-NEXT: korw %k3, %k2, %k2 2241; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 2242; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 2243; AVX512BW-NEXT: korw %k6, %k2, %k2 2244; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm10 {%k2} {z} 2245; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 8-byte Reload 2246; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 2247; AVX512BW-NEXT: kandw %k3, %k2, %k2 2248; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 2249; AVX512BW-NEXT: kshiftrw $14, %k3, %k3 2250; AVX512BW-NEXT: korw %k3, %k2, %k2 2251; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 2252; AVX512BW-NEXT: kandw %k3, %k2, %k2 2253; AVX512BW-NEXT: kshiftrq $6, %k0, %k3 2254; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 2255; AVX512BW-NEXT: kshiftrw $13, %k3, %k6 2256; AVX512BW-NEXT: korw %k6, %k2, %k2 2257; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 2258; AVX512BW-NEXT: kandw %k6, %k2, %k2 2259; AVX512BW-NEXT: kshiftrw $12, %k3, %k6 2260; AVX512BW-NEXT: korw %k6, %k2, %k2 2261; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 2262; AVX512BW-NEXT: kandw %k6, %k2, %k2 2263; AVX512BW-NEXT: kshiftrw $11, %k3, %k3 2264; AVX512BW-NEXT: korw %k3, %k2, %k2 2265; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 2266; AVX512BW-NEXT: kandw %k3, %k2, %k2 2267; AVX512BW-NEXT: kshiftrq $7, %k0, %k3 2268; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 2269; AVX512BW-NEXT: kshiftrw $10, %k3, %k6 2270; AVX512BW-NEXT: korw %k6, %k2, %k2 2271; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 2272; AVX512BW-NEXT: kandw %k6, %k2, %k2 2273; AVX512BW-NEXT: kshiftrw $9, %k3, %k6 2274; AVX512BW-NEXT: korw %k6, %k2, %k2 2275; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 2276; AVX512BW-NEXT: kandw %k6, %k2, %k2 2277; AVX512BW-NEXT: kshiftrw $8, %k3, %k3 2278; AVX512BW-NEXT: korw %k3, %k2, %k2 2279; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 2280; AVX512BW-NEXT: kandw %k3, %k2, %k2 2281; AVX512BW-NEXT: kshiftrq $8, %k0, %k3 2282; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 2283; AVX512BW-NEXT: kshiftrw $7, %k3, %k6 2284; AVX512BW-NEXT: korw %k6, %k2, %k2 2285; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 2286; AVX512BW-NEXT: kandw %k6, %k2, %k2 2287; AVX512BW-NEXT: kshiftrw $6, %k3, %k6 2288; AVX512BW-NEXT: korw %k6, %k2, %k2 2289; AVX512BW-NEXT: kandw %k5, %k2, %k2 2290; 
; AVX512BW-NEXT: kshiftrw $5, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftrq $9, %k0, %k0
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k0, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: kandw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftrw $3, %k0, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $2, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k2, %k0
; AVX512BW-NEXT: kandw %k7, %k0, %k0
; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 8-byte Reload
; AVX512BW-NEXT: kshiftlw $14, %k1, %k2
; AVX512BW-NEXT: korw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k1
; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm11 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm9, 192(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm8, 256(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm7, 320(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm5, 448(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm4, 512(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm3, 576(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm2, 640(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm1, 704(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %src.mask = load <64 x i1>, ptr %in.maskvec, align 64
  %tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63>
  %data = call <192 x i32> @llvm.masked.load.v192i32.p0(ptr %in.vec, i32 64, <192 x i1> %tgt.mask, <192 x i32> poison)
  store <192 x i32> %data, ptr %out.vec, align 64
  ret void
}
@mask_replication_factor4_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { 2335; AVX512F-SLOW-LABEL: mask_replication_factor4_vf2: 2336; AVX512F-SLOW: # %bb.0: 2337; AVX512F-SLOW-NEXT: kmovw (%rdi), %k1 2338; AVX512F-SLOW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 2339; AVX512F-SLOW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} 2340; AVX512F-SLOW-NEXT: vpmovsxdq %xmm0, %xmm0 2341; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] 2342; AVX512F-SLOW-NEXT: vptestmd %ymm0, %ymm0, %k1 2343; AVX512F-SLOW-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z} 2344; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rdx) 2345; AVX512F-SLOW-NEXT: vzeroupper 2346; AVX512F-SLOW-NEXT: retq 2347; 2348; AVX512F-FAST-LABEL: mask_replication_factor4_vf2: 2349; AVX512F-FAST: # %bb.0: 2350; AVX512F-FAST-NEXT: kmovw (%rdi), %k1 2351; AVX512F-FAST-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 2352; AVX512F-FAST-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} 2353; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] 2354; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 2355; AVX512F-FAST-NEXT: vptestmd %ymm0, %ymm0, %k1 2356; AVX512F-FAST-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z} 2357; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rdx) 2358; AVX512F-FAST-NEXT: vzeroupper 2359; AVX512F-FAST-NEXT: retq 2360; 2361; AVX512DQ-SLOW-LABEL: mask_replication_factor4_vf2: 2362; AVX512DQ-SLOW: # %bb.0: 2363; AVX512DQ-SLOW-NEXT: kmovb (%rdi), %k0 2364; AVX512DQ-SLOW-NEXT: vpmovm2d %k0, %ymm0 2365; AVX512DQ-SLOW-NEXT: vpmovsxdq %xmm0, %xmm0 2366; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] 2367; AVX512DQ-SLOW-NEXT: vpmovd2m %ymm0, %k1 2368; AVX512DQ-SLOW-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z} 2369; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, (%rdx) 2370; AVX512DQ-SLOW-NEXT: vzeroupper 2371; AVX512DQ-SLOW-NEXT: retq 2372; 2373; AVX512DQ-FAST-LABEL: mask_replication_factor4_vf2: 2374; AVX512DQ-FAST: # %bb.0: 2375; AVX512DQ-FAST-NEXT: kmovb (%rdi), %k0 2376; AVX512DQ-FAST-NEXT: vpmovm2d %k0, %ymm0 2377; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] 2378; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 2379; AVX512DQ-FAST-NEXT: vpmovd2m %ymm0, %k1 2380; AVX512DQ-FAST-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z} 2381; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rdx) 2382; AVX512DQ-FAST-NEXT: vzeroupper 2383; AVX512DQ-FAST-NEXT: retq 2384; 2385; AVX512BW-SLOW-LABEL: mask_replication_factor4_vf2: 2386; AVX512BW-SLOW: # %bb.0: 2387; AVX512BW-SLOW-NEXT: kmovw (%rdi), %k1 2388; AVX512BW-SLOW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 2389; AVX512BW-SLOW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} 2390; AVX512BW-SLOW-NEXT: vpmovsxdq %xmm0, %xmm0 2391; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] 2392; AVX512BW-SLOW-NEXT: vptestmd %ymm0, %ymm0, %k1 2393; AVX512BW-SLOW-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z} 2394; AVX512BW-SLOW-NEXT: vmovdqa %ymm0, (%rdx) 2395; AVX512BW-SLOW-NEXT: vzeroupper 2396; AVX512BW-SLOW-NEXT: retq 2397; 2398; AVX512BW-FAST-LABEL: mask_replication_factor4_vf2: 2399; AVX512BW-FAST: # %bb.0: 2400; AVX512BW-FAST-NEXT: kmovw (%rdi), %k1 2401; AVX512BW-FAST-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 2402; AVX512BW-FAST-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} 2403; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] 2404; AVX512BW-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 2405; AVX512BW-FAST-NEXT: vptestmd %ymm0, %ymm0, %k1 2406; AVX512BW-FAST-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z} 2407; AVX512BW-FAST-NEXT: vmovdqa %ymm0, (%rdx) 2408; AVX512BW-FAST-NEXT: vzeroupper 2409; AVX512BW-FAST-NEXT: retq 2410; 2411; AVX512VBMI-SLOW-LABEL: mask_replication_factor4_vf2: 
2412; AVX512VBMI-SLOW: # %bb.0: 2413; AVX512VBMI-SLOW-NEXT: kmovw (%rdi), %k1 2414; AVX512VBMI-SLOW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 2415; AVX512VBMI-SLOW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} 2416; AVX512VBMI-SLOW-NEXT: vpmovsxdq %xmm0, %xmm0 2417; AVX512VBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] 2418; AVX512VBMI-SLOW-NEXT: vptestmd %ymm0, %ymm0, %k1 2419; AVX512VBMI-SLOW-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z} 2420; AVX512VBMI-SLOW-NEXT: vmovdqa %ymm0, (%rdx) 2421; AVX512VBMI-SLOW-NEXT: vzeroupper 2422; AVX512VBMI-SLOW-NEXT: retq 2423; 2424; AVX512VBMI-FAST-LABEL: mask_replication_factor4_vf2: 2425; AVX512VBMI-FAST: # %bb.0: 2426; AVX512VBMI-FAST-NEXT: kmovw (%rdi), %k1 2427; AVX512VBMI-FAST-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 2428; AVX512VBMI-FAST-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} 2429; AVX512VBMI-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] 2430; AVX512VBMI-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 2431; AVX512VBMI-FAST-NEXT: vptestmd %ymm0, %ymm0, %k1 2432; AVX512VBMI-FAST-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z} 2433; AVX512VBMI-FAST-NEXT: vmovdqa %ymm0, (%rdx) 2434; AVX512VBMI-FAST-NEXT: vzeroupper 2435; AVX512VBMI-FAST-NEXT: retq 2436 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 2437 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <2 x i32> <i32 0, i32 1> 2438 %tgt.mask = shufflevector <2 x i1> %src.mask, <2 x i1> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1> 2439 %data = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr %in.vec, i32 64, <8 x i1> %tgt.mask, <8 x i32> poison) 2440 %data.padded = shufflevector <8 x i32> %data, <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2441 store <8 x i32> %data, ptr %out.vec, align 64 2442 ret void 2443} 2444 2445define void @mask_replication_factor4_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { 2446; AVX512F-ONLY-LABEL: mask_replication_factor4_vf4: 2447; AVX512F-ONLY: # %bb.0: 2448; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 2449; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 2450; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] 2451; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 2452; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 2453; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} 2454; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) 2455; AVX512F-ONLY-NEXT: vzeroupper 2456; AVX512F-ONLY-NEXT: retq 2457; 2458; AVX512DQ-LABEL: mask_replication_factor4_vf4: 2459; AVX512DQ: # %bb.0: 2460; AVX512DQ-NEXT: kmovw (%rdi), %k0 2461; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 2462; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] 2463; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 2464; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 2465; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} 2466; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) 2467; AVX512DQ-NEXT: vzeroupper 2468; AVX512DQ-NEXT: retq 2469; 2470; AVX512BW-LABEL: mask_replication_factor4_vf4: 2471; AVX512BW: # %bb.0: 2472; AVX512BW-NEXT: kmovw (%rdi), %k1 2473; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 2474; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] 2475; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 2476; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 2477; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} 2478; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) 2479; AVX512BW-NEXT: vzeroupper 2480; 
AVX512BW-NEXT: retq 2481 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 2482 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2483 %tgt.mask = shufflevector <4 x i1> %src.mask, <4 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3> 2484 %data = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr %in.vec, i32 64, <16 x i1> %tgt.mask, <16 x i32> poison) 2485 store <16 x i32> %data, ptr %out.vec, align 64 2486 ret void 2487} 2488 2489define void @mask_replication_factor4_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { 2490; AVX512F-ONLY-LABEL: mask_replication_factor4_vf8: 2491; AVX512F-ONLY: # %bb.0: 2492; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 2493; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 2494; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] 2495; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 2496; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 2497; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] 2498; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 2499; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k2 2500; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} 2501; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z} 2502; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) 2503; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) 2504; AVX512F-ONLY-NEXT: vzeroupper 2505; AVX512F-ONLY-NEXT: retq 2506; 2507; AVX512DQ-LABEL: mask_replication_factor4_vf8: 2508; AVX512DQ: # %bb.0: 2509; AVX512DQ-NEXT: kmovb (%rdi), %k0 2510; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 2511; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] 2512; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 2513; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 2514; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] 2515; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 2516; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2 2517; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} 2518; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z} 2519; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rdx) 2520; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) 2521; AVX512DQ-NEXT: vzeroupper 2522; AVX512DQ-NEXT: retq 2523; 2524; AVX512BW-LABEL: mask_replication_factor4_vf8: 2525; AVX512BW: # %bb.0: 2526; AVX512BW-NEXT: kmovw (%rdi), %k0 2527; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 2528; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] 2529; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 2530; AVX512BW-NEXT: vpmovw2m %zmm0, %k1 2531; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} 2532; AVX512BW-NEXT: kshiftrd $16, %k1, %k1 2533; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z} 2534; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdx) 2535; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) 2536; AVX512BW-NEXT: vzeroupper 2537; AVX512BW-NEXT: retq 2538 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 2539 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2540 %tgt.mask = shufflevector <8 x i1> %src.mask, <8 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7> 2541 %data = call <32 x i32> 
@llvm.masked.load.v32i32.p0(ptr %in.vec, i32 64, <32 x i1> %tgt.mask, <32 x i32> poison) 2542 store <32 x i32> %data, ptr %out.vec, align 64 2543 ret void 2544} 2545 2546define void @mask_replication_factor4_vf16(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { 2547; AVX512F-ONLY-LABEL: mask_replication_factor4_vf16: 2548; AVX512F-ONLY: # %bb.0: 2549; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 2550; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 2551; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] 2552; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 2553; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 2554; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] 2555; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 2556; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 2557; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] 2558; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 2559; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3 2560; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] 2561; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 2562; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k4 2563; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z} 2564; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z} 2565; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} 2566; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} 2567; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) 2568; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) 2569; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) 2570; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) 2571; AVX512F-ONLY-NEXT: vzeroupper 2572; AVX512F-ONLY-NEXT: retq 2573; 2574; AVX512DQ-LABEL: mask_replication_factor4_vf16: 2575; AVX512DQ: # %bb.0: 2576; AVX512DQ-NEXT: kmovw (%rdi), %k0 2577; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 2578; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] 2579; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 2580; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 2581; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] 2582; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 2583; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 2584; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] 2585; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 2586; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3 2587; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] 2588; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 2589; AVX512DQ-NEXT: vpmovd2m %zmm0, %k4 2590; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z} 2591; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z} 2592; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} 2593; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} 2594; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rdx) 2595; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rdx) 2596; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx) 2597; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx) 2598; AVX512DQ-NEXT: vzeroupper 2599; AVX512DQ-NEXT: retq 2600; 2601; AVX512BW-ONLY-LABEL: mask_replication_factor4_vf16: 2602; AVX512BW-ONLY: # %bb.0: 2603; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0 2604; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 2605; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] 2606; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = 
zmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,20,20,20,20,21,21,21,21,22,22,22,22,23,23,23,23,40,40,40,40,41,41,41,41,42,42,42,42,43,43,43,43,60,60,60,60,61,61,61,61,62,62,62,62,63,63,63,63] 2607; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1 2608; AVX512BW-ONLY-NEXT: kshiftrq $16, %k1, %k2 2609; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} 2610; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} 2611; AVX512BW-ONLY-NEXT: kshiftrq $48, %k1, %k2 2612; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} 2613; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 2614; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} 2615; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) 2616; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) 2617; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) 2618; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) 2619; AVX512BW-ONLY-NEXT: vzeroupper 2620; AVX512BW-ONLY-NEXT: retq 2621; 2622; AVX512VBMI-ONLY-LABEL: mask_replication_factor4_vf16: 2623; AVX512VBMI-ONLY: # %bb.0: 2624; AVX512VBMI-ONLY-NEXT: kmovq (%rdi), %k0 2625; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 2626; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] 2627; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 2628; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1 2629; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k2 2630; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} 2631; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} 2632; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k1, %k2 2633; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} 2634; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 2635; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} 2636; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) 2637; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) 2638; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) 2639; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) 2640; AVX512VBMI-ONLY-NEXT: vzeroupper 2641; AVX512VBMI-ONLY-NEXT: retq 2642 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 2643 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 2644 %tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15> 2645 %data = call <64 x i32> @llvm.masked.load.v64i32.p0(ptr %in.vec, i32 64, <64 x i1> %tgt.mask, <64 x i32> poison) 2646 store <64 x i32> %data, ptr %out.vec, align 64 2647 ret void 2648} 2649 2650define void @mask_replication_factor4_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { 2651; AVX512F-ONLY-LABEL: mask_replication_factor4_vf32: 2652; AVX512F-ONLY: # %bb.0: 2653; AVX512F-ONLY-NEXT: kmovw (%rdi), %k4 2654; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 2655; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 2656; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] 
2657; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 2658; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 2659; AVX512F-ONLY-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 2660; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] 2661; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm3 2662; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k2 2663; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] 2664; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm4 2665; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k3 2666; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm4 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] 2667; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm4, %zmm0 2668; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k5 2669; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k4} {z} = -1 2670; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 2671; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k4 2672; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm1 2673; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k6 2674; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm1 2675; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k7 2676; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm4, %zmm0 2677; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 2678; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} 2679; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k7} {z} 2680; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k6} {z} 2681; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k4} {z} 2682; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k5} {z} 2683; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z} 2684; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z} 2685; AVX512F-ONLY-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 2686; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z} 2687; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx) 2688; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx) 2689; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx) 2690; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx) 2691; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) 2692; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) 2693; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) 2694; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) 2695; AVX512F-ONLY-NEXT: vzeroupper 2696; AVX512F-ONLY-NEXT: retq 2697; 2698; AVX512DQ-LABEL: mask_replication_factor4_vf32: 2699; AVX512DQ: # %bb.0: 2700; AVX512DQ-NEXT: kmovw (%rdi), %k0 2701; AVX512DQ-NEXT: kmovw 2(%rdi), %k1 2702; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0 2703; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] 2704; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 2705; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 2706; AVX512DQ-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 2707; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] 2708; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm3 2709; AVX512DQ-NEXT: vpmovd2m %zmm3, %k2 2710; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] 2711; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm4 2712; AVX512DQ-NEXT: vpmovd2m %zmm4, %k3 2713; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] 2714; AVX512DQ-NEXT: vpermd %zmm0, %zmm4, %zmm0 2715; AVX512DQ-NEXT: vpmovd2m %zmm0, %k4 2716; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 2717; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 2718; AVX512DQ-NEXT: vpmovd2m %zmm1, %k5 2719; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm1 2720; AVX512DQ-NEXT: vpmovd2m %zmm1, %k6 2721; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm1 2722; 
AVX512DQ-NEXT: vpmovd2m %zmm1, %k7 2723; AVX512DQ-NEXT: vpermd %zmm0, %zmm4, %zmm0 2724; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 2725; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} 2726; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k7} {z} 2727; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k6} {z} 2728; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k5} {z} 2729; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z} 2730; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z} 2731; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z} 2732; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 2733; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z} 2734; AVX512DQ-NEXT: vmovdqa64 %zmm7, 384(%rdx) 2735; AVX512DQ-NEXT: vmovdqa64 %zmm6, 448(%rdx) 2736; AVX512DQ-NEXT: vmovdqa64 %zmm5, 256(%rdx) 2737; AVX512DQ-NEXT: vmovdqa64 %zmm4, 320(%rdx) 2738; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rdx) 2739; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rdx) 2740; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx) 2741; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx) 2742; AVX512DQ-NEXT: vzeroupper 2743; AVX512DQ-NEXT: retq 2744; 2745; AVX512BW-ONLY-LABEL: mask_replication_factor4_vf32: 2746; AVX512BW-ONLY: # %bb.0: 2747; AVX512BW-ONLY-NEXT: kmovd (%rdi), %k0 2748; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 2749; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3] 2750; AVX512BW-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] 2751; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm1, %zmm1 2752; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k1 2753; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] 2754; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm0, %zmm0 2755; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k2 2756; AVX512BW-ONLY-NEXT: kshiftrq $16, %k2, %k3 2757; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z} 2758; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z} 2759; AVX512BW-ONLY-NEXT: kshiftrq $48, %k2, %k3 2760; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z} 2761; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2 2762; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z} 2763; AVX512BW-ONLY-NEXT: kshiftrq $16, %k1, %k2 2764; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z} 2765; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z} 2766; AVX512BW-ONLY-NEXT: kshiftrq $48, %k1, %k2 2767; AVX512BW-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z} 2768; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 2769; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z} 2770; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx) 2771; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx) 2772; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx) 2773; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx) 2774; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) 2775; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) 2776; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) 2777; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) 2778; AVX512BW-ONLY-NEXT: vzeroupper 2779; AVX512BW-ONLY-NEXT: retq 2780; 2781; AVX512VBMI-ONLY-LABEL: mask_replication_factor4_vf32: 2782; AVX512VBMI-ONLY: # %bb.0: 2783; AVX512VBMI-ONLY-NEXT: kmovd (%rdi), %k0 2784; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 2785; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[16,16,16,16,17,17,17,17,18,18,18,18,19,19,19,19,20,20,20,20,21,21,21,21,22,22,22,22,23,23,23,23,24,24,24,24,25,25,25,25,26,26,26,26,27,27,27,27,28,28,28,28,29,29,29,29,30,30,30,30,31,31,31,31] 2786; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 2787; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k1 2788; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] 2789; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 2790; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k2 2791; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k2, %k3 2792; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z} 2793; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z} 2794; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k2, %k3 2795; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z} 2796; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2 2797; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z} 2798; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k2 2799; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z} 2800; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z} 2801; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k1, %k2 2802; AVX512VBMI-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z} 2803; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 2804; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z} 2805; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx) 2806; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx) 2807; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx) 2808; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx) 2809; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) 2810; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) 2811; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) 2812; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) 2813; AVX512VBMI-ONLY-NEXT: vzeroupper 2814; AVX512VBMI-ONLY-NEXT: retq 2815 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 2816 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 2817 %tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31> 2818 %data = call <128 x i32> @llvm.masked.load.v128i32.p0(ptr %in.vec, i32 64, <128 x i1> %tgt.mask, <128 x i32> poison) 2819 store <128 x i32> 
%data, ptr %out.vec, align 64 2820 ret void 2821} 2822 2823define void @mask_replication_factor4_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { 2824; AVX512F-ONLY-LABEL: mask_replication_factor4_vf64: 2825; AVX512F-ONLY: # %bb.0: 2826; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1 2827; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 2828; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k1 2829; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 2830; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 2831; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1 2832; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 2833; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm3 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] 2834; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm4 2835; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm5 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] 2836; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm6 2837; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] 2838; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm8 2839; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] 2840; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm0 2841; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm3, %zmm10 2842; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm5, %zmm11 2843; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm7, %zmm12 2844; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm9, %zmm1 2845; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm3, %zmm13 2846; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm5, %zmm14 2847; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm7, %zmm15 2848; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm9, %zmm2 2849; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm16 {%k1} {z} = -1 2850; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm3, %zmm3 2851; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm5, %zmm5 2852; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm7, %zmm7 2853; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm9, %zmm9 2854; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1 2855; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm9 {%k1} {z} 2856; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 2857; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm7 {%k1} {z} 2858; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 2859; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm5 {%k1} {z} 2860; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 2861; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} 2862; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 2863; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k1} {z} 2864; AVX512F-ONLY-NEXT: vptestmd %zmm15, %zmm15, %k1 2865; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm15 {%k1} {z} 2866; AVX512F-ONLY-NEXT: vptestmd %zmm14, %zmm14, %k1 2867; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm14 {%k1} {z} 2868; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k1 2869; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm13 {%k1} {z} 2870; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 2871; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm1 {%k1} {z} 2872; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1 2873; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm12 {%k1} {z} 2874; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1 2875; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k1} {z} 2876; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1 2877; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm10 {%k1} {z} 2878; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 2879; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm0 {%k1} {z} 2880; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 2881; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm8 {%k1} {z} 2882; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 2883; 
AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm6 {%k1} {z} 2884; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 2885; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm4 {%k1} {z} 2886; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 960(%rdx) 2887; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 896(%rdx) 2888; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 832(%rdx) 2889; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 768(%rdx) 2890; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 704(%rdx) 2891; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 640(%rdx) 2892; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 576(%rdx) 2893; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 512(%rdx) 2894; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 448(%rdx) 2895; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 384(%rdx) 2896; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, 320(%rdx) 2897; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 256(%rdx) 2898; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) 2899; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 128(%rdx) 2900; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 64(%rdx) 2901; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, (%rdx) 2902; AVX512F-ONLY-NEXT: vzeroupper 2903; AVX512F-ONLY-NEXT: retq 2904; 2905; AVX512DQ-LABEL: mask_replication_factor4_vf64: 2906; AVX512DQ: # %bb.0: 2907; AVX512DQ-NEXT: kmovw 6(%rdi), %k0 2908; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 2909; AVX512DQ-NEXT: kmovw 4(%rdi), %k0 2910; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 2911; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 2912; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 2913; AVX512DQ-NEXT: kmovw (%rdi), %k0 2914; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] 2915; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm4 2916; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] 2917; AVX512DQ-NEXT: vpermd %zmm0, %zmm5, %zmm6 2918; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] 2919; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm8 2920; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] 2921; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm0 2922; AVX512DQ-NEXT: vpermd %zmm1, %zmm3, %zmm10 2923; AVX512DQ-NEXT: vpermd %zmm1, %zmm5, %zmm11 2924; AVX512DQ-NEXT: vpermd %zmm1, %zmm7, %zmm12 2925; AVX512DQ-NEXT: vpermd %zmm1, %zmm9, %zmm1 2926; AVX512DQ-NEXT: vpermd %zmm2, %zmm3, %zmm13 2927; AVX512DQ-NEXT: vpermd %zmm2, %zmm5, %zmm14 2928; AVX512DQ-NEXT: vpermd %zmm2, %zmm7, %zmm15 2929; AVX512DQ-NEXT: vpermd %zmm2, %zmm9, %zmm2 2930; AVX512DQ-NEXT: vpmovm2d %k0, %zmm16 2931; AVX512DQ-NEXT: vpermd %zmm16, %zmm3, %zmm3 2932; AVX512DQ-NEXT: vpermd %zmm16, %zmm5, %zmm5 2933; AVX512DQ-NEXT: vpermd %zmm16, %zmm7, %zmm7 2934; AVX512DQ-NEXT: vpermd %zmm16, %zmm9, %zmm9 2935; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1 2936; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm9 {%k1} {z} 2937; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 2938; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm7 {%k1} {z} 2939; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 2940; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm5 {%k1} {z} 2941; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 2942; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} 2943; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 2944; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k1} {z} 2945; AVX512DQ-NEXT: vpmovd2m %zmm15, %k1 2946; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm15 {%k1} {z} 2947; AVX512DQ-NEXT: vpmovd2m %zmm14, %k1 2948; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm14 {%k1} {z} 2949; AVX512DQ-NEXT: vpmovd2m %zmm13, %k1 2950; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm13 {%k1} {z} 2951; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 2952; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm1 {%k1} {z} 2953; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1 2954; 
AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm12 {%k1} {z} 2955; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1 2956; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k1} {z} 2957; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1 2958; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm10 {%k1} {z} 2959; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 2960; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm0 {%k1} {z} 2961; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 2962; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm8 {%k1} {z} 2963; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 2964; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm6 {%k1} {z} 2965; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 2966; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm4 {%k1} {z} 2967; AVX512DQ-NEXT: vmovdqa64 %zmm4, 960(%rdx) 2968; AVX512DQ-NEXT: vmovdqa64 %zmm6, 896(%rdx) 2969; AVX512DQ-NEXT: vmovdqa64 %zmm8, 832(%rdx) 2970; AVX512DQ-NEXT: vmovdqa64 %zmm0, 768(%rdx) 2971; AVX512DQ-NEXT: vmovdqa64 %zmm10, 704(%rdx) 2972; AVX512DQ-NEXT: vmovdqa64 %zmm11, 640(%rdx) 2973; AVX512DQ-NEXT: vmovdqa64 %zmm12, 576(%rdx) 2974; AVX512DQ-NEXT: vmovdqa64 %zmm1, 512(%rdx) 2975; AVX512DQ-NEXT: vmovdqa64 %zmm13, 448(%rdx) 2976; AVX512DQ-NEXT: vmovdqa64 %zmm14, 384(%rdx) 2977; AVX512DQ-NEXT: vmovdqa64 %zmm15, 320(%rdx) 2978; AVX512DQ-NEXT: vmovdqa64 %zmm2, 256(%rdx) 2979; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rdx) 2980; AVX512DQ-NEXT: vmovdqa64 %zmm5, 128(%rdx) 2981; AVX512DQ-NEXT: vmovdqa64 %zmm7, 64(%rdx) 2982; AVX512DQ-NEXT: vmovdqa64 %zmm9, (%rdx) 2983; AVX512DQ-NEXT: vzeroupper 2984; AVX512DQ-NEXT: retq 2985; 2986; AVX512BW-ONLY-LABEL: mask_replication_factor4_vf64: 2987; AVX512BW-ONLY: # %bb.0: 2988; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0 2989; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 2990; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[6,7,6,7,6,7,6,7] 2991; AVX512BW-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] 2992; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm1, %zmm1 2993; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k1 2994; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,4,5,4,5,4,5] 2995; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm1, %zmm1 2996; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k2 2997; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3] 2998; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm1, %zmm1 2999; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k3 3000; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] 3001; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm0, %zmm0 3002; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k4 3003; AVX512BW-ONLY-NEXT: kshiftrq $16, %k4, %k5 3004; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k5} {z} 3005; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k4} {z} 3006; AVX512BW-ONLY-NEXT: kshiftrq $48, %k4, %k5 3007; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k5} {z} 3008; AVX512BW-ONLY-NEXT: kshiftrq $32, %k4, %k4 3009; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k4} {z} 3010; AVX512BW-ONLY-NEXT: kshiftrq $16, %k3, %k4 3011; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z} 3012; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z} 3013; AVX512BW-ONLY-NEXT: kshiftrq $48, %k3, %k4 3014; AVX512BW-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k4} {z} 3015; AVX512BW-ONLY-NEXT: kshiftrq $32, %k3, %k3 3016; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k3} {z} 3017; AVX512BW-ONLY-NEXT: kshiftrq $16, %k2, %k3 3018; AVX512BW-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm8 {%k3} {z} 3019; AVX512BW-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm9 {%k2} {z} 3020; 
AVX512BW-ONLY-NEXT: kshiftrq $48, %k2, %k3 3021; AVX512BW-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm10 {%k3} {z} 3022; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2 3023; AVX512BW-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k2} {z} 3024; AVX512BW-ONLY-NEXT: kshiftrq $16, %k1, %k2 3025; AVX512BW-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm12 {%k2} {z} 3026; AVX512BW-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm13 {%k1} {z} 3027; AVX512BW-ONLY-NEXT: kshiftrq $48, %k1, %k2 3028; AVX512BW-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm14 {%k2} {z} 3029; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 3030; AVX512BW-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm15 {%k1} {z} 3031; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm15, 896(%rdx) 3032; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm14, 960(%rdx) 3033; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm13, 768(%rdx) 3034; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm12, 832(%rdx) 3035; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm11, 640(%rdx) 3036; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm10, 704(%rdx) 3037; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm9, 512(%rdx) 3038; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm8, 576(%rdx) 3039; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx) 3040; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx) 3041; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx) 3042; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx) 3043; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) 3044; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) 3045; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) 3046; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) 3047; AVX512BW-ONLY-NEXT: vzeroupper 3048; AVX512BW-ONLY-NEXT: retq 3049; 3050; AVX512VBMI-ONLY-LABEL: mask_replication_factor4_vf64: 3051; AVX512VBMI-ONLY: # %bb.0: 3052; AVX512VBMI-ONLY-NEXT: kmovq (%rdi), %k0 3053; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 3054; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [48,48,48,48,49,49,49,49,50,50,50,50,51,51,51,51,52,52,52,52,53,53,53,53,54,54,54,54,55,55,55,55,56,56,56,56,57,57,57,57,58,58,58,58,59,59,59,59,60,60,60,60,61,61,61,61,62,62,62,62,63,63,63,63] 3055; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 3056; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k1 3057; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [32,32,32,32,33,33,33,33,34,34,34,34,35,35,35,35,36,36,36,36,37,37,37,37,38,38,38,38,39,39,39,39,40,40,40,40,41,41,41,41,42,42,42,42,43,43,43,43,44,44,44,44,45,45,45,45,46,46,46,46,47,47,47,47] 3058; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 3059; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k2 3060; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,16,16,16,17,17,17,17,18,18,18,18,19,19,19,19,20,20,20,20,21,21,21,21,22,22,22,22,23,23,23,23,24,24,24,24,25,25,25,25,26,26,26,26,27,27,27,27,28,28,28,28,29,29,29,29,30,30,30,30,31,31,31,31] 3061; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 3062; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k3 3063; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] 3064; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 3065; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k4 3066; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k4, %k5 3067; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k5} {z} 3068; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k4} {z} 3069; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k4, %k5 3070; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k5} {z} 3071; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k4, %k4 3072; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k4} {z} 3073; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k3, 
%k4 3074; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z} 3075; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z} 3076; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k3, %k4 3077; AVX512VBMI-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k4} {z} 3078; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k3, %k3 3079; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k3} {z} 3080; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k2, %k3 3081; AVX512VBMI-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm8 {%k3} {z} 3082; AVX512VBMI-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm9 {%k2} {z} 3083; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k2, %k3 3084; AVX512VBMI-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm10 {%k3} {z} 3085; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2 3086; AVX512VBMI-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k2} {z} 3087; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k2 3088; AVX512VBMI-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm12 {%k2} {z} 3089; AVX512VBMI-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm13 {%k1} {z} 3090; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k1, %k2 3091; AVX512VBMI-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm14 {%k2} {z} 3092; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 3093; AVX512VBMI-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm15 {%k1} {z} 3094; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm15, 896(%rdx) 3095; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm14, 960(%rdx) 3096; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm13, 768(%rdx) 3097; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm12, 832(%rdx) 3098; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm11, 640(%rdx) 3099; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm10, 704(%rdx) 3100; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm9, 512(%rdx) 3101; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm8, 576(%rdx) 3102; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx) 3103; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx) 3104; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx) 3105; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx) 3106; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) 3107; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) 3108; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) 3109; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) 3110; AVX512VBMI-ONLY-NEXT: vzeroupper 3111; AVX512VBMI-ONLY-NEXT: retq 3112 %src.mask = load <64 x i1>, ptr %in.maskvec, align 64 3113 %tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 
40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63> 3114 %data = call <256 x i32> @llvm.masked.load.v256i32.p0(ptr %in.vec, i32 64, <256 x i1> %tgt.mask, <256 x i32> poison) 3115 store <256 x i32> %data, ptr %out.vec, align 64 3116 ret void 3117} 3118 3119define void @mask_replication_factor5_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { 3120; AVX512F-ONLY-LABEL: mask_replication_factor5_vf2: 3121; AVX512F-ONLY: # %bb.0: 3122; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 3123; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 3124; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0] 3125; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 3126; AVX512F-ONLY-NEXT: vpslld $31, %zmm0, %zmm0 3127; AVX512F-ONLY-NEXT: movw $1023, %ax # imm = 0x3FF 3128; AVX512F-ONLY-NEXT: kmovw %eax, %k1 3129; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} 3130; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} 3131; AVX512F-ONLY-NEXT: vextracti32x4 $2, %zmm0, %xmm1 3132; AVX512F-ONLY-NEXT: vmovq %xmm1, 32(%rdx) 3133; AVX512F-ONLY-NEXT: vmovdqa %ymm0, (%rdx) 3134; AVX512F-ONLY-NEXT: vzeroupper 3135; AVX512F-ONLY-NEXT: retq 3136; 3137; AVX512DQ-LABEL: mask_replication_factor5_vf2: 3138; AVX512DQ: # %bb.0: 3139; AVX512DQ-NEXT: kmovw (%rdi), %k0 3140; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 3141; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0] 3142; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 3143; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 3144; AVX512DQ-NEXT: movw $1023, %ax # imm = 0x3FF 3145; AVX512DQ-NEXT: kmovw %eax, %k1 3146; AVX512DQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 {%k1} 3147; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} 3148; AVX512DQ-NEXT: vextracti32x4 $2, %zmm0, %xmm1 3149; AVX512DQ-NEXT: vmovq %xmm1, 32(%rdx) 3150; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) 3151; AVX512DQ-NEXT: vzeroupper 3152; AVX512DQ-NEXT: retq 3153; 3154; AVX512BW-LABEL: mask_replication_factor5_vf2: 3155; AVX512BW: # %bb.0: 3156; AVX512BW-NEXT: kmovw (%rdi), %k1 3157; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 3158; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0] 3159; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 3160; AVX512BW-NEXT: vpslld $31, %zmm0, %zmm0 3161; AVX512BW-NEXT: movw $1023, %ax # imm = 0x3FF 3162; AVX512BW-NEXT: kmovd %eax, %k1 3163; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} 3164; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} 3165; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm1 3166; AVX512BW-NEXT: vmovq %xmm1, 32(%rdx) 3167; AVX512BW-NEXT: vmovdqa %ymm0, (%rdx) 3168; AVX512BW-NEXT: vzeroupper 3169; AVX512BW-NEXT: retq 3170 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 3171 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <2 x i32> <i32 0, i32 1> 3172 %tgt.mask = shufflevector <2 x i1> 
%src.mask, <2 x i1> poison, <10 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1> 3173 %data = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr %in.vec, i32 64, <10 x i1> %tgt.mask, <10 x i32> poison) 3174 %data.padded = shufflevector <10 x i32> %data, <10 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 3175 store <10 x i32> %data, ptr %out.vec, align 64 3176 ret void 3177} 3178 3179define void @mask_replication_factor5_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { 3180; AVX512F-ONLY-LABEL: mask_replication_factor5_vf4: 3181; AVX512F-ONLY: # %bb.0: 3182; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 3183; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 3184; AVX512F-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] 3185; AVX512F-ONLY-NEXT: vpslld $31, %zmm1, %zmm1 3186; AVX512F-ONLY-NEXT: movw $15, %ax 3187; AVX512F-ONLY-NEXT: kmovw %eax, %k1 3188; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1} 3189; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] 3190; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 3191; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k2 3192; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} 3193; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z} 3194; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) 3195; AVX512F-ONLY-NEXT: vmovdqa %xmm0, 64(%rdx) 3196; AVX512F-ONLY-NEXT: vzeroupper 3197; AVX512F-ONLY-NEXT: retq 3198; 3199; AVX512DQ-LABEL: mask_replication_factor5_vf4: 3200; AVX512DQ: # %bb.0: 3201; AVX512DQ-NEXT: kmovw (%rdi), %k0 3202; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 3203; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] 3204; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 3205; AVX512DQ-NEXT: movw $15, %ax 3206; AVX512DQ-NEXT: kmovw %eax, %k1 3207; AVX512DQ-NEXT: vpcmpgtd %zmm1, %zmm2, %k1 {%k1} 3208; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] 3209; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 3210; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2 3211; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} 3212; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z} 3213; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx) 3214; AVX512DQ-NEXT: vmovdqa %xmm0, 64(%rdx) 3215; AVX512DQ-NEXT: vzeroupper 3216; AVX512DQ-NEXT: retq 3217; 3218; AVX512BW-LABEL: mask_replication_factor5_vf4: 3219; AVX512BW: # %bb.0: 3220; AVX512BW-NEXT: kmovd (%rdi), %k0 3221; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 3222; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0] 3223; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 3224; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 3225; AVX512BW-NEXT: movl $1048575, %eax # imm = 0xFFFFF 3226; AVX512BW-NEXT: kmovd %eax, %k1 3227; AVX512BW-NEXT: vpcmpgtw %zmm0, %zmm1, %k1 {%k1} 3228; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 3229; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} 3230; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} 3231; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx) 3232; AVX512BW-NEXT: vmovdqa %xmm0, 64(%rdx) 3233; AVX512BW-NEXT: vzeroupper 3234; AVX512BW-NEXT: retq 3235 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 3236 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 3237 %tgt.mask = shufflevector <4 x i1> %src.mask, <4 x i1> poison, <20 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 
2, i32 3, i32 3, i32 3, i32 3, i32 3> 3238 %data = call <20 x i32> @llvm.masked.load.v20i32.p0(ptr %in.vec, i32 64, <20 x i1> %tgt.mask, <20 x i32> poison) 3239 %data.padded = shufflevector <20 x i32> %data, <20 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 3240 store <20 x i32> %data, ptr %out.vec, align 64 3241 ret void 3242} 3243 3244define void @mask_replication_factor5_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { 3245; AVX512F-ONLY-LABEL: mask_replication_factor5_vf8: 3246; AVX512F-ONLY: # %bb.0: 3247; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 3248; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 3249; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] 3250; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 3251; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 3252; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm1 {%k2} {z} = -1 3253; AVX512F-ONLY-NEXT: movw $1, %ax 3254; AVX512F-ONLY-NEXT: kmovw %eax, %k2 3255; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} 3256; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 3257; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] 3258; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 3259; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k3 3260; AVX512F-ONLY-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 3261; AVX512F-ONLY-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} 3262; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,6,6,7,7,7,7,7] 3263; AVX512F-ONLY-NEXT: vpermd %ymm0, %ymm1, %ymm0 3264; AVX512F-ONLY-NEXT: vptestmd %ymm0, %ymm0, %k1 3265; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} 3266; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k1} {z} 3267; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k3} {z} 3268; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 64(%rdx) 3269; AVX512F-ONLY-NEXT: vmovdqa %ymm1, 128(%rdx) 3270; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) 3271; AVX512F-ONLY-NEXT: vzeroupper 3272; AVX512F-ONLY-NEXT: retq 3273; 3274; AVX512DQ-LABEL: mask_replication_factor5_vf8: 3275; AVX512DQ: # %bb.0: 3276; AVX512DQ-NEXT: kmovb (%rdi), %k0 3277; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 3278; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] 3279; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 3280; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 3281; AVX512DQ-NEXT: vpmovm2d %k1, %zmm1 3282; AVX512DQ-NEXT: movw $1, %ax 3283; AVX512DQ-NEXT: kmovw %eax, %k1 3284; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} 3285; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 3286; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] 3287; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 3288; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2 3289; AVX512DQ-NEXT: vpmovm2d %k0, %ymm0 3290; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,6,6,7,7,7,7,7] 3291; AVX512DQ-NEXT: vpermd %ymm0, %ymm1, %ymm0 3292; AVX512DQ-NEXT: vpmovd2m %ymm0, %k3 3293; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} 3294; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k3} {z} 3295; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k2} {z} 3296; AVX512DQ-NEXT: vmovdqa64 %zmm2, 64(%rdx) 3297; AVX512DQ-NEXT: vmovdqa %ymm1, 128(%rdx) 3298; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) 3299; AVX512DQ-NEXT: vzeroupper 3300; AVX512DQ-NEXT: retq 3301; 3302; AVX512BW-ONLY-LABEL: mask_replication_factor5_vf8: 3303; AVX512BW-ONLY: 
# %bb.0: 3304; AVX512BW-ONLY-NEXT: kmovw (%rdi), %k0 3305; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 3306; AVX512BW-ONLY-NEXT: vpbroadcastq %xmm0, %zmm0 3307; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3,19,19,19,19,20,20,20,20,20,21,21,21,21,21,22,22,38,38,38,39,39,39,39,39,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 3308; AVX512BW-ONLY-NEXT: vpxor %xmm1, %xmm1, %xmm1 3309; AVX512BW-ONLY-NEXT: movabsq $1099511627775, %rax # imm = 0xFFFFFFFFFF 3310; AVX512BW-ONLY-NEXT: kmovq %rax, %k1 3311; AVX512BW-ONLY-NEXT: vpcmpgtb %zmm0, %zmm1, %k1 {%k1} 3312; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k2 3313; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm0 {%k2} {z} 3314; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} 3315; AVX512BW-ONLY-NEXT: kshiftrq $16, %k1, %k1 3316; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k1} {z} 3317; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 64(%rdx) 3318; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) 3319; AVX512BW-ONLY-NEXT: vmovdqa %ymm0, 128(%rdx) 3320; AVX512BW-ONLY-NEXT: vzeroupper 3321; AVX512BW-ONLY-NEXT: retq 3322; 3323; AVX512VBMI-ONLY-LABEL: mask_replication_factor5_vf8: 3324; AVX512VBMI-ONLY: # %bb.0: 3325; AVX512VBMI-ONLY-NEXT: kmovw (%rdi), %k0 3326; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 3327; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3,3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6,6,6,6,7,7,7,7,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 3328; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 3329; AVX512VBMI-ONLY-NEXT: vpxor %xmm1, %xmm1, %xmm1 3330; AVX512VBMI-ONLY-NEXT: movabsq $1099511627775, %rax # imm = 0xFFFFFFFFFF 3331; AVX512VBMI-ONLY-NEXT: kmovq %rax, %k1 3332; AVX512VBMI-ONLY-NEXT: vpcmpgtb %zmm0, %zmm1, %k1 {%k1} 3333; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k2 3334; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm0 {%k2} {z} 3335; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} 3336; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k1 3337; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k1} {z} 3338; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 64(%rdx) 3339; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) 3340; AVX512VBMI-ONLY-NEXT: vmovdqa %ymm0, 128(%rdx) 3341; AVX512VBMI-ONLY-NEXT: vzeroupper 3342; AVX512VBMI-ONLY-NEXT: retq 3343 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 3344 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 3345 %tgt.mask = shufflevector <8 x i1> %src.mask, <8 x i1> poison, <40 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7> 3346 %data = call <40 x i32> @llvm.masked.load.v40i32.p0(ptr %in.vec, i32 64, <40 x i1> %tgt.mask, <40 x i32> poison) 3347 %data.padded = shufflevector <40 x i32> %data, <40 x i32> poison, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 3348 store <40 x i32> %data, ptr %out.vec, align 64 3349 ret void 3350} 3351 3352define void 
@mask_replication_factor5_vf16(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { 3353; AVX512F-ONLY-LABEL: mask_replication_factor5_vf16: 3354; AVX512F-ONLY: # %bb.0: 3355; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 3356; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 3357; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] 3358; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 3359; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 3360; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 3361; AVX512F-ONLY-NEXT: movw $1, %ax 3362; AVX512F-ONLY-NEXT: kmovw %eax, %k1 3363; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} 3364; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 3365; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] 3366; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 3367; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 3368; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] 3369; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 3370; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3 3371; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] 3372; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 3373; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k4 3374; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] 3375; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 3376; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k5 3377; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} 3378; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm1 {%k5} {z} 3379; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k4} {z} 3380; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k3} {z} 3381; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k2} {z} 3382; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 64(%rdx) 3383; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) 3384; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) 3385; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 256(%rdx) 3386; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) 3387; AVX512F-ONLY-NEXT: vzeroupper 3388; AVX512F-ONLY-NEXT: retq 3389; 3390; AVX512DQ-LABEL: mask_replication_factor5_vf16: 3391; AVX512DQ: # %bb.0: 3392; AVX512DQ-NEXT: kmovw (%rdi), %k0 3393; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 3394; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] 3395; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 3396; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 3397; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 3398; AVX512DQ-NEXT: movw $1, %ax 3399; AVX512DQ-NEXT: kmovw %eax, %k1 3400; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} 3401; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 3402; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] 3403; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 3404; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 3405; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] 3406; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 3407; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3 3408; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] 3409; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 3410; AVX512DQ-NEXT: vpmovd2m %zmm1, %k4 3411; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] 3412; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 3413; AVX512DQ-NEXT: vpmovd2m %zmm0, %k5 3414; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} 3415; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm1 {%k5} {z} 3416; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k4} {z} 
3417; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k3} {z} 3418; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k2} {z} 3419; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%rdx) 3420; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rdx) 3421; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rdx) 3422; AVX512DQ-NEXT: vmovdqa64 %zmm1, 256(%rdx) 3423; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) 3424; AVX512DQ-NEXT: vzeroupper 3425; AVX512DQ-NEXT: retq 3426; 3427; AVX512BW-LABEL: mask_replication_factor5_vf16: 3428; AVX512BW: # %bb.0: 3429; AVX512BW-NEXT: kmovw (%rdi), %k1 3430; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 3431; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] 3432; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1 3433; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 3434; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} 3435; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] 3436; AVX512BW-NEXT: vpermd %zmm0, %zmm2, %zmm2 3437; AVX512BW-NEXT: vptestmd %zmm2, %zmm2, %k1 3438; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k1} {z} 3439; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] 3440; AVX512BW-NEXT: vpermd %zmm0, %zmm3, %zmm3 3441; AVX512BW-NEXT: vptestmd %zmm3, %zmm3, %k1 3442; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} 3443; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] 3444; AVX512BW-NEXT: vpermd %zmm0, %zmm4, %zmm4 3445; AVX512BW-NEXT: vptestmd %zmm4, %zmm4, %k1 3446; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm4 {%k1} {z} 3447; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] 3448; AVX512BW-NEXT: vpermd %zmm0, %zmm5, %zmm0 3449; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 3450; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} 3451; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx) 3452; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rdx) 3453; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx) 3454; AVX512BW-NEXT: vmovdqa64 %zmm2, 256(%rdx) 3455; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx) 3456; AVX512BW-NEXT: vzeroupper 3457; AVX512BW-NEXT: retq 3458 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 3459 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 3460 %tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <80 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15> 3461 %data = call <80 x i32> @llvm.masked.load.v80i32.p0(ptr %in.vec, i32 64, <80 x i1> %tgt.mask, <80 x i32> poison) 3462 store <80 x i32> %data, ptr %out.vec, align 64 3463 ret void 3464} 3465 3466define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { 3467; AVX512F-ONLY-LABEL: mask_replication_factor5_vf32: 3468; AVX512F-ONLY: # %bb.0: 3469; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 3470; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 3471; AVX512F-ONLY-NEXT: 
vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] 3472; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 3473; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 3474; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1 3475; AVX512F-ONLY-NEXT: movw $1, %ax 3476; AVX512F-ONLY-NEXT: kmovw %eax, %k1 3477; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} 3478; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 3479; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1 3480; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 3481; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] 3482; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm4 3483; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm5 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] 3484; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm6 3485; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm7 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] 3486; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm8 3487; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm9 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] 3488; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm0 3489; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm1, %zmm1 3490; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm2, %zmm2 3491; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm5, %zmm5 3492; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm7, %zmm7 3493; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm9, %zmm3 3494; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm9 {%k1} {z} 3495; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 3496; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm3 {%k1} {z} 3497; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 3498; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm7 {%k1} {z} 3499; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 3500; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm5 {%k1} {z} 3501; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 3502; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm2 {%k1} {z} 3503; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 3504; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k1} {z} 3505; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 3506; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm0 {%k1} {z} 3507; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 3508; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z} 3509; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 3510; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm6 {%k1} {z} 3511; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 3512; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z} 3513; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 64(%rdx) 3514; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 128(%rdx) 3515; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 192(%rdx) 3516; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 256(%rdx) 3517; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 320(%rdx) 3518; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 384(%rdx) 3519; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 448(%rdx) 3520; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 512(%rdx) 3521; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 576(%rdx) 3522; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, (%rdx) 3523; AVX512F-ONLY-NEXT: vzeroupper 3524; AVX512F-ONLY-NEXT: retq 3525; 3526; AVX512DQ-LABEL: mask_replication_factor5_vf32: 3527; AVX512DQ: # %bb.0: 3528; AVX512DQ-NEXT: kmovw (%rdi), %k0 3529; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 3530; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] 3531; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 3532; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0 3533; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 3534; AVX512DQ-NEXT: movw $1, %ax 3535; AVX512DQ-NEXT: kmovw %eax, %k1 3536; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} 3537; AVX512DQ-NEXT: kmovw 
2(%rdi), %k0 3538; AVX512DQ-NEXT: vpmovm2d %k0, %zmm3 3539; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 3540; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] 3541; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm4 3542; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] 3543; AVX512DQ-NEXT: vpermd %zmm0, %zmm5, %zmm6 3544; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] 3545; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm8 3546; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] 3547; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm0 3548; AVX512DQ-NEXT: vpermd %zmm3, %zmm1, %zmm1 3549; AVX512DQ-NEXT: vpermd %zmm3, %zmm2, %zmm2 3550; AVX512DQ-NEXT: vpermd %zmm3, %zmm5, %zmm5 3551; AVX512DQ-NEXT: vpermd %zmm3, %zmm7, %zmm7 3552; AVX512DQ-NEXT: vpermd %zmm3, %zmm9, %zmm3 3553; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm9 {%k1} {z} 3554; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 3555; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm3 {%k1} {z} 3556; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 3557; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm7 {%k1} {z} 3558; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 3559; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm5 {%k1} {z} 3560; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 3561; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm2 {%k1} {z} 3562; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 3563; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k1} {z} 3564; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 3565; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm0 {%k1} {z} 3566; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 3567; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z} 3568; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 3569; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm6 {%k1} {z} 3570; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 3571; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z} 3572; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%rdx) 3573; AVX512DQ-NEXT: vmovdqa64 %zmm6, 128(%rdx) 3574; AVX512DQ-NEXT: vmovdqa64 %zmm8, 192(%rdx) 3575; AVX512DQ-NEXT: vmovdqa64 %zmm0, 256(%rdx) 3576; AVX512DQ-NEXT: vmovdqa64 %zmm1, 320(%rdx) 3577; AVX512DQ-NEXT: vmovdqa64 %zmm2, 384(%rdx) 3578; AVX512DQ-NEXT: vmovdqa64 %zmm5, 448(%rdx) 3579; AVX512DQ-NEXT: vmovdqa64 %zmm7, 512(%rdx) 3580; AVX512DQ-NEXT: vmovdqa64 %zmm3, 576(%rdx) 3581; AVX512DQ-NEXT: vmovdqa64 %zmm9, (%rdx) 3582; AVX512DQ-NEXT: vzeroupper 3583; AVX512DQ-NEXT: retq 3584; 3585; AVX512BW-LABEL: mask_replication_factor5_vf32: 3586; AVX512BW: # %bb.0: 3587; AVX512BW-NEXT: kmovd (%rdi), %k5 3588; AVX512BW-NEXT: kshiftrd $1, %k5, %k1 3589; AVX512BW-NEXT: movw $-3, %ax 3590; AVX512BW-NEXT: kmovd %eax, %k6 3591; AVX512BW-NEXT: kmovw (%rdi), %k2 3592; AVX512BW-NEXT: kandw %k6, %k2, %k3 3593; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 3594; AVX512BW-NEXT: kshiftrw $14, %k2, %k4 3595; AVX512BW-NEXT: korw %k4, %k3, %k3 3596; AVX512BW-NEXT: movw $-5, %ax 3597; AVX512BW-NEXT: kmovd %eax, %k4 3598; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 3599; AVX512BW-NEXT: kandw %k4, %k3, %k3 3600; AVX512BW-NEXT: kshiftrw $13, %k2, %k4 3601; AVX512BW-NEXT: korw %k4, %k3, %k3 3602; AVX512BW-NEXT: movw $-9, %ax 3603; AVX512BW-NEXT: kmovd %eax, %k4 3604; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 3605; AVX512BW-NEXT: kandw %k4, %k3, %k3 3606; AVX512BW-NEXT: kshiftrw $12, %k2, %k4 3607; AVX512BW-NEXT: korw %k4, %k3, %k3 3608; AVX512BW-NEXT: movw $-17, %ax 3609; AVX512BW-NEXT: kmovd %eax, %k4 3610; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 3611; AVX512BW-NEXT: kandw %k4, %k3, %k3 3612; AVX512BW-NEXT: 
kshiftrw $11, %k2, %k2 3613; AVX512BW-NEXT: korw %k2, %k3, %k2 3614; AVX512BW-NEXT: movw $-33, %ax 3615; AVX512BW-NEXT: kmovd %eax, %k3 3616; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 3617; AVX512BW-NEXT: kandw %k3, %k2, %k2 3618; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 3619; AVX512BW-NEXT: kshiftrw $10, %k1, %k3 3620; AVX512BW-NEXT: korw %k3, %k2, %k2 3621; AVX512BW-NEXT: movw $-65, %ax 3622; AVX512BW-NEXT: kmovd %eax, %k3 3623; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 3624; AVX512BW-NEXT: kandw %k3, %k2, %k2 3625; AVX512BW-NEXT: kshiftrw $9, %k1, %k3 3626; AVX512BW-NEXT: korw %k3, %k2, %k2 3627; AVX512BW-NEXT: movw $-129, %ax 3628; AVX512BW-NEXT: kmovd %eax, %k3 3629; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 3630; AVX512BW-NEXT: kandw %k3, %k2, %k2 3631; AVX512BW-NEXT: kshiftrw $8, %k1, %k3 3632; AVX512BW-NEXT: korw %k3, %k2, %k2 3633; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF 3634; AVX512BW-NEXT: kmovd %eax, %k3 3635; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 3636; AVX512BW-NEXT: kandw %k3, %k2, %k2 3637; AVX512BW-NEXT: kshiftrw $7, %k1, %k3 3638; AVX512BW-NEXT: korw %k3, %k2, %k2 3639; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF 3640; AVX512BW-NEXT: kmovd %eax, %k7 3641; AVX512BW-NEXT: kandw %k7, %k2, %k2 3642; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 3643; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 3644; AVX512BW-NEXT: korw %k1, %k2, %k1 3645; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF 3646; AVX512BW-NEXT: kmovd %eax, %k2 3647; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 3648; AVX512BW-NEXT: kandw %k2, %k1, %k3 3649; AVX512BW-NEXT: kshiftrd $2, %k5, %k1 3650; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 3651; AVX512BW-NEXT: kshiftrw $5, %k2, %k4 3652; AVX512BW-NEXT: korw %k4, %k3, %k3 3653; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF 3654; AVX512BW-NEXT: kmovd %eax, %k4 3655; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 3656; AVX512BW-NEXT: kandw %k4, %k3, %k3 3657; AVX512BW-NEXT: kshiftrw $4, %k2, %k4 3658; AVX512BW-NEXT: korw %k4, %k3, %k3 3659; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF 3660; AVX512BW-NEXT: kmovd %eax, %k4 3661; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 3662; AVX512BW-NEXT: kandw %k4, %k3, %k3 3663; AVX512BW-NEXT: kshiftrw $3, %k2, %k4 3664; AVX512BW-NEXT: korw %k4, %k3, %k3 3665; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF 3666; AVX512BW-NEXT: kmovd %eax, %k4 3667; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 3668; AVX512BW-NEXT: kandw %k4, %k3, %k3 3669; AVX512BW-NEXT: kshiftrw $2, %k2, %k2 3670; AVX512BW-NEXT: korw %k2, %k3, %k2 3671; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF 3672; AVX512BW-NEXT: kmovd %eax, %k3 3673; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 3674; AVX512BW-NEXT: kandw %k3, %k2, %k2 3675; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 3676; AVX512BW-NEXT: korw %k1, %k2, %k1 3677; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 3678; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 3679; AVX512BW-NEXT: kshiftrd $3, %k5, %k2 3680; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 3681; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 3682; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 3683; AVX512BW-NEXT: korw %k2, %k1, %k1 3684; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} 3685; AVX512BW-NEXT: kshiftrd $29, %k5, %k1 3686; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 3687; AVX512BW-NEXT: kshiftrd $28, %k5, %k1 3688; 
AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 3689; AVX512BW-NEXT: kandw %k6, %k1, %k3 3690; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 3691; AVX512BW-NEXT: kshiftrw $14, %k2, %k4 3692; AVX512BW-NEXT: korw %k4, %k3, %k3 3693; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 3694; AVX512BW-NEXT: kandw %k0, %k3, %k3 3695; AVX512BW-NEXT: kshiftrw $13, %k2, %k4 3696; AVX512BW-NEXT: korw %k4, %k3, %k3 3697; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 3698; AVX512BW-NEXT: kandw %k1, %k3, %k3 3699; AVX512BW-NEXT: kshiftrw $12, %k2, %k4 3700; AVX512BW-NEXT: korw %k4, %k3, %k3 3701; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 3702; AVX512BW-NEXT: kandw %k1, %k3, %k3 3703; AVX512BW-NEXT: kshiftrw $11, %k2, %k4 3704; AVX512BW-NEXT: korw %k4, %k3, %k3 3705; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 3706; AVX512BW-NEXT: kandw %k1, %k3, %k3 3707; AVX512BW-NEXT: kshiftrw $10, %k2, %k2 3708; AVX512BW-NEXT: korw %k2, %k3, %k2 3709; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 3710; AVX512BW-NEXT: kandw %k1, %k2, %k2 3711; AVX512BW-NEXT: kshiftrd $30, %k5, %k3 3712; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 3713; AVX512BW-NEXT: kshiftrw $9, %k3, %k4 3714; AVX512BW-NEXT: korw %k4, %k2, %k2 3715; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 3716; AVX512BW-NEXT: kandw %k1, %k2, %k2 3717; AVX512BW-NEXT: kshiftrw $8, %k3, %k4 3718; AVX512BW-NEXT: korw %k4, %k2, %k2 3719; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 3720; AVX512BW-NEXT: kandw %k1, %k2, %k2 3721; AVX512BW-NEXT: kshiftrw $7, %k3, %k4 3722; AVX512BW-NEXT: korw %k4, %k2, %k2 3723; AVX512BW-NEXT: kandw %k7, %k2, %k2 3724; AVX512BW-NEXT: kshiftrw $6, %k3, %k4 3725; AVX512BW-NEXT: korw %k4, %k2, %k2 3726; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 3727; AVX512BW-NEXT: kandw %k1, %k2, %k2 3728; AVX512BW-NEXT: kshiftrw $5, %k3, %k3 3729; AVX512BW-NEXT: korw %k3, %k2, %k2 3730; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 3731; AVX512BW-NEXT: kandw %k1, %k2, %k2 3732; AVX512BW-NEXT: kshiftrd $31, %k5, %k3 3733; AVX512BW-NEXT: kshiftlw $15, %k3, %k4 3734; AVX512BW-NEXT: kshiftrw $4, %k4, %k7 3735; AVX512BW-NEXT: korw %k7, %k2, %k2 3736; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 3737; AVX512BW-NEXT: kandw %k7, %k2, %k2 3738; AVX512BW-NEXT: kshiftrw $3, %k4, %k7 3739; AVX512BW-NEXT: korw %k7, %k2, %k2 3740; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 3741; AVX512BW-NEXT: kandw %k7, %k2, %k2 3742; AVX512BW-NEXT: kshiftrw $2, %k4, %k7 3743; AVX512BW-NEXT: korw %k7, %k2, %k2 3744; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 3745; AVX512BW-NEXT: kandw %k7, %k2, %k2 3746; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 3747; AVX512BW-NEXT: korw %k3, %k2, %k2 3748; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 3749; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 3750; AVX512BW-NEXT: korw %k4, %k2, %k2 3751; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm1 {%k2} {z} 3752; AVX512BW-NEXT: kshiftrd $25, %k5, %k2 3753; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 3754; AVX512BW-NEXT: kandw %k6, %k2, %k3 3755; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 3756; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 3757; AVX512BW-NEXT: kshiftrw $14, %k2, %k7 3758; AVX512BW-NEXT: korw %k7, %k3, %k3 3759; AVX512BW-NEXT: kandw %k0, %k3, %k3 3760; AVX512BW-NEXT: kshiftrd $26, %k5, 
%k7 3761; AVX512BW-NEXT: kshiftlw $15, %k7, %k7 3762; AVX512BW-NEXT: kshiftrw $13, %k7, %k6 3763; AVX512BW-NEXT: korw %k6, %k3, %k3 3764; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 3765; AVX512BW-NEXT: kandw %k0, %k3, %k3 3766; AVX512BW-NEXT: kshiftrw $12, %k7, %k6 3767; AVX512BW-NEXT: korw %k6, %k3, %k3 3768; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 3769; AVX512BW-NEXT: kandw %k2, %k3, %k3 3770; AVX512BW-NEXT: kshiftrw $11, %k7, %k6 3771; AVX512BW-NEXT: korw %k6, %k3, %k3 3772; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 3773; AVX512BW-NEXT: kandw %k2, %k3, %k3 3774; AVX512BW-NEXT: kshiftrw $10, %k7, %k6 3775; AVX512BW-NEXT: korw %k6, %k3, %k3 3776; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 3777; AVX512BW-NEXT: kandw %k0, %k3, %k3 3778; AVX512BW-NEXT: kshiftrw $9, %k7, %k6 3779; AVX512BW-NEXT: korw %k6, %k3, %k3 3780; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 3781; AVX512BW-NEXT: kandw %k2, %k3, %k3 3782; AVX512BW-NEXT: kshiftrd $27, %k5, %k6 3783; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 3784; AVX512BW-NEXT: kshiftrw $8, %k6, %k7 3785; AVX512BW-NEXT: korw %k7, %k3, %k3 3786; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 3787; AVX512BW-NEXT: kandw %k2, %k3, %k3 3788; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 3789; AVX512BW-NEXT: korw %k7, %k3, %k3 3790; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 3791; AVX512BW-NEXT: kandw %k4, %k3, %k3 3792; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 3793; AVX512BW-NEXT: korw %k7, %k3, %k3 3794; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 3795; AVX512BW-NEXT: kandw %k4, %k3, %k3 3796; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 3797; AVX512BW-NEXT: korw %k7, %k3, %k3 3798; AVX512BW-NEXT: kandw %k1, %k3, %k3 3799; AVX512BW-NEXT: kshiftrw $4, %k6, %k6 3800; AVX512BW-NEXT: korw %k6, %k3, %k3 3801; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 3802; AVX512BW-NEXT: kandw %k1, %k3, %k3 3803; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload 3804; AVX512BW-NEXT: kshiftlw $15, %k0, %k6 3805; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 3806; AVX512BW-NEXT: korw %k7, %k3, %k3 3807; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 3808; AVX512BW-NEXT: kandw %k1, %k3, %k3 3809; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 3810; AVX512BW-NEXT: korw %k7, %k3, %k3 3811; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 3812; AVX512BW-NEXT: kandw %k4, %k3, %k3 3813; AVX512BW-NEXT: kshiftlw $14, %k0, %k1 3814; AVX512BW-NEXT: korw %k1, %k3, %k1 3815; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 3816; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 3817; AVX512BW-NEXT: korw %k6, %k1, %k1 3818; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm2 {%k1} {z} 3819; AVX512BW-NEXT: kshiftrd $22, %k5, %k0 3820; AVX512BW-NEXT: kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 3821; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 3822; AVX512BW-NEXT: kandw %k1, %k0, %k6 3823; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 3824; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 3825; AVX512BW-NEXT: kshiftrw $14, %k0, %k7 3826; AVX512BW-NEXT: korw %k7, %k6, %k6 3827; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 3828; AVX512BW-NEXT: kandw %k1, %k6, %k6 3829; AVX512BW-NEXT: kshiftrw $13, %k0, %k7 3830; AVX512BW-NEXT: korw %k7, %k6, %k6 3831; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 3832; AVX512BW-NEXT: kandw %k0, %k6, 
%k6 3833; AVX512BW-NEXT: kshiftrd $23, %k5, %k7 3834; AVX512BW-NEXT: kmovq %k5, %k0 3835; AVX512BW-NEXT: kshiftlw $15, %k7, %k7 3836; AVX512BW-NEXT: kshiftrw $12, %k7, %k5 3837; AVX512BW-NEXT: korw %k5, %k6, %k5 3838; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 3839; AVX512BW-NEXT: kandw %k1, %k5, %k5 3840; AVX512BW-NEXT: kshiftrw $11, %k7, %k6 3841; AVX512BW-NEXT: korw %k6, %k5, %k5 3842; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 3843; AVX512BW-NEXT: kandw %k1, %k5, %k5 3844; AVX512BW-NEXT: kshiftrw $10, %k7, %k6 3845; AVX512BW-NEXT: korw %k6, %k5, %k5 3846; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 3847; AVX512BW-NEXT: kandw %k1, %k5, %k5 3848; AVX512BW-NEXT: kshiftrw $9, %k7, %k6 3849; AVX512BW-NEXT: korw %k6, %k5, %k5 3850; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 3851; AVX512BW-NEXT: kandw %k1, %k5, %k5 3852; AVX512BW-NEXT: kshiftrw $8, %k7, %k6 3853; AVX512BW-NEXT: korw %k6, %k5, %k5 3854; AVX512BW-NEXT: kandw %k2, %k5, %k5 3855; AVX512BW-NEXT: kshiftrd $24, %k0, %k6 3856; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 3857; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 3858; AVX512BW-NEXT: korw %k7, %k5, %k5 3859; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 3860; AVX512BW-NEXT: kandw %k1, %k5, %k5 3861; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 3862; AVX512BW-NEXT: korw %k7, %k5, %k5 3863; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 3864; AVX512BW-NEXT: kandw %k3, %k5, %k5 3865; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 3866; AVX512BW-NEXT: korw %k7, %k5, %k5 3867; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 3868; AVX512BW-NEXT: kandw %k2, %k5, %k5 3869; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 3870; AVX512BW-NEXT: korw %k7, %k5, %k5 3871; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 3872; AVX512BW-NEXT: kandw %k2, %k5, %k5 3873; AVX512BW-NEXT: kshiftrw $3, %k6, %k6 3874; AVX512BW-NEXT: korw %k6, %k5, %k5 3875; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 3876; AVX512BW-NEXT: kandw %k2, %k5, %k5 3877; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 3878; AVX512BW-NEXT: kshiftrw $2, %k7, %k6 3879; AVX512BW-NEXT: korw %k6, %k5, %k5 3880; AVX512BW-NEXT: kandw %k4, %k5, %k5 3881; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload 3882; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 3883; AVX512BW-NEXT: korw %k2, %k5, %k2 3884; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 3885; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 3886; AVX512BW-NEXT: korw %k7, %k2, %k2 3887; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm3 {%k2} {z} 3888; AVX512BW-NEXT: kshiftrd $19, %k0, %k2 3889; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 3890; AVX512BW-NEXT: kandw %k7, %k2, %k4 3891; AVX512BW-NEXT: kshiftlw $15, %k2, %k6 3892; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 3893; AVX512BW-NEXT: kshiftrw $14, %k6, %k5 3894; AVX512BW-NEXT: korw %k5, %k4, %k4 3895; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 3896; AVX512BW-NEXT: kandw %k2, %k4, %k4 3897; AVX512BW-NEXT: kshiftrw $13, %k6, %k5 3898; AVX512BW-NEXT: korw %k5, %k4, %k4 3899; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 3900; AVX512BW-NEXT: kandw %k2, %k4, %k4 3901; AVX512BW-NEXT: kshiftrw $12, %k6, %k5 3902; AVX512BW-NEXT: korw %k5, %k4, %k4 3903; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 3904; AVX512BW-NEXT: kandw %k2, %k4, %k4 3905; AVX512BW-NEXT: kshiftrd 
$20, %k0, %k5 3906; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 3907; AVX512BW-NEXT: kshiftrw $11, %k5, %k6 3908; AVX512BW-NEXT: korw %k6, %k4, %k4 3909; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 3910; AVX512BW-NEXT: kandw %k2, %k4, %k4 3911; AVX512BW-NEXT: kshiftrw $10, %k5, %k6 3912; AVX512BW-NEXT: korw %k6, %k4, %k4 3913; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 3914; AVX512BW-NEXT: kandw %k6, %k4, %k4 3915; AVX512BW-NEXT: kshiftrw $9, %k5, %k6 3916; AVX512BW-NEXT: korw %k6, %k4, %k4 3917; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 3918; AVX512BW-NEXT: kandw %k6, %k4, %k4 3919; AVX512BW-NEXT: kshiftrw $8, %k5, %k6 3920; AVX512BW-NEXT: korw %k6, %k4, %k4 3921; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 3922; AVX512BW-NEXT: kandw %k6, %k4, %k4 3923; AVX512BW-NEXT: kshiftrw $7, %k5, %k5 3924; AVX512BW-NEXT: korw %k5, %k4, %k4 3925; AVX512BW-NEXT: kandw %k1, %k4, %k4 3926; AVX512BW-NEXT: kshiftrd $21, %k0, %k5 3927; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 3928; AVX512BW-NEXT: kshiftrw $6, %k5, %k6 3929; AVX512BW-NEXT: korw %k6, %k4, %k4 3930; AVX512BW-NEXT: kandw %k3, %k4, %k4 3931; AVX512BW-NEXT: kshiftrw $5, %k5, %k6 3932; AVX512BW-NEXT: korw %k6, %k4, %k4 3933; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 3934; AVX512BW-NEXT: kandw %k1, %k4, %k4 3935; AVX512BW-NEXT: kshiftrw $4, %k5, %k6 3936; AVX512BW-NEXT: korw %k6, %k4, %k4 3937; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 3938; AVX512BW-NEXT: kandw %k1, %k4, %k4 3939; AVX512BW-NEXT: kshiftrw $3, %k5, %k6 3940; AVX512BW-NEXT: korw %k6, %k4, %k4 3941; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 3942; AVX512BW-NEXT: kandw %k1, %k4, %k4 3943; AVX512BW-NEXT: kshiftrw $2, %k5, %k5 3944; AVX512BW-NEXT: korw %k5, %k4, %k4 3945; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 3946; AVX512BW-NEXT: kandw %k1, %k4, %k4 3947; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload 3948; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 3949; AVX512BW-NEXT: korw %k1, %k4, %k1 3950; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 3951; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 3952; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 3953; AVX512BW-NEXT: korw %k3, %k1, %k1 3954; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm4 {%k1} {z} 3955; AVX512BW-NEXT: kshiftrd $16, %k0, %k1 3956; AVX512BW-NEXT: kandw %k7, %k1, %k3 3957; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 3958; AVX512BW-NEXT: kshiftrw $14, %k1, %k4 3959; AVX512BW-NEXT: korw %k4, %k3, %k3 3960; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 3961; AVX512BW-NEXT: kandw %k6, %k3, %k3 3962; AVX512BW-NEXT: kshiftrw $13, %k1, %k4 3963; AVX512BW-NEXT: korw %k4, %k3, %k3 3964; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 3965; AVX512BW-NEXT: kandw %k7, %k3, %k3 3966; AVX512BW-NEXT: kshiftrw $12, %k1, %k4 3967; AVX512BW-NEXT: korw %k4, %k3, %k3 3968; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 3969; AVX512BW-NEXT: kandw %k4, %k3, %k3 3970; AVX512BW-NEXT: kshiftrw $11, %k1, %k1 3971; AVX512BW-NEXT: korw %k1, %k3, %k1 3972; AVX512BW-NEXT: kandw %k2, %k1, %k1 3973; AVX512BW-NEXT: kshiftrd $17, %k0, %k3 3974; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 3975; AVX512BW-NEXT: kshiftrw $10, %k3, %k4 3976; AVX512BW-NEXT: korw %k4, %k1, %k1 3977; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 3978; AVX512BW-NEXT: kandw %k2, %k1, %k1 3979; AVX512BW-NEXT: kshiftrw $9, 
%k3, %k4 3980; AVX512BW-NEXT: korw %k4, %k1, %k1 3981; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 3982; AVX512BW-NEXT: kandw %k2, %k1, %k1 3983; AVX512BW-NEXT: kshiftrw $8, %k3, %k4 3984; AVX512BW-NEXT: korw %k4, %k1, %k1 3985; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 3986; AVX512BW-NEXT: kandw %k2, %k1, %k1 3987; AVX512BW-NEXT: kshiftrw $7, %k3, %k4 3988; AVX512BW-NEXT: korw %k4, %k1, %k1 3989; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 3990; AVX512BW-NEXT: kandw %k2, %k1, %k1 3991; AVX512BW-NEXT: kshiftrw $6, %k3, %k3 3992; AVX512BW-NEXT: korw %k3, %k1, %k1 3993; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 3994; AVX512BW-NEXT: kandw %k2, %k1, %k1 3995; AVX512BW-NEXT: kshiftrd $18, %k0, %k3 3996; AVX512BW-NEXT: kshiftlw $15, %k3, %k4 3997; AVX512BW-NEXT: kshiftrw $5, %k4, %k5 3998; AVX512BW-NEXT: korw %k5, %k1, %k1 3999; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 4000; AVX512BW-NEXT: kandw %k2, %k1, %k1 4001; AVX512BW-NEXT: kshiftrw $4, %k4, %k5 4002; AVX512BW-NEXT: korw %k5, %k1, %k1 4003; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 4004; AVX512BW-NEXT: kandw %k2, %k1, %k1 4005; AVX512BW-NEXT: kshiftrw $3, %k4, %k5 4006; AVX512BW-NEXT: korw %k5, %k1, %k1 4007; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 4008; AVX512BW-NEXT: kandw %k2, %k1, %k1 4009; AVX512BW-NEXT: kshiftrw $2, %k4, %k4 4010; AVX512BW-NEXT: korw %k4, %k1, %k1 4011; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 4012; AVX512BW-NEXT: kandw %k2, %k1, %k1 4013; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 4014; AVX512BW-NEXT: korw %k3, %k1, %k1 4015; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 4016; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 4017; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 4018; AVX512BW-NEXT: korw %k2, %k1, %k1 4019; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z} 4020; AVX512BW-NEXT: kshiftrd $13, %k0, %k1 4021; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 4022; AVX512BW-NEXT: kshiftrd $12, %k0, %k3 4023; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 4024; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 4025; AVX512BW-NEXT: kandw %k2, %k3, %k2 4026; AVX512BW-NEXT: kshiftrw $14, %k1, %k4 4027; AVX512BW-NEXT: korw %k4, %k2, %k2 4028; AVX512BW-NEXT: kandw %k6, %k2, %k2 4029; AVX512BW-NEXT: kshiftrw $13, %k1, %k4 4030; AVX512BW-NEXT: korw %k4, %k2, %k2 4031; AVX512BW-NEXT: kandw %k7, %k2, %k2 4032; AVX512BW-NEXT: kshiftrw $12, %k1, %k4 4033; AVX512BW-NEXT: korw %k4, %k2, %k2 4034; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 4035; AVX512BW-NEXT: kandw %k7, %k2, %k2 4036; AVX512BW-NEXT: kshiftrw $11, %k1, %k4 4037; AVX512BW-NEXT: korw %k4, %k2, %k2 4038; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 4039; AVX512BW-NEXT: kandw %k3, %k2, %k2 4040; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 4041; AVX512BW-NEXT: korw %k1, %k2, %k1 4042; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 4043; AVX512BW-NEXT: kandw %k2, %k1, %k1 4044; AVX512BW-NEXT: kshiftrd $14, %k0, %k2 4045; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 4046; AVX512BW-NEXT: kshiftrw $9, %k2, %k4 4047; AVX512BW-NEXT: korw %k4, %k1, %k1 4048; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 4049; AVX512BW-NEXT: kandw %k3, %k1, %k1 4050; AVX512BW-NEXT: kshiftrw $8, %k2, %k4 4051; AVX512BW-NEXT: korw %k4, %k1, %k1 4052; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), 
%k3 # 2-byte Reload 4053; AVX512BW-NEXT: kandw %k3, %k1, %k1 4054; AVX512BW-NEXT: kshiftrw $7, %k2, %k4 4055; AVX512BW-NEXT: korw %k4, %k1, %k1 4056; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 4057; AVX512BW-NEXT: kandw %k3, %k1, %k1 4058; AVX512BW-NEXT: kshiftrw $6, %k2, %k4 4059; AVX512BW-NEXT: korw %k4, %k1, %k1 4060; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 4061; AVX512BW-NEXT: kandw %k3, %k1, %k1 4062; AVX512BW-NEXT: kshiftrw $5, %k2, %k2 4063; AVX512BW-NEXT: korw %k2, %k1, %k1 4064; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 4065; AVX512BW-NEXT: kandw %k2, %k1, %k1 4066; AVX512BW-NEXT: kshiftrd $15, %k0, %k2 4067; AVX512BW-NEXT: kshiftlw $15, %k2, %k4 4068; AVX512BW-NEXT: kshiftrw $4, %k4, %k5 4069; AVX512BW-NEXT: korw %k5, %k1, %k1 4070; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 4071; AVX512BW-NEXT: kandw %k3, %k1, %k1 4072; AVX512BW-NEXT: kshiftrw $3, %k4, %k5 4073; AVX512BW-NEXT: korw %k5, %k1, %k1 4074; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 4075; AVX512BW-NEXT: kandw %k5, %k1, %k1 4076; AVX512BW-NEXT: kshiftrw $2, %k4, %k5 4077; AVX512BW-NEXT: korw %k5, %k1, %k1 4078; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 4079; AVX512BW-NEXT: kandw %k5, %k1, %k1 4080; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 4081; AVX512BW-NEXT: korw %k2, %k1, %k1 4082; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 4083; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 4084; AVX512BW-NEXT: korw %k4, %k1, %k1 4085; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm6 {%k1} {z} 4086; AVX512BW-NEXT: kshiftrd $9, %k0, %k2 4087; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 4088; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 4089; AVX512BW-NEXT: kandw %k1, %k2, %k4 4090; AVX512BW-NEXT: kshiftlw $15, %k2, %k1 4091; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 4092; AVX512BW-NEXT: kshiftrw $14, %k1, %k5 4093; AVX512BW-NEXT: korw %k5, %k4, %k4 4094; AVX512BW-NEXT: kandw %k6, %k4, %k4 4095; AVX512BW-NEXT: kshiftrd $10, %k0, %k5 4096; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 4097; AVX512BW-NEXT: kshiftrw $13, %k5, %k6 4098; AVX512BW-NEXT: korw %k6, %k4, %k4 4099; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 4100; AVX512BW-NEXT: kandw %k1, %k4, %k4 4101; AVX512BW-NEXT: kshiftrw $12, %k5, %k6 4102; AVX512BW-NEXT: korw %k6, %k4, %k4 4103; AVX512BW-NEXT: kandw %k7, %k4, %k4 4104; AVX512BW-NEXT: kshiftrw $11, %k5, %k6 4105; AVX512BW-NEXT: korw %k6, %k4, %k4 4106; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 4107; AVX512BW-NEXT: kandw %k1, %k4, %k4 4108; AVX512BW-NEXT: kshiftrw $10, %k5, %k6 4109; AVX512BW-NEXT: korw %k6, %k4, %k4 4110; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 4111; AVX512BW-NEXT: kandw %k1, %k4, %k4 4112; AVX512BW-NEXT: kshiftrw $9, %k5, %k5 4113; AVX512BW-NEXT: korw %k5, %k4, %k4 4114; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 4115; AVX512BW-NEXT: kandw %k1, %k4, %k4 4116; AVX512BW-NEXT: kshiftrd $11, %k0, %k5 4117; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 4118; AVX512BW-NEXT: kshiftrw $8, %k5, %k6 4119; AVX512BW-NEXT: korw %k6, %k4, %k4 4120; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 4121; AVX512BW-NEXT: kandw %k1, %k4, %k4 4122; AVX512BW-NEXT: kshiftrw $7, %k5, %k6 4123; AVX512BW-NEXT: korw %k6, %k4, %k4 4124; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 4125; AVX512BW-NEXT: kandw %k1, %k4, %k4 
4126; AVX512BW-NEXT: kshiftrw $6, %k5, %k6 4127; AVX512BW-NEXT: korw %k6, %k4, %k4 4128; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 4129; AVX512BW-NEXT: kandw %k1, %k4, %k4 4130; AVX512BW-NEXT: kshiftrw $5, %k5, %k6 4131; AVX512BW-NEXT: korw %k6, %k4, %k4 4132; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 4133; AVX512BW-NEXT: kandw %k2, %k4, %k4 4134; AVX512BW-NEXT: kshiftrw $4, %k5, %k5 4135; AVX512BW-NEXT: korw %k5, %k4, %k4 4136; AVX512BW-NEXT: kandw %k3, %k4, %k4 4137; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 4-byte Reload 4138; AVX512BW-NEXT: kshiftlw $15, %k7, %k5 4139; AVX512BW-NEXT: kshiftrw $3, %k5, %k6 4140; AVX512BW-NEXT: korw %k6, %k4, %k4 4141; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 4142; AVX512BW-NEXT: kandw %k1, %k4, %k4 4143; AVX512BW-NEXT: kshiftrw $2, %k5, %k6 4144; AVX512BW-NEXT: korw %k6, %k4, %k4 4145; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 4146; AVX512BW-NEXT: kandw %k3, %k4, %k4 4147; AVX512BW-NEXT: kshiftlw $14, %k7, %k3 4148; AVX512BW-NEXT: korw %k3, %k4, %k3 4149; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 4150; AVX512BW-NEXT: kshiftrw $1, %k3, %k3 4151; AVX512BW-NEXT: korw %k5, %k3, %k3 4152; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm7 {%k3} {z} 4153; AVX512BW-NEXT: kshiftrd $6, %k0, %k4 4154; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 4155; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 4156; AVX512BW-NEXT: kandw %k3, %k4, %k5 4157; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 4158; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 4159; AVX512BW-NEXT: kshiftrw $14, %k4, %k6 4160; AVX512BW-NEXT: korw %k6, %k5, %k5 4161; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 4162; AVX512BW-NEXT: kandw %k3, %k5, %k5 4163; AVX512BW-NEXT: kshiftrw $13, %k4, %k6 4164; AVX512BW-NEXT: korw %k6, %k5, %k5 4165; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 4166; AVX512BW-NEXT: kandw %k3, %k5, %k5 4167; AVX512BW-NEXT: kshiftrd $7, %k0, %k6 4168; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 4169; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 4170; AVX512BW-NEXT: korw %k7, %k5, %k5 4171; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 4172; AVX512BW-NEXT: kandw %k3, %k5, %k5 4173; AVX512BW-NEXT: kshiftrw $11, %k6, %k7 4174; AVX512BW-NEXT: korw %k7, %k5, %k5 4175; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 4176; AVX512BW-NEXT: kandw %k3, %k5, %k5 4177; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 4178; AVX512BW-NEXT: korw %k7, %k5, %k5 4179; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 4180; AVX512BW-NEXT: kandw %k4, %k5, %k5 4181; AVX512BW-NEXT: kshiftrw $9, %k6, %k7 4182; AVX512BW-NEXT: korw %k7, %k5, %k5 4183; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 4184; AVX512BW-NEXT: kandw %k3, %k5, %k5 4185; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 4186; AVX512BW-NEXT: korw %k6, %k5, %k5 4187; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 4188; AVX512BW-NEXT: kandw %k6, %k5, %k5 4189; AVX512BW-NEXT: kshiftrd $8, %k0, %k6 4190; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 4191; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 4192; AVX512BW-NEXT: korw %k7, %k5, %k5 4193; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 4194; AVX512BW-NEXT: kandw %k7, %k5, %k5 4195; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 4196; AVX512BW-NEXT: korw %k7, %k5, %k5 4197; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 
4198; AVX512BW-NEXT: kandw %k7, %k5, %k5 4199; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 4200; AVX512BW-NEXT: korw %k7, %k5, %k5 4201; AVX512BW-NEXT: kandw %k2, %k5, %k5 4202; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 4203; AVX512BW-NEXT: korw %k7, %k5, %k5 4204; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 4205; AVX512BW-NEXT: kandw %k7, %k5, %k5 4206; AVX512BW-NEXT: kshiftrw $3, %k6, %k6 4207; AVX512BW-NEXT: korw %k6, %k5, %k5 4208; AVX512BW-NEXT: kandw %k1, %k5, %k5 4209; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 4210; AVX512BW-NEXT: kshiftrw $2, %k1, %k6 4211; AVX512BW-NEXT: korw %k6, %k5, %k5 4212; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 4213; AVX512BW-NEXT: kandw %k6, %k5, %k5 4214; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload 4215; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 4216; AVX512BW-NEXT: korw %k2, %k5, %k2 4217; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 4218; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 4219; AVX512BW-NEXT: korw %k1, %k2, %k1 4220; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm8 {%k1} {z} 4221; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload 4222; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 4223; AVX512BW-NEXT: kandw %k2, %k1, %k1 4224; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 4225; AVX512BW-NEXT: kshiftrw $14, %k5, %k2 4226; AVX512BW-NEXT: korw %k2, %k1, %k1 4227; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 4228; AVX512BW-NEXT: kandw %k2, %k1, %k1 4229; AVX512BW-NEXT: kshiftrw $13, %k5, %k2 4230; AVX512BW-NEXT: korw %k2, %k1, %k1 4231; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 4232; AVX512BW-NEXT: kandw %k2, %k1, %k1 4233; AVX512BW-NEXT: kshiftrw $12, %k5, %k2 4234; AVX512BW-NEXT: korw %k2, %k1, %k1 4235; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 4236; AVX512BW-NEXT: kandw %k2, %k1, %k1 4237; AVX512BW-NEXT: kshiftrd $4, %k0, %k2 4238; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 4239; AVX512BW-NEXT: kshiftrw $11, %k2, %k5 4240; AVX512BW-NEXT: korw %k5, %k1, %k1 4241; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 4242; AVX512BW-NEXT: kandw %k5, %k1, %k1 4243; AVX512BW-NEXT: kshiftrw $10, %k2, %k5 4244; AVX512BW-NEXT: korw %k5, %k1, %k1 4245; AVX512BW-NEXT: kandw %k4, %k1, %k1 4246; AVX512BW-NEXT: kshiftrw $9, %k2, %k5 4247; AVX512BW-NEXT: korw %k5, %k1, %k1 4248; AVX512BW-NEXT: kandw %k3, %k1, %k1 4249; AVX512BW-NEXT: kshiftrw $8, %k2, %k5 4250; AVX512BW-NEXT: korw %k5, %k1, %k1 4251; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 4252; AVX512BW-NEXT: kandw %k3, %k1, %k1 4253; AVX512BW-NEXT: kshiftrw $7, %k2, %k2 4254; AVX512BW-NEXT: korw %k2, %k1, %k1 4255; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 4256; AVX512BW-NEXT: kandw %k2, %k1, %k1 4257; AVX512BW-NEXT: kshiftrd $5, %k0, %k0 4258; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 4259; AVX512BW-NEXT: kshiftrw $6, %k0, %k2 4260; AVX512BW-NEXT: korw %k2, %k1, %k1 4261; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 4262; AVX512BW-NEXT: kandw %k2, %k1, %k1 4263; AVX512BW-NEXT: kshiftrw $5, %k0, %k2 4264; AVX512BW-NEXT: korw %k2, %k1, %k1 4265; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 4266; AVX512BW-NEXT: kandw %k2, %k1, %k1 4267; AVX512BW-NEXT: kshiftrw $4, %k0, %k2 4268; AVX512BW-NEXT: korw %k2, %k1, %k1 4269; AVX512BW-NEXT: kandw %k7, %k1, %k1 4270; AVX512BW-NEXT: kshiftrw $3, %k0, %k2 4271; AVX512BW-NEXT: korw 
%k2, %k1, %k1 4272; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 4273; AVX512BW-NEXT: kandw %k2, %k1, %k1 4274; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 4275; AVX512BW-NEXT: korw %k0, %k1, %k0 4276; AVX512BW-NEXT: kandw %k6, %k0, %k0 4277; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload 4278; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 4279; AVX512BW-NEXT: korw %k1, %k0, %k0 4280; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 4281; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 4282; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 4283; AVX512BW-NEXT: korw %k1, %k0, %k1 4284; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm9 {%k1} {z} 4285; AVX512BW-NEXT: vmovdqa64 %zmm9, 64(%rdx) 4286; AVX512BW-NEXT: vmovdqa64 %zmm8, 128(%rdx) 4287; AVX512BW-NEXT: vmovdqa64 %zmm7, 192(%rdx) 4288; AVX512BW-NEXT: vmovdqa64 %zmm6, 256(%rdx) 4289; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rdx) 4290; AVX512BW-NEXT: vmovdqa64 %zmm4, 384(%rdx) 4291; AVX512BW-NEXT: vmovdqa64 %zmm3, 448(%rdx) 4292; AVX512BW-NEXT: vmovdqa64 %zmm2, 512(%rdx) 4293; AVX512BW-NEXT: vmovdqa64 %zmm1, 576(%rdx) 4294; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) 4295; AVX512BW-NEXT: vzeroupper 4296; AVX512BW-NEXT: retq 4297 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 4298 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 4299 %tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <160 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31> 4300 %data = call <160 x i32> @llvm.masked.load.v160i32.p0(ptr %in.vec, i32 64, <160 x i1> %tgt.mask, <160 x i32> poison) 4301 store <160 x i32> %data, ptr %out.vec, align 64 4302 ret void 4303} 4304 4305define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { 4306; AVX512F-ONLY-LABEL: mask_replication_factor5_vf64: 4307; AVX512F-ONLY: # %bb.0: 4308; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 4309; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 4310; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] 4311; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm1 4312; 
AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 4313; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 4314; AVX512F-ONLY-NEXT: movw $1, %ax 4315; AVX512F-ONLY-NEXT: kmovw %eax, %k1 4316; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} 4317; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1 4318; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm4 {%k1} {z} = -1 4319; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k1 4320; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm5 {%k1} {z} = -1 4321; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 4322; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm6 {%k1} {z} = -1 4323; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 4324; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm7 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] 4325; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm7, %zmm1 4326; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm8 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] 4327; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm8, %zmm2 4328; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm9 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] 4329; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm9, %zmm10 4330; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm11 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] 4331; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm11, %zmm12 4332; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm3, %zmm4 4333; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm7, %zmm13 4334; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm8, %zmm14 4335; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm9, %zmm15 4336; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm11, %zmm16 4337; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm3, %zmm5 4338; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm7, %zmm17 4339; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm8, %zmm18 4340; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm9, %zmm19 4341; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm3, %zmm3 4342; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm11, %zmm6 4343; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm7 4344; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm8, %zmm8 4345; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm9 4346; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm0 4347; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm11 {%k1} {z} 4348; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 4349; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} 4350; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1 4351; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm9 {%k1} {z} 4352; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 4353; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z} 4354; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 4355; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm7 {%k1} {z} 4356; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 4357; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm3 {%k1} {z} 4358; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 4359; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} 4360; AVX512F-ONLY-NEXT: vptestmd %zmm19, %zmm19, %k1 4361; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm19 {%k1} {z} 4362; AVX512F-ONLY-NEXT: vptestmd %zmm18, %zmm18, %k1 4363; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm18 {%k1} {z} 4364; AVX512F-ONLY-NEXT: vptestmd %zmm17, %zmm17, %k1 4365; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm17 {%k1} {z} 4366; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 4367; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm5 {%k1} {z} 4368; AVX512F-ONLY-NEXT: vptestmd %zmm16, %zmm16, %k1 4369; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm16 {%k1} {z} 4370; AVX512F-ONLY-NEXT: vptestmd %zmm15, %zmm15, %k1 4371; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm15 {%k1} {z} 4372; AVX512F-ONLY-NEXT: vptestmd %zmm14, %zmm14, %k1 4373; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm14 {%k1} {z} 
4374; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k1 4375; AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm13 {%k1} {z} 4376; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 4377; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm4 {%k1} {z} 4378; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1 4379; AVX512F-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm12 {%k1} {z} 4380; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1 4381; AVX512F-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm10 {%k1} {z} 4382; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 4383; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm2 {%k1} {z} 4384; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 4385; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm1 {%k1} {z} 4386; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 1216(%rdx) 4387; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 1152(%rdx) 4388; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 1088(%rdx) 4389; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 1024(%rdx) 4390; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 960(%rdx) 4391; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 896(%rdx) 4392; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 832(%rdx) 4393; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, 768(%rdx) 4394; AVX512F-ONLY-NEXT: vmovdqa64 %zmm16, 704(%rdx) 4395; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 640(%rdx) 4396; AVX512F-ONLY-NEXT: vmovdqa64 %zmm17, 576(%rdx) 4397; AVX512F-ONLY-NEXT: vmovdqa64 %zmm18, 512(%rdx) 4398; AVX512F-ONLY-NEXT: vmovdqa64 %zmm19, 448(%rdx) 4399; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) 4400; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 320(%rdx) 4401; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 256(%rdx) 4402; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 192(%rdx) 4403; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 128(%rdx) 4404; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) 4405; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, (%rdx) 4406; AVX512F-ONLY-NEXT: vzeroupper 4407; AVX512F-ONLY-NEXT: retq 4408; 4409; AVX512DQ-LABEL: mask_replication_factor5_vf64: 4410; AVX512DQ: # %bb.0: 4411; AVX512DQ-NEXT: kmovw (%rdi), %k0 4412; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 4413; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] 4414; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm1 4415; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 4416; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 4417; AVX512DQ-NEXT: movw $1, %ax 4418; AVX512DQ-NEXT: kmovw %eax, %k1 4419; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} 4420; AVX512DQ-NEXT: kmovw 6(%rdi), %k0 4421; AVX512DQ-NEXT: vpmovm2d %k0, %zmm4 4422; AVX512DQ-NEXT: kmovw 4(%rdi), %k0 4423; AVX512DQ-NEXT: vpmovm2d %k0, %zmm5 4424; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 4425; AVX512DQ-NEXT: vpmovm2d %k0, %zmm6 4426; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 4427; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] 4428; AVX512DQ-NEXT: vpermd %zmm4, %zmm7, %zmm1 4429; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] 4430; AVX512DQ-NEXT: vpermd %zmm4, %zmm8, %zmm2 4431; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] 4432; AVX512DQ-NEXT: vpermd %zmm4, %zmm9, %zmm10 4433; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] 4434; AVX512DQ-NEXT: vpermd %zmm4, %zmm11, %zmm12 4435; AVX512DQ-NEXT: vpermd %zmm4, %zmm3, %zmm4 4436; AVX512DQ-NEXT: vpermd %zmm5, %zmm7, %zmm13 4437; AVX512DQ-NEXT: vpermd %zmm5, %zmm8, %zmm14 4438; AVX512DQ-NEXT: vpermd %zmm5, %zmm9, %zmm15 4439; AVX512DQ-NEXT: vpermd %zmm5, %zmm11, %zmm16 4440; AVX512DQ-NEXT: vpermd %zmm5, %zmm3, %zmm5 4441; AVX512DQ-NEXT: vpermd %zmm6, %zmm7, %zmm17 4442; AVX512DQ-NEXT: vpermd %zmm6, %zmm8, %zmm18 4443; AVX512DQ-NEXT: 
vpermd %zmm6, %zmm9, %zmm19 4444; AVX512DQ-NEXT: vpermd %zmm6, %zmm3, %zmm3 4445; AVX512DQ-NEXT: vpermd %zmm6, %zmm11, %zmm6 4446; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm7 4447; AVX512DQ-NEXT: vpermd %zmm0, %zmm8, %zmm8 4448; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm9 4449; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm0 4450; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm11 {%k1} {z} 4451; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 4452; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} 4453; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1 4454; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm9 {%k1} {z} 4455; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 4456; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z} 4457; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 4458; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm7 {%k1} {z} 4459; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 4460; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm3 {%k1} {z} 4461; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 4462; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} 4463; AVX512DQ-NEXT: vpmovd2m %zmm19, %k1 4464; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm19 {%k1} {z} 4465; AVX512DQ-NEXT: vpmovd2m %zmm18, %k1 4466; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm18 {%k1} {z} 4467; AVX512DQ-NEXT: vpmovd2m %zmm17, %k1 4468; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm17 {%k1} {z} 4469; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 4470; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm5 {%k1} {z} 4471; AVX512DQ-NEXT: vpmovd2m %zmm16, %k1 4472; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm16 {%k1} {z} 4473; AVX512DQ-NEXT: vpmovd2m %zmm15, %k1 4474; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm15 {%k1} {z} 4475; AVX512DQ-NEXT: vpmovd2m %zmm14, %k1 4476; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm14 {%k1} {z} 4477; AVX512DQ-NEXT: vpmovd2m %zmm13, %k1 4478; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm13 {%k1} {z} 4479; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 4480; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm4 {%k1} {z} 4481; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1 4482; AVX512DQ-NEXT: vmovdqa32 1024(%rsi), %zmm12 {%k1} {z} 4483; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1 4484; AVX512DQ-NEXT: vmovdqa32 1088(%rsi), %zmm10 {%k1} {z} 4485; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 4486; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm2 {%k1} {z} 4487; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 4488; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm1 {%k1} {z} 4489; AVX512DQ-NEXT: vmovdqa64 %zmm1, 1216(%rdx) 4490; AVX512DQ-NEXT: vmovdqa64 %zmm2, 1152(%rdx) 4491; AVX512DQ-NEXT: vmovdqa64 %zmm10, 1088(%rdx) 4492; AVX512DQ-NEXT: vmovdqa64 %zmm12, 1024(%rdx) 4493; AVX512DQ-NEXT: vmovdqa64 %zmm4, 960(%rdx) 4494; AVX512DQ-NEXT: vmovdqa64 %zmm13, 896(%rdx) 4495; AVX512DQ-NEXT: vmovdqa64 %zmm14, 832(%rdx) 4496; AVX512DQ-NEXT: vmovdqa64 %zmm15, 768(%rdx) 4497; AVX512DQ-NEXT: vmovdqa64 %zmm16, 704(%rdx) 4498; AVX512DQ-NEXT: vmovdqa64 %zmm5, 640(%rdx) 4499; AVX512DQ-NEXT: vmovdqa64 %zmm17, 576(%rdx) 4500; AVX512DQ-NEXT: vmovdqa64 %zmm18, 512(%rdx) 4501; AVX512DQ-NEXT: vmovdqa64 %zmm19, 448(%rdx) 4502; AVX512DQ-NEXT: vmovdqa64 %zmm6, 384(%rdx) 4503; AVX512DQ-NEXT: vmovdqa64 %zmm3, 320(%rdx) 4504; AVX512DQ-NEXT: vmovdqa64 %zmm7, 256(%rdx) 4505; AVX512DQ-NEXT: vmovdqa64 %zmm8, 192(%rdx) 4506; AVX512DQ-NEXT: vmovdqa64 %zmm9, 128(%rdx) 4507; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx) 4508; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%rdx) 4509; AVX512DQ-NEXT: vzeroupper 4510; AVX512DQ-NEXT: retq 4511; 4512; AVX512BW-LABEL: mask_replication_factor5_vf64: 4513; AVX512BW: # %bb.0: 4514; AVX512BW-NEXT: kmovq (%rdi), %k5 4515; AVX512BW-NEXT: kshiftrq $1, %k5, %k0 4516; AVX512BW-NEXT: movw $-3, %ax 4517; AVX512BW-NEXT: kmovd %eax, %k1 4518; 
AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 4519; AVX512BW-NEXT: kmovw (%rdi), %k2 4520; AVX512BW-NEXT: kandw %k1, %k2, %k3 4521; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 4522; AVX512BW-NEXT: kshiftrw $14, %k2, %k4 4523; AVX512BW-NEXT: korw %k4, %k3, %k3 4524; AVX512BW-NEXT: movw $-5, %ax 4525; AVX512BW-NEXT: kmovd %eax, %k1 4526; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 4527; AVX512BW-NEXT: kandw %k1, %k3, %k3 4528; AVX512BW-NEXT: kshiftrw $13, %k2, %k4 4529; AVX512BW-NEXT: korw %k4, %k3, %k3 4530; AVX512BW-NEXT: movw $-9, %ax 4531; AVX512BW-NEXT: kmovd %eax, %k1 4532; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 4533; AVX512BW-NEXT: kandw %k1, %k3, %k3 4534; AVX512BW-NEXT: kshiftrw $12, %k2, %k4 4535; AVX512BW-NEXT: korw %k4, %k3, %k3 4536; AVX512BW-NEXT: movw $-17, %ax 4537; AVX512BW-NEXT: kmovd %eax, %k1 4538; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 4539; AVX512BW-NEXT: kandw %k1, %k3, %k3 4540; AVX512BW-NEXT: kshiftrw $11, %k2, %k2 4541; AVX512BW-NEXT: korw %k2, %k3, %k2 4542; AVX512BW-NEXT: movw $-33, %ax 4543; AVX512BW-NEXT: kmovd %eax, %k1 4544; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 4545; AVX512BW-NEXT: kandw %k1, %k2, %k2 4546; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 4547; AVX512BW-NEXT: kshiftrw $10, %k0, %k3 4548; AVX512BW-NEXT: korw %k3, %k2, %k2 4549; AVX512BW-NEXT: movw $-65, %ax 4550; AVX512BW-NEXT: kmovd %eax, %k1 4551; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 4552; AVX512BW-NEXT: kandw %k1, %k2, %k2 4553; AVX512BW-NEXT: kshiftrw $9, %k0, %k3 4554; AVX512BW-NEXT: korw %k3, %k2, %k2 4555; AVX512BW-NEXT: movw $-129, %ax 4556; AVX512BW-NEXT: kmovd %eax, %k1 4557; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 4558; AVX512BW-NEXT: kandw %k1, %k2, %k2 4559; AVX512BW-NEXT: kshiftrw $8, %k0, %k3 4560; AVX512BW-NEXT: korw %k3, %k2, %k2 4561; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF 4562; AVX512BW-NEXT: kmovd %eax, %k1 4563; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 4564; AVX512BW-NEXT: kandw %k1, %k2, %k2 4565; AVX512BW-NEXT: kshiftrw $7, %k0, %k3 4566; AVX512BW-NEXT: korw %k3, %k2, %k2 4567; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF 4568; AVX512BW-NEXT: kmovd %eax, %k1 4569; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 4570; AVX512BW-NEXT: kandw %k1, %k2, %k2 4571; AVX512BW-NEXT: kshiftrw $6, %k0, %k0 4572; AVX512BW-NEXT: korw %k0, %k2, %k0 4573; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF 4574; AVX512BW-NEXT: kmovd %eax, %k1 4575; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 4576; AVX512BW-NEXT: kandw %k1, %k0, %k3 4577; AVX512BW-NEXT: kshiftrq $2, %k5, %k0 4578; AVX512BW-NEXT: kshiftlw $15, %k0, %k2 4579; AVX512BW-NEXT: kshiftrw $5, %k2, %k4 4580; AVX512BW-NEXT: korw %k4, %k3, %k3 4581; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF 4582; AVX512BW-NEXT: kmovd %eax, %k1 4583; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 4584; AVX512BW-NEXT: kandw %k1, %k3, %k3 4585; AVX512BW-NEXT: kshiftrw $4, %k2, %k4 4586; AVX512BW-NEXT: korw %k4, %k3, %k3 4587; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF 4588; AVX512BW-NEXT: kmovd %eax, %k1 4589; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 4590; AVX512BW-NEXT: kandw %k1, %k3, %k3 4591; AVX512BW-NEXT: kshiftrw $3, %k2, %k7 4592; AVX512BW-NEXT: korw %k7, %k3, %k7 4593; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF 4594; AVX512BW-NEXT: kmovd %eax, %k1 4595; 
AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 4596; AVX512BW-NEXT: kandw %k1, %k7, %k7 4597; AVX512BW-NEXT: kshiftrw $2, %k2, %k2 4598; AVX512BW-NEXT: korw %k2, %k7, %k7 4599; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF 4600; AVX512BW-NEXT: kmovd %eax, %k6 4601; AVX512BW-NEXT: kandw %k6, %k7, %k7 4602; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 4603; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 4604; AVX512BW-NEXT: korw %k0, %k7, %k0 4605; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 4606; AVX512BW-NEXT: kshiftrw $1, %k0, %k1 4607; AVX512BW-NEXT: kshiftrq $3, %k5, %k7 4608; AVX512BW-NEXT: kshiftlw $15, %k7, %k0 4609; AVX512BW-NEXT: korw %k0, %k1, %k1 4610; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} 4611; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 4612; AVX512BW-NEXT: kandw %k2, %k7, %k1 4613; AVX512BW-NEXT: kshiftrw $14, %k0, %k7 4614; AVX512BW-NEXT: korw %k7, %k1, %k1 4615; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 4616; AVX512BW-NEXT: kandw %k3, %k1, %k1 4617; AVX512BW-NEXT: kshiftrw $13, %k0, %k7 4618; AVX512BW-NEXT: korw %k7, %k1, %k1 4619; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 4620; AVX512BW-NEXT: kandw %k3, %k1, %k1 4621; AVX512BW-NEXT: kshiftrw $12, %k0, %k0 4622; AVX512BW-NEXT: korw %k0, %k1, %k0 4623; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 4624; AVX512BW-NEXT: kandw %k1, %k0, %k0 4625; AVX512BW-NEXT: kshiftrq $4, %k5, %k1 4626; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 4627; AVX512BW-NEXT: kshiftrw $11, %k1, %k7 4628; AVX512BW-NEXT: korw %k7, %k0, %k0 4629; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 4630; AVX512BW-NEXT: kandw %k3, %k0, %k0 4631; AVX512BW-NEXT: kshiftrw $10, %k1, %k7 4632; AVX512BW-NEXT: korw %k7, %k0, %k0 4633; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 4634; AVX512BW-NEXT: kandw %k4, %k0, %k0 4635; AVX512BW-NEXT: kshiftrw $9, %k1, %k7 4636; AVX512BW-NEXT: korw %k7, %k0, %k0 4637; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 4638; AVX512BW-NEXT: kandw %k4, %k0, %k0 4639; AVX512BW-NEXT: kshiftrw $8, %k1, %k7 4640; AVX512BW-NEXT: korw %k7, %k0, %k0 4641; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 4642; AVX512BW-NEXT: kandw %k4, %k0, %k0 4643; AVX512BW-NEXT: kshiftrw $7, %k1, %k1 4644; AVX512BW-NEXT: korw %k1, %k0, %k0 4645; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 4646; AVX512BW-NEXT: kandw %k1, %k0, %k0 4647; AVX512BW-NEXT: kshiftrq $5, %k5, %k1 4648; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 4649; AVX512BW-NEXT: kshiftrw $6, %k1, %k7 4650; AVX512BW-NEXT: korw %k7, %k0, %k0 4651; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 4652; AVX512BW-NEXT: kandw %k4, %k0, %k0 4653; AVX512BW-NEXT: kshiftrw $5, %k1, %k7 4654; AVX512BW-NEXT: korw %k7, %k0, %k0 4655; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 4656; AVX512BW-NEXT: kandw %k4, %k0, %k0 4657; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 4658; AVX512BW-NEXT: korw %k7, %k0, %k0 4659; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 4660; AVX512BW-NEXT: kandw %k7, %k0, %k0 4661; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 4662; AVX512BW-NEXT: korw %k7, %k0, %k0 4663; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 4664; AVX512BW-NEXT: kandw %k7, %k0, %k0 4665; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 4666; AVX512BW-NEXT: korw %k1, %k0, %k0 4667; AVX512BW-NEXT: kandw %k6, %k0, %k0 4668; AVX512BW-NEXT: 
kshiftrq $6, %k5, %k1 4669; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 4670; AVX512BW-NEXT: korw %k7, %k0, %k0 4671; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 4672; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 4673; AVX512BW-NEXT: kshiftlw $15, %k1, %k7 4674; AVX512BW-NEXT: korw %k7, %k0, %k6 4675; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k6} {z} 4676; AVX512BW-NEXT: kandw %k2, %k1, %k0 4677; AVX512BW-NEXT: kshiftrw $14, %k7, %k1 4678; AVX512BW-NEXT: korw %k1, %k0, %k0 4679; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 4680; AVX512BW-NEXT: kandw %k2, %k0, %k0 4681; AVX512BW-NEXT: kshiftrw $13, %k7, %k1 4682; AVX512BW-NEXT: korw %k1, %k0, %k0 4683; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 4684; AVX512BW-NEXT: kandw %k1, %k0, %k0 4685; AVX512BW-NEXT: kshiftrq $7, %k5, %k1 4686; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 4687; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 4688; AVX512BW-NEXT: korw %k6, %k0, %k0 4689; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 4690; AVX512BW-NEXT: kandw %k6, %k0, %k0 4691; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 4692; AVX512BW-NEXT: korw %k6, %k0, %k0 4693; AVX512BW-NEXT: kandw %k3, %k0, %k0 4694; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 4695; AVX512BW-NEXT: korw %k6, %k0, %k0 4696; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 4697; AVX512BW-NEXT: kandw %k3, %k0, %k0 4698; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 4699; AVX512BW-NEXT: korw %k6, %k0, %k0 4700; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 4701; AVX512BW-NEXT: kandw %k6, %k0, %k0 4702; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 4703; AVX512BW-NEXT: korw %k1, %k0, %k0 4704; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 4705; AVX512BW-NEXT: kandw %k1, %k0, %k0 4706; AVX512BW-NEXT: kshiftrq $8, %k5, %k1 4707; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 4708; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 4709; AVX512BW-NEXT: korw %k6, %k0, %k0 4710; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 4711; AVX512BW-NEXT: kandw %k6, %k0, %k0 4712; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 4713; AVX512BW-NEXT: korw %k6, %k0, %k0 4714; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 4715; AVX512BW-NEXT: kandw %k6, %k0, %k0 4716; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 4717; AVX512BW-NEXT: korw %k6, %k0, %k0 4718; AVX512BW-NEXT: kandw %k4, %k0, %k0 4719; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 4720; AVX512BW-NEXT: korw %k6, %k0, %k0 4721; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 4722; AVX512BW-NEXT: kandw %k4, %k0, %k0 4723; AVX512BW-NEXT: kshiftrw $3, %k1, %k1 4724; AVX512BW-NEXT: korw %k1, %k0, %k0 4725; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 4726; AVX512BW-NEXT: kandw %k1, %k0, %k0 4727; AVX512BW-NEXT: kshiftrq $9, %k5, %k1 4728; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 4729; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 4730; AVX512BW-NEXT: korw %k7, %k0, %k0 4731; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 4732; AVX512BW-NEXT: kandw %k7, %k0, %k0 4733; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 4734; AVX512BW-NEXT: korw %k7, %k0, %k0 4735; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 4736; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 4737; AVX512BW-NEXT: korw %k6, %k0, %k7 4738; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k7} {z} 4739; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 4740; AVX512BW-NEXT: kandw %k0, %k1, %k0 4741; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 4742; AVX512BW-NEXT: korw %k1, %k0, %k0 4743; AVX512BW-NEXT: kandw %k2, 
%k0, %k0 4744; AVX512BW-NEXT: kshiftrq $10, %k5, %k1 4745; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 4746; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 4747; AVX512BW-NEXT: korw %k6, %k0, %k0 4748; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 4749; AVX512BW-NEXT: kandw %k2, %k0, %k0 4750; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 4751; AVX512BW-NEXT: korw %k6, %k0, %k0 4752; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 4753; AVX512BW-NEXT: kandw %k6, %k0, %k0 4754; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 4755; AVX512BW-NEXT: korw %k6, %k0, %k0 4756; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 4757; AVX512BW-NEXT: kandw %k6, %k0, %k0 4758; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 4759; AVX512BW-NEXT: korw %k6, %k0, %k0 4760; AVX512BW-NEXT: kandw %k3, %k0, %k0 4761; AVX512BW-NEXT: kshiftrw $9, %k1, %k1 4762; AVX512BW-NEXT: korw %k1, %k0, %k0 4763; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 4764; AVX512BW-NEXT: kandw %k1, %k0, %k0 4765; AVX512BW-NEXT: kshiftrq $11, %k5, %k1 4766; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 4767; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 4768; AVX512BW-NEXT: korw %k6, %k0, %k0 4769; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 4770; AVX512BW-NEXT: kandw %k3, %k0, %k0 4771; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 4772; AVX512BW-NEXT: korw %k6, %k0, %k0 4773; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 4774; AVX512BW-NEXT: kandw %k6, %k0, %k0 4775; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 4776; AVX512BW-NEXT: korw %k6, %k0, %k0 4777; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 4778; AVX512BW-NEXT: kandw %k6, %k0, %k0 4779; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 4780; AVX512BW-NEXT: korw %k6, %k0, %k0 4781; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 4782; AVX512BW-NEXT: kandw %k6, %k0, %k0 4783; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 4784; AVX512BW-NEXT: korw %k1, %k0, %k0 4785; AVX512BW-NEXT: kandw %k4, %k0, %k0 4786; AVX512BW-NEXT: kshiftrq $12, %k5, %k1 4787; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 4788; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 4789; AVX512BW-NEXT: korw %k7, %k0, %k0 4790; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 4791; AVX512BW-NEXT: kandw %k4, %k0, %k0 4792; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 4793; AVX512BW-NEXT: korw %k7, %k0, %k0 4794; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 4795; AVX512BW-NEXT: kandw %k7, %k0, %k0 4796; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 4797; AVX512BW-NEXT: korw %k7, %k0, %k0 4798; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 4799; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 4800; AVX512BW-NEXT: korw %k6, %k0, %k6 4801; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k6} {z} 4802; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 4803; AVX512BW-NEXT: kandw %k0, %k1, %k0 4804; AVX512BW-NEXT: kshiftrq $13, %k5, %k1 4805; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 4806; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 4807; AVX512BW-NEXT: korw %k6, %k0, %k0 4808; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 4809; AVX512BW-NEXT: kandw %k6, %k0, %k0 4810; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 4811; AVX512BW-NEXT: korw %k6, %k0, %k0 4812; AVX512BW-NEXT: kandw %k2, %k0, %k0 4813; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 4814; AVX512BW-NEXT: korw %k6, %k0, %k0 4815; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 4816; AVX512BW-NEXT: kandw %k2, %k0, %k0 4817; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 4818; AVX512BW-NEXT: 
korw %k6, %k0, %k0 4819; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 4820; AVX512BW-NEXT: kandw %k6, %k0, %k0 4821; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 4822; AVX512BW-NEXT: korw %k1, %k0, %k0 4823; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 4824; AVX512BW-NEXT: kandw %k1, %k0, %k0 4825; AVX512BW-NEXT: kshiftrq $14, %k5, %k1 4826; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 4827; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 4828; AVX512BW-NEXT: korw %k6, %k0, %k0 4829; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 4830; AVX512BW-NEXT: kandw %k6, %k0, %k0 4831; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 4832; AVX512BW-NEXT: korw %k6, %k0, %k0 4833; AVX512BW-NEXT: kandw %k3, %k0, %k0 4834; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 4835; AVX512BW-NEXT: korw %k6, %k0, %k0 4836; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 4837; AVX512BW-NEXT: kandw %k3, %k0, %k0 4838; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 4839; AVX512BW-NEXT: korw %k6, %k0, %k0 4840; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 4841; AVX512BW-NEXT: kandw %k6, %k0, %k0 4842; AVX512BW-NEXT: kshiftrw $5, %k1, %k1 4843; AVX512BW-NEXT: korw %k1, %k0, %k0 4844; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 4845; AVX512BW-NEXT: kandw %k1, %k0, %k0 4846; AVX512BW-NEXT: kshiftrq $15, %k5, %k1 4847; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 4848; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 4849; AVX512BW-NEXT: korw %k7, %k0, %k0 4850; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 4851; AVX512BW-NEXT: kandw %k7, %k0, %k0 4852; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 4853; AVX512BW-NEXT: korw %k7, %k0, %k0 4854; AVX512BW-NEXT: kandw %k4, %k0, %k0 4855; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 4856; AVX512BW-NEXT: korw %k7, %k0, %k0 4857; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 4858; AVX512BW-NEXT: kandw %k4, %k0, %k0 4859; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 4860; AVX512BW-NEXT: korw %k1, %k0, %k0 4861; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 4862; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 4863; AVX512BW-NEXT: korw %k6, %k0, %k1 4864; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z} 4865; AVX512BW-NEXT: kshiftrq $16, %k5, %k0 4866; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 4867; AVX512BW-NEXT: kandw %k1, %k0, %k1 4868; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 4869; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 4870; AVX512BW-NEXT: korw %k6, %k1, %k1 4871; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 4872; AVX512BW-NEXT: kandw %k6, %k1, %k1 4873; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 4874; AVX512BW-NEXT: korw %k6, %k1, %k1 4875; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 4876; AVX512BW-NEXT: kandw %k6, %k1, %k1 4877; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 4878; AVX512BW-NEXT: korw %k6, %k1, %k1 4879; AVX512BW-NEXT: kandw %k2, %k1, %k1 4880; AVX512BW-NEXT: kshiftrw $11, %k0, %k0 4881; AVX512BW-NEXT: korw %k0, %k1, %k0 4882; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 4883; AVX512BW-NEXT: kandw %k1, %k0, %k0 4884; AVX512BW-NEXT: kshiftrq $17, %k5, %k1 4885; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 4886; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 4887; AVX512BW-NEXT: korw %k6, %k0, %k0 4888; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 4889; AVX512BW-NEXT: kandw %k2, %k0, %k0 4890; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 4891; AVX512BW-NEXT: korw %k6, %k0, %k0 4892; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 
2-byte Reload 4893; AVX512BW-NEXT: kandw %k2, %k0, %k0 4894; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 4895; AVX512BW-NEXT: korw %k6, %k0, %k0 4896; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 4897; AVX512BW-NEXT: kandw %k6, %k0, %k0 4898; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 4899; AVX512BW-NEXT: korw %k6, %k0, %k0 4900; AVX512BW-NEXT: kandw %k3, %k0, %k0 4901; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 4902; AVX512BW-NEXT: korw %k1, %k0, %k0 4903; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 4904; AVX512BW-NEXT: kandw %k3, %k0, %k0 4905; AVX512BW-NEXT: kshiftrq $18, %k5, %k1 4906; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 4907; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 4908; AVX512BW-NEXT: korw %k7, %k0, %k0 4909; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 4910; AVX512BW-NEXT: kandw %k7, %k0, %k0 4911; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 4912; AVX512BW-NEXT: korw %k7, %k0, %k0 4913; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 4914; AVX512BW-NEXT: kandw %k7, %k0, %k0 4915; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 4916; AVX512BW-NEXT: korw %k7, %k0, %k0 4917; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 4918; AVX512BW-NEXT: kandw %k7, %k0, %k0 4919; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 4920; AVX512BW-NEXT: korw %k6, %k0, %k0 4921; AVX512BW-NEXT: kandw %k4, %k0, %k0 4922; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 4923; AVX512BW-NEXT: korw %k1, %k0, %k0 4924; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 4925; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 4926; AVX512BW-NEXT: kshiftrq $19, %k5, %k1 4927; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 4928; AVX512BW-NEXT: korw %k6, %k0, %k7 4929; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k7} {z} 4930; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 4931; AVX512BW-NEXT: kandw %k0, %k1, %k0 4932; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 4933; AVX512BW-NEXT: korw %k1, %k0, %k0 4934; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 4935; AVX512BW-NEXT: kandw %k1, %k0, %k0 4936; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 4937; AVX512BW-NEXT: korw %k1, %k0, %k0 4938; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 4939; AVX512BW-NEXT: kandw %k1, %k0, %k0 4940; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 4941; AVX512BW-NEXT: korw %k1, %k0, %k0 4942; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 4943; AVX512BW-NEXT: kandw %k1, %k0, %k0 4944; AVX512BW-NEXT: kshiftrq $20, %k5, %k1 4945; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 4946; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 4947; AVX512BW-NEXT: korw %k6, %k0, %k0 4948; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 4949; AVX512BW-NEXT: kandw %k4, %k0, %k0 4950; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 4951; AVX512BW-NEXT: korw %k6, %k0, %k0 4952; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 4953; AVX512BW-NEXT: kandw %k4, %k0, %k0 4954; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 4955; AVX512BW-NEXT: korw %k6, %k0, %k0 4956; AVX512BW-NEXT: kandw %k2, %k0, %k0 4957; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 4958; AVX512BW-NEXT: korw %k6, %k0, %k0 4959; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 4960; AVX512BW-NEXT: kandw %k2, %k0, %k0 4961; AVX512BW-NEXT: kshiftrw $7, %k1, %k1 4962; AVX512BW-NEXT: korw %k1, %k0, %k0 4963; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 4964; AVX512BW-NEXT: kandw %k1, %k0, %k0 4965; AVX512BW-NEXT: kshiftrq $21, %k5, %k1 4966; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 4967; AVX512BW-NEXT: 
kshiftrw $6, %k1, %k6 4968; AVX512BW-NEXT: korw %k6, %k0, %k0 4969; AVX512BW-NEXT: kandw %k3, %k0, %k0 4970; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 4971; AVX512BW-NEXT: korw %k6, %k0, %k0 4972; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 4973; AVX512BW-NEXT: kandw %k4, %k0, %k0 4974; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 4975; AVX512BW-NEXT: korw %k6, %k0, %k0 4976; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 4977; AVX512BW-NEXT: kandw %k2, %k0, %k0 4978; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 4979; AVX512BW-NEXT: korw %k6, %k0, %k0 4980; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 4981; AVX512BW-NEXT: kandw %k3, %k0, %k0 4982; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 4983; AVX512BW-NEXT: korw %k1, %k0, %k0 4984; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 4985; AVX512BW-NEXT: kandw %k1, %k0, %k0 4986; AVX512BW-NEXT: kshiftrq $22, %k5, %k1 4987; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 4988; AVX512BW-NEXT: korw %k6, %k0, %k0 4989; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 4990; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 4991; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 4992; AVX512BW-NEXT: korw %k6, %k0, %k7 4993; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k7} {z} 4994; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 4995; AVX512BW-NEXT: kandw %k0, %k1, %k0 4996; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 4997; AVX512BW-NEXT: korw %k1, %k0, %k0 4998; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 4999; AVX512BW-NEXT: kandw %k3, %k0, %k0 5000; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 5001; AVX512BW-NEXT: korw %k1, %k0, %k0 5002; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 5003; AVX512BW-NEXT: kandw %k1, %k0, %k0 5004; AVX512BW-NEXT: kshiftrq $23, %k5, %k1 5005; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 5006; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 5007; AVX512BW-NEXT: korw %k6, %k0, %k0 5008; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 5009; AVX512BW-NEXT: kandw %k6, %k0, %k0 5010; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 5011; AVX512BW-NEXT: korw %k6, %k0, %k0 5012; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 5013; AVX512BW-NEXT: kandw %k6, %k0, %k0 5014; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 5015; AVX512BW-NEXT: korw %k6, %k0, %k0 5016; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 5017; AVX512BW-NEXT: kandw %k6, %k0, %k0 5018; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 5019; AVX512BW-NEXT: korw %k6, %k0, %k0 5020; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 5021; AVX512BW-NEXT: kandw %k6, %k0, %k0 5022; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 5023; AVX512BW-NEXT: korw %k1, %k0, %k0 5024; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 5025; AVX512BW-NEXT: kandw %k1, %k0, %k0 5026; AVX512BW-NEXT: kshiftrq $24, %k5, %k1 5027; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 5028; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 5029; AVX512BW-NEXT: korw %k6, %k0, %k0 5030; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 5031; AVX512BW-NEXT: kandw %k6, %k0, %k0 5032; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 5033; AVX512BW-NEXT: korw %k6, %k0, %k0 5034; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 5035; AVX512BW-NEXT: kandw %k6, %k0, %k0 5036; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 5037; AVX512BW-NEXT: korw %k6, %k0, %k0 5038; AVX512BW-NEXT: kandw %k4, %k0, %k0 5039; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 5040; AVX512BW-NEXT: korw %k6, %k0, %k0 5041; AVX512BW-NEXT: kandw %k2, 
%k0, %k0 5042; AVX512BW-NEXT: kshiftrw $3, %k1, %k1 5043; AVX512BW-NEXT: korw %k1, %k0, %k0 5044; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 5045; AVX512BW-NEXT: kandw %k1, %k0, %k0 5046; AVX512BW-NEXT: kshiftrq $25, %k5, %k1 5047; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 5048; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 5049; AVX512BW-NEXT: korw %k7, %k0, %k0 5050; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 5051; AVX512BW-NEXT: kandw %k2, %k0, %k0 5052; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 5053; AVX512BW-NEXT: korw %k7, %k0, %k0 5054; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 5055; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 5056; AVX512BW-NEXT: korw %k6, %k0, %k7 5057; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k7} {z} 5058; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 5059; AVX512BW-NEXT: kandw %k2, %k1, %k0 5060; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 5061; AVX512BW-NEXT: korw %k1, %k0, %k0 5062; AVX512BW-NEXT: kandw %k3, %k0, %k0 5063; AVX512BW-NEXT: kshiftrq $26, %k5, %k1 5064; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 5065; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 5066; AVX512BW-NEXT: korw %k6, %k0, %k0 5067; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 5068; AVX512BW-NEXT: kandw %k3, %k0, %k0 5069; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 5070; AVX512BW-NEXT: korw %k6, %k0, %k0 5071; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 5072; AVX512BW-NEXT: kandw %k3, %k0, %k0 5073; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 5074; AVX512BW-NEXT: korw %k6, %k0, %k0 5075; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 5076; AVX512BW-NEXT: kandw %k3, %k0, %k0 5077; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 5078; AVX512BW-NEXT: korw %k6, %k0, %k0 5079; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 5080; AVX512BW-NEXT: kandw %k4, %k0, %k0 5081; AVX512BW-NEXT: kshiftrw $9, %k1, %k1 5082; AVX512BW-NEXT: korw %k1, %k0, %k0 5083; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 5084; AVX512BW-NEXT: kandw %k1, %k0, %k0 5085; AVX512BW-NEXT: kshiftrq $27, %k5, %k1 5086; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 5087; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 5088; AVX512BW-NEXT: korw %k6, %k0, %k0 5089; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 5090; AVX512BW-NEXT: kandw %k6, %k0, %k0 5091; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 5092; AVX512BW-NEXT: korw %k6, %k0, %k0 5093; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 5094; AVX512BW-NEXT: kandw %k6, %k0, %k0 5095; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 5096; AVX512BW-NEXT: korw %k6, %k0, %k0 5097; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 5098; AVX512BW-NEXT: kandw %k6, %k0, %k0 5099; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 5100; AVX512BW-NEXT: korw %k6, %k0, %k0 5101; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 5102; AVX512BW-NEXT: kandw %k6, %k0, %k0 5103; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 5104; AVX512BW-NEXT: korw %k1, %k0, %k0 5105; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 5106; AVX512BW-NEXT: kandw %k1, %k0, %k0 5107; AVX512BW-NEXT: kshiftrq $28, %k5, %k1 5108; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 5109; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 5110; AVX512BW-NEXT: korw %k7, %k0, %k0 5111; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 5112; AVX512BW-NEXT: kandw %k7, %k0, %k0 5113; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 5114; AVX512BW-NEXT: korw %k7, %k0, %k0 5115; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 5116; AVX512BW-NEXT: kandw %k7, %k0, %k0 5117; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 5118; AVX512BW-NEXT: korw %k7, %k0, %k0 5119; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 5120; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 5121; AVX512BW-NEXT: korw %k6, %k0, %k6 5122; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k6} {z} 5123; AVX512BW-NEXT: kandw %k2, %k1, %k0 5124; AVX512BW-NEXT: kshiftrq $29, %k5, %k1 5125; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 5126; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 5127; AVX512BW-NEXT: korw %k6, %k0, %k0 5128; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 5129; AVX512BW-NEXT: kandw %k2, %k0, %k0 5130; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 5131; AVX512BW-NEXT: korw %k6, %k0, %k0 5132; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 5133; AVX512BW-NEXT: kandw %k6, %k0, %k0 5134; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 5135; AVX512BW-NEXT: korw %k6, %k0, %k0 5136; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 5137; AVX512BW-NEXT: kandw %k6, %k0, %k0 5138; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 5139; AVX512BW-NEXT: korw %k6, %k0, %k0 5140; AVX512BW-NEXT: kandw %k3, %k0, %k0 5141; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 5142; AVX512BW-NEXT: korw %k1, %k0, %k0 5143; AVX512BW-NEXT: kandw %k4, %k0, %k0 5144; AVX512BW-NEXT: kshiftrq $30, %k5, %k1 5145; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 5146; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 5147; AVX512BW-NEXT: korw %k6, %k0, %k0 5148; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 5149; AVX512BW-NEXT: kandw %k3, %k0, %k0 5150; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 5151; AVX512BW-NEXT: korw %k6, %k0, %k0 5152; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 5153; AVX512BW-NEXT: kandw %k3, %k0, %k0 5154; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 5155; AVX512BW-NEXT: korw %k6, %k0, %k0 5156; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 5157; AVX512BW-NEXT: kandw %k4, %k0, %k0 5158; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 5159; AVX512BW-NEXT: korw %k6, %k0, %k0 5160; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 5161; AVX512BW-NEXT: kandw %k4, %k0, %k0 5162; AVX512BW-NEXT: kshiftrw $5, %k1, %k1 5163; AVX512BW-NEXT: korw %k1, %k0, %k0 5164; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 5165; AVX512BW-NEXT: kandw %k1, %k0, %k0 5166; AVX512BW-NEXT: kshiftrq $31, %k5, %k1 5167; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 5168; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 5169; AVX512BW-NEXT: korw %k7, %k0, %k0 5170; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 5171; AVX512BW-NEXT: kandw %k4, %k0, %k0 5172; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 5173; AVX512BW-NEXT: korw %k7, %k0, %k0 5174; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 5175; AVX512BW-NEXT: kandw %k4, %k0, %k0 5176; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 5177; AVX512BW-NEXT: korw %k7, %k0, %k0 5178; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 5179; AVX512BW-NEXT: kandw %k4, %k0, %k0 5180; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 5181; AVX512BW-NEXT: korw %k1, %k0, %k0 5182; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 5183; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 5184; AVX512BW-NEXT: korw %k6, %k0, %k1 5185; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k1} {z} 5186; AVX512BW-NEXT: kshiftrq $32, %k5, %k0 5187; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 5188; AVX512BW-NEXT: kandw %k1, %k0, %k1 5189; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 
5190; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 5191; AVX512BW-NEXT: korw %k6, %k1, %k1 5192; AVX512BW-NEXT: kandw %k2, %k1, %k1 5193; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 5194; AVX512BW-NEXT: korw %k6, %k1, %k1 5195; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 5196; AVX512BW-NEXT: kandw %k2, %k1, %k1 5197; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 5198; AVX512BW-NEXT: korw %k6, %k1, %k1 5199; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 5200; AVX512BW-NEXT: kandw %k4, %k1, %k1 5201; AVX512BW-NEXT: kshiftrw $11, %k0, %k0 5202; AVX512BW-NEXT: korw %k0, %k1, %k0 5203; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 5204; AVX512BW-NEXT: kandw %k1, %k0, %k0 5205; AVX512BW-NEXT: kshiftrq $33, %k5, %k1 5206; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 5207; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 5208; AVX512BW-NEXT: korw %k6, %k0, %k0 5209; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 5210; AVX512BW-NEXT: kandw %k6, %k0, %k0 5211; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 5212; AVX512BW-NEXT: korw %k6, %k0, %k0 5213; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 5214; AVX512BW-NEXT: kandw %k6, %k0, %k0 5215; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 5216; AVX512BW-NEXT: korw %k6, %k0, %k0 5217; AVX512BW-NEXT: kandw %k3, %k0, %k0 5218; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 5219; AVX512BW-NEXT: korw %k6, %k0, %k0 5220; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 5221; AVX512BW-NEXT: kandw %k3, %k0, %k0 5222; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 5223; AVX512BW-NEXT: korw %k1, %k0, %k0 5224; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 5225; AVX512BW-NEXT: kandw %k1, %k0, %k0 5226; AVX512BW-NEXT: kshiftrq $34, %k5, %k1 5227; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 5228; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 5229; AVX512BW-NEXT: korw %k7, %k0, %k0 5230; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 5231; AVX512BW-NEXT: kandw %k3, %k0, %k0 5232; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 5233; AVX512BW-NEXT: korw %k7, %k0, %k0 5234; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 5235; AVX512BW-NEXT: kandw %k3, %k0, %k0 5236; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 5237; AVX512BW-NEXT: korw %k7, %k0, %k0 5238; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 5239; AVX512BW-NEXT: kandw %k3, %k0, %k0 5240; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 5241; AVX512BW-NEXT: korw %k6, %k0, %k0 5242; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 5243; AVX512BW-NEXT: kandw %k6, %k0, %k0 5244; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 5245; AVX512BW-NEXT: korw %k1, %k0, %k0 5246; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 5247; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 5248; AVX512BW-NEXT: kshiftrq $35, %k5, %k1 5249; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 5250; AVX512BW-NEXT: korw %k6, %k0, %k7 5251; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k7} {z} 5252; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 5253; AVX512BW-NEXT: kandw %k0, %k1, %k0 5254; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 5255; AVX512BW-NEXT: korw %k1, %k0, %k0 5256; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 5257; AVX512BW-NEXT: kandw %k1, %k0, %k0 5258; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 5259; AVX512BW-NEXT: korw %k1, %k0, %k0 5260; AVX512BW-NEXT: kandw %k2, %k0, %k0 5261; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 5262; AVX512BW-NEXT: korw %k1, %k0, %k0 5263; AVX512BW-NEXT: kandw %k4, %k0, %k0 5264; AVX512BW-NEXT: kshiftrq $36, %k5, 
%k1 5265; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 5266; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 5267; AVX512BW-NEXT: korw %k6, %k0, %k0 5268; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 5269; AVX512BW-NEXT: kandw %k2, %k0, %k0 5270; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 5271; AVX512BW-NEXT: korw %k6, %k0, %k0 5272; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 5273; AVX512BW-NEXT: kandw %k4, %k0, %k0 5274; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 5275; AVX512BW-NEXT: korw %k6, %k0, %k0 5276; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 5277; AVX512BW-NEXT: kandw %k6, %k0, %k0 5278; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 5279; AVX512BW-NEXT: korw %k6, %k0, %k0 5280; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 5281; AVX512BW-NEXT: kandw %k6, %k0, %k0 5282; AVX512BW-NEXT: kshiftrw $7, %k1, %k1 5283; AVX512BW-NEXT: korw %k1, %k0, %k0 5284; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 5285; AVX512BW-NEXT: kandw %k1, %k0, %k0 5286; AVX512BW-NEXT: kshiftrq $37, %k5, %k1 5287; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 5288; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 5289; AVX512BW-NEXT: korw %k6, %k0, %k0 5290; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 5291; AVX512BW-NEXT: kandw %k6, %k0, %k0 5292; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 5293; AVX512BW-NEXT: korw %k6, %k0, %k0 5294; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 5295; AVX512BW-NEXT: kandw %k6, %k0, %k0 5296; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 5297; AVX512BW-NEXT: korw %k6, %k0, %k0 5298; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 5299; AVX512BW-NEXT: kandw %k6, %k0, %k0 5300; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 5301; AVX512BW-NEXT: korw %k6, %k0, %k0 5302; AVX512BW-NEXT: kandw %k3, %k0, %k0 5303; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 5304; AVX512BW-NEXT: korw %k1, %k0, %k0 5305; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 5306; AVX512BW-NEXT: kandw %k1, %k0, %k0 5307; AVX512BW-NEXT: kshiftrq $38, %k5, %k1 5308; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 5309; AVX512BW-NEXT: korw %k6, %k0, %k0 5310; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 5311; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 5312; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 5313; AVX512BW-NEXT: korw %k6, %k0, %k7 5314; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k7} {z} 5315; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 5316; AVX512BW-NEXT: kandw %k3, %k1, %k0 5317; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 5318; AVX512BW-NEXT: korw %k1, %k0, %k0 5319; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 5320; AVX512BW-NEXT: kandw %k1, %k0, %k0 5321; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 5322; AVX512BW-NEXT: korw %k1, %k0, %k0 5323; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 5324; AVX512BW-NEXT: kandw %k1, %k0, %k0 5325; AVX512BW-NEXT: kshiftrq $39, %k5, %k1 5326; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 5327; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 5328; AVX512BW-NEXT: korw %k6, %k0, %k0 5329; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 5330; AVX512BW-NEXT: kandw %k6, %k0, %k0 5331; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 5332; AVX512BW-NEXT: korw %k6, %k0, %k0 5333; AVX512BW-NEXT: kandw %k2, %k0, %k0 5334; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 5335; AVX512BW-NEXT: korw %k6, %k0, %k0 5336; AVX512BW-NEXT: kandw %k4, %k0, %k0 5337; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 5338; AVX512BW-NEXT: korw %k6, %k0, %k0 5339; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 5340; AVX512BW-NEXT: kandw %k4, %k0, %k0 5341; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 5342; AVX512BW-NEXT: korw %k1, %k0, %k0 5343; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 5344; AVX512BW-NEXT: kandw %k1, %k0, %k0 5345; AVX512BW-NEXT: kshiftrq $40, %k5, %k1 5346; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 5347; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 5348; AVX512BW-NEXT: korw %k6, %k0, %k0 5349; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 5350; AVX512BW-NEXT: kandw %k2, %k0, %k0 5351; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 5352; AVX512BW-NEXT: korw %k6, %k0, %k0 5353; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 5354; AVX512BW-NEXT: kandw %k6, %k0, %k0 5355; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 5356; AVX512BW-NEXT: korw %k6, %k0, %k0 5357; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 5358; AVX512BW-NEXT: kandw %k6, %k0, %k0 5359; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 5360; AVX512BW-NEXT: korw %k6, %k0, %k0 5361; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 5362; AVX512BW-NEXT: kandw %k6, %k0, %k0 5363; AVX512BW-NEXT: kshiftrw $3, %k1, %k1 5364; AVX512BW-NEXT: korw %k1, %k0, %k0 5365; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 5366; AVX512BW-NEXT: kandw %k1, %k0, %k0 5367; AVX512BW-NEXT: kshiftrq $41, %k5, %k1 5368; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 5369; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 5370; AVX512BW-NEXT: korw %k7, %k0, %k0 5371; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 5372; AVX512BW-NEXT: kandw %k7, %k0, %k0 5373; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 5374; AVX512BW-NEXT: korw %k7, %k0, %k0 5375; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 5376; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 5377; AVX512BW-NEXT: korw %k6, %k0, %k7 5378; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k7} {z} 5379; AVX512BW-NEXT: kandw %k3, %k1, %k0 5380; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 5381; AVX512BW-NEXT: korw %k1, %k0, %k0 5382; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 5383; AVX512BW-NEXT: kandw %k1, %k0, %k0 5384; AVX512BW-NEXT: kshiftrq $42, %k5, %k1 5385; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 5386; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 5387; AVX512BW-NEXT: korw %k6, %k0, %k0 5388; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 5389; AVX512BW-NEXT: kandw %k3, %k0, %k0 5390; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 5391; AVX512BW-NEXT: korw %k6, %k0, %k0 5392; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 5393; AVX512BW-NEXT: kandw %k3, %k0, %k0 5394; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 5395; AVX512BW-NEXT: korw %k6, %k0, %k0 5396; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 5397; AVX512BW-NEXT: kandw %k3, %k0, %k0 5398; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 5399; AVX512BW-NEXT: korw %k6, %k0, %k0 5400; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 5401; AVX512BW-NEXT: kandw %k3, %k0, %k0 5402; AVX512BW-NEXT: kshiftrw $9, %k1, %k1 5403; AVX512BW-NEXT: korw %k1, %k0, %k0 5404; AVX512BW-NEXT: kandw %k4, %k0, %k0 5405; AVX512BW-NEXT: kshiftrq $43, %k5, %k1 5406; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 5407; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 5408; AVX512BW-NEXT: korw %k6, %k0, %k0 5409; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 5410; AVX512BW-NEXT: kandw %k4, %k0, %k0 5411; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 5412; AVX512BW-NEXT: korw %k6, %k0, %k0 5413; AVX512BW-NEXT: kandw %k2, 
%k0, %k0 5414; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 5415; AVX512BW-NEXT: korw %k6, %k0, %k0 5416; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 5417; AVX512BW-NEXT: kandw %k3, %k0, %k0 5418; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 5419; AVX512BW-NEXT: korw %k6, %k0, %k0 5420; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 5421; AVX512BW-NEXT: kandw %k2, %k0, %k0 5422; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 5423; AVX512BW-NEXT: korw %k1, %k0, %k0 5424; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 5425; AVX512BW-NEXT: kandw %k1, %k0, %k0 5426; AVX512BW-NEXT: kshiftrq $44, %k5, %k1 5427; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 5428; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 5429; AVX512BW-NEXT: korw %k7, %k0, %k0 5430; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 5431; AVX512BW-NEXT: kandw %k2, %k0, %k0 5432; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 5433; AVX512BW-NEXT: korw %k7, %k0, %k0 5434; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 5435; AVX512BW-NEXT: kandw %k2, %k0, %k0 5436; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 5437; AVX512BW-NEXT: korw %k7, %k0, %k0 5438; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 5439; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 5440; AVX512BW-NEXT: korw %k6, %k0, %k6 5441; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k6} {z} 5442; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 5443; AVX512BW-NEXT: kandw %k0, %k1, %k0 5444; AVX512BW-NEXT: kshiftrq $45, %k5, %k1 5445; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 5446; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 5447; AVX512BW-NEXT: korw %k6, %k0, %k0 5448; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 5449; AVX512BW-NEXT: kandw %k6, %k0, %k0 5450; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 5451; AVX512BW-NEXT: korw %k6, %k0, %k0 5452; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 5453; AVX512BW-NEXT: kandw %k6, %k0, %k0 5454; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 5455; AVX512BW-NEXT: korw %k6, %k0, %k0 5456; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 5457; AVX512BW-NEXT: kandw %k6, %k0, %k0 5458; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 5459; AVX512BW-NEXT: korw %k6, %k0, %k0 5460; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 5461; AVX512BW-NEXT: kandw %k6, %k0, %k0 5462; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 5463; AVX512BW-NEXT: korw %k1, %k0, %k0 5464; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 5465; AVX512BW-NEXT: kandw %k1, %k0, %k0 5466; AVX512BW-NEXT: kshiftrq $46, %k5, %k1 5467; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 5468; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 5469; AVX512BW-NEXT: korw %k6, %k0, %k0 5470; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 5471; AVX512BW-NEXT: kandw %k6, %k0, %k0 5472; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 5473; AVX512BW-NEXT: korw %k6, %k0, %k0 5474; AVX512BW-NEXT: kandw %k4, %k0, %k0 5475; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 5476; AVX512BW-NEXT: korw %k6, %k0, %k0 5477; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 5478; AVX512BW-NEXT: kandw %k4, %k0, %k0 5479; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 5480; AVX512BW-NEXT: korw %k6, %k0, %k0 5481; AVX512BW-NEXT: kandw %k3, %k0, %k0 5482; AVX512BW-NEXT: kshiftrw $5, %k1, %k1 5483; AVX512BW-NEXT: korw %k1, %k0, %k0 5484; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 5485; AVX512BW-NEXT: kandw %k4, %k0, %k0 5486; AVX512BW-NEXT: kshiftrq $47, %k5, %k1 5487; AVX512BW-NEXT: kshiftlw $15, %k1, 
%k6 5488; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 5489; AVX512BW-NEXT: korw %k7, %k0, %k0 5490; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 5491; AVX512BW-NEXT: kandw %k3, %k0, %k0 5492; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 5493; AVX512BW-NEXT: korw %k7, %k0, %k0 5494; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 5495; AVX512BW-NEXT: kandw %k7, %k0, %k0 5496; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 5497; AVX512BW-NEXT: korw %k7, %k0, %k0 5498; AVX512BW-NEXT: kandw %k2, %k0, %k0 5499; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 5500; AVX512BW-NEXT: korw %k1, %k0, %k0 5501; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 5502; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 5503; AVX512BW-NEXT: korw %k6, %k0, %k1 5504; AVX512BW-NEXT: vmovdqa32 896(%rsi), %zmm14 {%k1} {z} 5505; AVX512BW-NEXT: kshiftrq $48, %k5, %k0 5506; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 5507; AVX512BW-NEXT: kandw %k1, %k0, %k1 5508; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 5509; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 5510; AVX512BW-NEXT: korw %k6, %k1, %k1 5511; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 5512; AVX512BW-NEXT: kandw %k2, %k1, %k1 5513; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 5514; AVX512BW-NEXT: korw %k6, %k1, %k1 5515; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 5516; AVX512BW-NEXT: kandw %k6, %k1, %k1 5517; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 5518; AVX512BW-NEXT: korw %k6, %k1, %k1 5519; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 5520; AVX512BW-NEXT: kandw %k6, %k1, %k1 5521; AVX512BW-NEXT: kshiftrw $11, %k0, %k0 5522; AVX512BW-NEXT: korw %k0, %k1, %k0 5523; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 5524; AVX512BW-NEXT: kandw %k1, %k0, %k0 5525; AVX512BW-NEXT: kshiftrq $49, %k5, %k1 5526; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 5527; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 5528; AVX512BW-NEXT: korw %k6, %k0, %k0 5529; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 5530; AVX512BW-NEXT: kandw %k6, %k0, %k0 5531; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 5532; AVX512BW-NEXT: korw %k6, %k0, %k0 5533; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 5534; AVX512BW-NEXT: kandw %k6, %k0, %k0 5535; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 5536; AVX512BW-NEXT: korw %k6, %k0, %k0 5537; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 5538; AVX512BW-NEXT: kandw %k6, %k0, %k0 5539; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 5540; AVX512BW-NEXT: korw %k6, %k0, %k0 5541; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 5542; AVX512BW-NEXT: kandw %k6, %k0, %k0 5543; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 5544; AVX512BW-NEXT: korw %k1, %k0, %k0 5545; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 5546; AVX512BW-NEXT: kandw %k1, %k0, %k0 5547; AVX512BW-NEXT: kshiftrq $50, %k5, %k1 5548; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 5549; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 5550; AVX512BW-NEXT: korw %k7, %k0, %k0 5551; AVX512BW-NEXT: kandw %k4, %k0, %k0 5552; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 5553; AVX512BW-NEXT: korw %k7, %k0, %k0 5554; AVX512BW-NEXT: kandw %k3, %k0, %k0 5555; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 5556; AVX512BW-NEXT: korw %k7, %k0, %k0 5557; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 5558; AVX512BW-NEXT: kandw %k3, %k0, %k0 5559; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 5560; AVX512BW-NEXT: korw %k6, %k0, %k0 5561; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 5562; 
AVX512BW-NEXT: kandw %k3, %k0, %k0 5563; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 5564; AVX512BW-NEXT: korw %k1, %k0, %k0 5565; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 5566; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 5567; AVX512BW-NEXT: kshiftrq $51, %k5, %k1 5568; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 5569; AVX512BW-NEXT: korw %k6, %k0, %k7 5570; AVX512BW-NEXT: vmovdqa32 960(%rsi), %zmm15 {%k7} {z} 5571; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 5572; AVX512BW-NEXT: kandw %k0, %k1, %k0 5573; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 5574; AVX512BW-NEXT: korw %k1, %k0, %k0 5575; AVX512BW-NEXT: kandw %k2, %k0, %k0 5576; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 5577; AVX512BW-NEXT: korw %k1, %k0, %k0 5578; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 5579; AVX512BW-NEXT: kandw %k4, %k0, %k0 5580; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 5581; AVX512BW-NEXT: korw %k1, %k0, %k0 5582; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 5583; AVX512BW-NEXT: kandw %k1, %k0, %k0 5584; AVX512BW-NEXT: kshiftrq $52, %k5, %k1 5585; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 5586; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 5587; AVX512BW-NEXT: korw %k6, %k0, %k0 5588; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 5589; AVX512BW-NEXT: kandw %k2, %k0, %k0 5590; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 5591; AVX512BW-NEXT: korw %k6, %k0, %k0 5592; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 5593; AVX512BW-NEXT: kandw %k2, %k0, %k0 5594; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 5595; AVX512BW-NEXT: korw %k6, %k0, %k0 5596; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 5597; AVX512BW-NEXT: kandw %k2, %k0, %k0 5598; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 5599; AVX512BW-NEXT: korw %k6, %k0, %k0 5600; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 5601; AVX512BW-NEXT: kandw %k2, %k0, %k0 5602; AVX512BW-NEXT: kshiftrw $7, %k1, %k1 5603; AVX512BW-NEXT: korw %k1, %k0, %k0 5604; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 5605; AVX512BW-NEXT: kandw %k1, %k0, %k0 5606; AVX512BW-NEXT: kshiftrq $53, %k5, %k1 5607; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 5608; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 5609; AVX512BW-NEXT: korw %k6, %k0, %k0 5610; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 5611; AVX512BW-NEXT: kandw %k2, %k0, %k0 5612; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 5613; AVX512BW-NEXT: korw %k6, %k0, %k0 5614; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 5615; AVX512BW-NEXT: kandw %k3, %k0, %k0 5616; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 5617; AVX512BW-NEXT: korw %k6, %k0, %k0 5618; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 5619; AVX512BW-NEXT: kandw %k2, %k0, %k0 5620; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 5621; AVX512BW-NEXT: korw %k6, %k0, %k0 5622; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 5623; AVX512BW-NEXT: kandw %k2, %k0, %k0 5624; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 5625; AVX512BW-NEXT: korw %k1, %k0, %k0 5626; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 5627; AVX512BW-NEXT: kandw %k1, %k0, %k0 5628; AVX512BW-NEXT: kshiftrq $54, %k5, %k1 5629; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 5630; AVX512BW-NEXT: korw %k6, %k0, %k0 5631; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 5632; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 5633; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 5634; AVX512BW-NEXT: korw %k6, %k0, %k7 5635; AVX512BW-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k7} {z} 5636; AVX512BW-NEXT: 
kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 5637; AVX512BW-NEXT: kandw %k0, %k1, %k0 5638; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 5639; AVX512BW-NEXT: korw %k1, %k0, %k0 5640; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 5641; AVX512BW-NEXT: kandw %k1, %k0, %k0 5642; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 5643; AVX512BW-NEXT: korw %k1, %k0, %k0 5644; AVX512BW-NEXT: kandw %k4, %k0, %k0 5645; AVX512BW-NEXT: kshiftrq $55, %k5, %k1 5646; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 5647; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 5648; AVX512BW-NEXT: korw %k6, %k0, %k0 5649; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 5650; AVX512BW-NEXT: kandw %k4, %k0, %k0 5651; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 5652; AVX512BW-NEXT: korw %k6, %k0, %k0 5653; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 5654; AVX512BW-NEXT: kandw %k6, %k0, %k0 5655; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 5656; AVX512BW-NEXT: korw %k6, %k0, %k0 5657; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 5658; AVX512BW-NEXT: kandw %k6, %k0, %k0 5659; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 5660; AVX512BW-NEXT: korw %k6, %k0, %k0 5661; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 5662; AVX512BW-NEXT: kandw %k6, %k0, %k0 5663; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 5664; AVX512BW-NEXT: korw %k1, %k0, %k0 5665; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 5666; AVX512BW-NEXT: kandw %k1, %k0, %k0 5667; AVX512BW-NEXT: kshiftrq $56, %k5, %k1 5668; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 5669; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 5670; AVX512BW-NEXT: korw %k6, %k0, %k0 5671; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 5672; AVX512BW-NEXT: kandw %k6, %k0, %k0 5673; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 5674; AVX512BW-NEXT: korw %k6, %k0, %k0 5675; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 5676; AVX512BW-NEXT: kandw %k6, %k0, %k0 5677; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 5678; AVX512BW-NEXT: korw %k6, %k0, %k0 5679; AVX512BW-NEXT: kandw %k3, %k0, %k0 5680; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 5681; AVX512BW-NEXT: korw %k6, %k0, %k0 5682; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 5683; AVX512BW-NEXT: kandw %k3, %k0, %k0 5684; AVX512BW-NEXT: kshiftrw $3, %k1, %k1 5685; AVX512BW-NEXT: korw %k1, %k0, %k0 5686; AVX512BW-NEXT: kandw %k2, %k0, %k0 5687; AVX512BW-NEXT: kshiftrq $57, %k5, %k1 5688; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 5689; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 5690; AVX512BW-NEXT: korw %k7, %k0, %k0 5691; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 5692; AVX512BW-NEXT: kandw %k2, %k0, %k0 5693; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 5694; AVX512BW-NEXT: korw %k7, %k0, %k0 5695; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 5696; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 5697; AVX512BW-NEXT: korw %k6, %k0, %k7 5698; AVX512BW-NEXT: vmovdqa32 1088(%rsi), %zmm17 {%k7} {z} 5699; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 5700; AVX512BW-NEXT: kandw %k0, %k1, %k0 5701; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 5702; AVX512BW-NEXT: korw %k1, %k0, %k0 5703; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 5704; AVX512BW-NEXT: kandw %k1, %k0, %k0 5705; AVX512BW-NEXT: kshiftrq $58, %k5, %k1 5706; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 5707; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 5708; AVX512BW-NEXT: korw %k6, %k0, %k0 5709; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 5710; AVX512BW-NEXT: 
kandw %k6, %k0, %k0 5711; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 5712; AVX512BW-NEXT: korw %k6, %k0, %k0 5713; AVX512BW-NEXT: kandw %k4, %k0, %k0 5714; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 5715; AVX512BW-NEXT: korw %k6, %k0, %k0 5716; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 5717; AVX512BW-NEXT: kandw %k4, %k0, %k0 5718; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 5719; AVX512BW-NEXT: korw %k6, %k0, %k0 5720; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 5721; AVX512BW-NEXT: kandw %k6, %k0, %k0 5722; AVX512BW-NEXT: kshiftrw $9, %k1, %k1 5723; AVX512BW-NEXT: korw %k1, %k0, %k0 5724; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 5725; AVX512BW-NEXT: kandw %k1, %k0, %k0 5726; AVX512BW-NEXT: kshiftrq $59, %k5, %k1 5727; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 5728; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 5729; AVX512BW-NEXT: korw %k6, %k0, %k0 5730; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 5731; AVX512BW-NEXT: kandw %k6, %k0, %k0 5732; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 5733; AVX512BW-NEXT: korw %k6, %k0, %k0 5734; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 5735; AVX512BW-NEXT: kandw %k6, %k0, %k0 5736; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 5737; AVX512BW-NEXT: korw %k6, %k0, %k0 5738; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 5739; AVX512BW-NEXT: kandw %k6, %k0, %k0 5740; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 5741; AVX512BW-NEXT: korw %k6, %k0, %k0 5742; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 5743; AVX512BW-NEXT: kandw %k6, %k0, %k0 5744; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 5745; AVX512BW-NEXT: korw %k1, %k0, %k0 5746; AVX512BW-NEXT: kandw %k3, %k0, %k0 5747; AVX512BW-NEXT: kshiftrq $60, %k5, %k1 5748; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 5749; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 5750; AVX512BW-NEXT: korw %k7, %k0, %k0 5751; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 5752; AVX512BW-NEXT: kandw %k3, %k0, %k0 5753; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 5754; AVX512BW-NEXT: korw %k7, %k0, %k0 5755; AVX512BW-NEXT: kandw %k2, %k0, %k0 5756; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 5757; AVX512BW-NEXT: korw %k7, %k0, %k0 5758; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 5759; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 5760; AVX512BW-NEXT: korw %k6, %k0, %k6 5761; AVX512BW-NEXT: vmovdqa32 1152(%rsi), %zmm18 {%k6} {z} 5762; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 5763; AVX512BW-NEXT: kandw %k0, %k1, %k0 5764; AVX512BW-NEXT: kshiftrq $61, %k5, %k1 5765; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 5766; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 5767; AVX512BW-NEXT: korw %k6, %k0, %k0 5768; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 5769; AVX512BW-NEXT: kandw %k2, %k0, %k0 5770; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 5771; AVX512BW-NEXT: korw %k6, %k0, %k0 5772; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 5773; AVX512BW-NEXT: kandw %k2, %k0, %k0 5774; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 5775; AVX512BW-NEXT: korw %k6, %k0, %k0 5776; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 5777; AVX512BW-NEXT: kandw %k2, %k0, %k0 5778; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 5779; AVX512BW-NEXT: korw %k6, %k0, %k0 5780; AVX512BW-NEXT: kandw %k4, %k0, %k0 5781; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 5782; AVX512BW-NEXT: korw %k1, %k0, %k0 5783; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 5784; AVX512BW-NEXT: kandw %k1, %k0, %k0 5785; 
AVX512BW-NEXT: kshiftrq $62, %k5, %k1 5786; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 5787; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 5788; AVX512BW-NEXT: korw %k6, %k0, %k0 5789; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 5790; AVX512BW-NEXT: kandw %k2, %k0, %k0 5791; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 5792; AVX512BW-NEXT: korw %k6, %k0, %k0 5793; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 5794; AVX512BW-NEXT: kandw %k2, %k0, %k0 5795; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 5796; AVX512BW-NEXT: korw %k6, %k0, %k0 5797; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 5798; AVX512BW-NEXT: kandw %k2, %k0, %k0 5799; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 5800; AVX512BW-NEXT: korw %k6, %k0, %k0 5801; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 5802; AVX512BW-NEXT: kandw %k2, %k0, %k0 5803; AVX512BW-NEXT: kshiftrq $63, %k5, %k5 5804; AVX512BW-NEXT: kshiftrw $5, %k1, %k1 5805; AVX512BW-NEXT: korw %k1, %k0, %k0 5806; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 5807; AVX512BW-NEXT: kandw %k1, %k0, %k0 5808; AVX512BW-NEXT: kshiftlw $15, %k5, %k1 5809; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 5810; AVX512BW-NEXT: korw %k6, %k0, %k0 5811; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 5812; AVX512BW-NEXT: kandw %k2, %k0, %k0 5813; AVX512BW-NEXT: kshiftrw $3, %k1, %k4 5814; AVX512BW-NEXT: korw %k4, %k0, %k0 5815; AVX512BW-NEXT: kandw %k3, %k0, %k0 5816; AVX512BW-NEXT: kshiftrw $2, %k1, %k3 5817; AVX512BW-NEXT: korw %k3, %k0, %k0 5818; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 5819; AVX512BW-NEXT: kandw %k2, %k0, %k0 5820; AVX512BW-NEXT: kshiftlw $14, %k5, %k2 5821; AVX512BW-NEXT: korw %k2, %k0, %k0 5822; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 5823; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 5824; AVX512BW-NEXT: korw %k1, %k0, %k1 5825; AVX512BW-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k1} {z} 5826; AVX512BW-NEXT: vmovdqa64 %zmm19, 1216(%rdx) 5827; AVX512BW-NEXT: vmovdqa64 %zmm18, 1152(%rdx) 5828; AVX512BW-NEXT: vmovdqa64 %zmm17, 1088(%rdx) 5829; AVX512BW-NEXT: vmovdqa64 %zmm16, 1024(%rdx) 5830; AVX512BW-NEXT: vmovdqa64 %zmm15, 960(%rdx) 5831; AVX512BW-NEXT: vmovdqa64 %zmm14, 896(%rdx) 5832; AVX512BW-NEXT: vmovdqa64 %zmm13, 832(%rdx) 5833; AVX512BW-NEXT: vmovdqa64 %zmm12, 768(%rdx) 5834; AVX512BW-NEXT: vmovdqa64 %zmm11, 704(%rdx) 5835; AVX512BW-NEXT: vmovdqa64 %zmm10, 640(%rdx) 5836; AVX512BW-NEXT: vmovdqa64 %zmm9, 576(%rdx) 5837; AVX512BW-NEXT: vmovdqa64 %zmm8, 512(%rdx) 5838; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%rdx) 5839; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdx) 5840; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rdx) 5841; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rdx) 5842; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx) 5843; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rdx) 5844; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdx) 5845; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) 5846; AVX512BW-NEXT: vzeroupper 5847; AVX512BW-NEXT: retq 5848 %src.mask = load <64 x i1>, ptr %in.maskvec, align 64 5849 %tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <320 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 
12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63> 5850 %data = call <320 x i32> @llvm.masked.load.v320i32.p0(ptr %in.vec, i32 64, <320 x i1> %tgt.mask, <320 x i32> poison) 5851 store <320 x i32> %data, ptr %out.vec, align 64 5852 ret void 5853} 5854 5855define void @mask_replication_factor6_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { 5856; AVX512F-ONLY-LABEL: mask_replication_factor6_vf2: 5857; AVX512F-ONLY: # %bb.0: 5858; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 5859; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 5860; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0] 5861; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 5862; AVX512F-ONLY-NEXT: vpslld $31, %zmm0, %zmm0 5863; AVX512F-ONLY-NEXT: movw $4095, %ax # imm = 0xFFF 5864; AVX512F-ONLY-NEXT: kmovw %eax, %k1 5865; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} 5866; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} 5867; AVX512F-ONLY-NEXT: vextracti32x4 $2, %zmm0, 32(%rdx) 5868; AVX512F-ONLY-NEXT: vmovdqa %ymm0, (%rdx) 5869; AVX512F-ONLY-NEXT: vzeroupper 5870; AVX512F-ONLY-NEXT: retq 5871; 5872; AVX512DQ-LABEL: mask_replication_factor6_vf2: 5873; AVX512DQ: # %bb.0: 5874; AVX512DQ-NEXT: kmovw (%rdi), %k0 5875; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 5876; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0] 5877; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 5878; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 5879; AVX512DQ-NEXT: movw $4095, %ax # imm = 0xFFF 5880; AVX512DQ-NEXT: kmovw 
%eax, %k1 5881; AVX512DQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 {%k1} 5882; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} 5883; AVX512DQ-NEXT: vextracti32x4 $2, %zmm0, 32(%rdx) 5884; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) 5885; AVX512DQ-NEXT: vzeroupper 5886; AVX512DQ-NEXT: retq 5887; 5888; AVX512BW-LABEL: mask_replication_factor6_vf2: 5889; AVX512BW: # %bb.0: 5890; AVX512BW-NEXT: kmovw (%rdi), %k1 5891; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 5892; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0] 5893; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 5894; AVX512BW-NEXT: vpslld $31, %zmm0, %zmm0 5895; AVX512BW-NEXT: movw $4095, %ax # imm = 0xFFF 5896; AVX512BW-NEXT: kmovd %eax, %k1 5897; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} 5898; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} 5899; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, 32(%rdx) 5900; AVX512BW-NEXT: vmovdqa %ymm0, (%rdx) 5901; AVX512BW-NEXT: vzeroupper 5902; AVX512BW-NEXT: retq 5903 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 5904 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <2 x i32> <i32 0, i32 1> 5905 %tgt.mask = shufflevector <2 x i1> %src.mask, <2 x i1> poison, <12 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 5906 %data = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr %in.vec, i32 64, <12 x i1> %tgt.mask, <12 x i32> poison) 5907 %data.padded = shufflevector <12 x i32> %data, <12 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef, i32 undef, i32 undef> 5908 store <12 x i32> %data, ptr %out.vec, align 64 5909 ret void 5910} 5911 5912define void @mask_replication_factor6_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { 5913; AVX512F-SLOW-LABEL: mask_replication_factor6_vf4: 5914; AVX512F-SLOW: # %bb.0: 5915; AVX512F-SLOW-NEXT: kmovw (%rdi), %k1 5916; AVX512F-SLOW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 5917; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] 5918; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,1] 5919; AVX512F-SLOW-NEXT: vpslld $31, %zmm1, %zmm1 5920; AVX512F-SLOW-NEXT: movw $255, %ax 5921; AVX512F-SLOW-NEXT: kmovw %eax, %k1 5922; AVX512F-SLOW-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1} 5923; AVX512F-SLOW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] 5924; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm0 5925; AVX512F-SLOW-NEXT: vptestmd %zmm0, %zmm0, %k2 5926; AVX512F-SLOW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} 5927; AVX512F-SLOW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z} 5928; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, (%rdx) 5929; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 64(%rdx) 5930; AVX512F-SLOW-NEXT: vzeroupper 5931; AVX512F-SLOW-NEXT: retq 5932; 5933; AVX512F-FAST-LABEL: mask_replication_factor6_vf4: 5934; AVX512F-FAST: # %bb.0: 5935; AVX512F-FAST-NEXT: kmovw (%rdi), %k1 5936; AVX512F-FAST-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 5937; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,3,3,3,3,3,3] 5938; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm1 5939; AVX512F-FAST-NEXT: vpslld $31, %zmm1, %zmm1 5940; AVX512F-FAST-NEXT: movw $255, %ax 5941; AVX512F-FAST-NEXT: kmovw %eax, %k1 5942; AVX512F-FAST-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1} 5943; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] 5944; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0 5945; AVX512F-FAST-NEXT: vptestmd %zmm0, %zmm0, %k2 5946; AVX512F-FAST-NEXT: vmovdqa32 64(%rsi), 
%zmm0 {%k1} {z} 5947; AVX512F-FAST-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z} 5948; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, (%rdx) 5949; AVX512F-FAST-NEXT: vmovdqa %ymm0, 64(%rdx) 5950; AVX512F-FAST-NEXT: vzeroupper 5951; AVX512F-FAST-NEXT: retq 5952; 5953; AVX512DQ-SLOW-LABEL: mask_replication_factor6_vf4: 5954; AVX512DQ-SLOW: # %bb.0: 5955; AVX512DQ-SLOW-NEXT: kmovw (%rdi), %k0 5956; AVX512DQ-SLOW-NEXT: vpmovm2d %k0, %zmm0 5957; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] 5958; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,1] 5959; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 5960; AVX512DQ-SLOW-NEXT: movw $255, %ax 5961; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 5962; AVX512DQ-SLOW-NEXT: vpcmpgtd %zmm1, %zmm2, %k1 {%k1} 5963; AVX512DQ-SLOW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] 5964; AVX512DQ-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm0 5965; AVX512DQ-SLOW-NEXT: vpmovd2m %zmm0, %k2 5966; AVX512DQ-SLOW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} 5967; AVX512DQ-SLOW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z} 5968; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, (%rdx) 5969; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 64(%rdx) 5970; AVX512DQ-SLOW-NEXT: vzeroupper 5971; AVX512DQ-SLOW-NEXT: retq 5972; 5973; AVX512DQ-FAST-LABEL: mask_replication_factor6_vf4: 5974; AVX512DQ-FAST: # %bb.0: 5975; AVX512DQ-FAST-NEXT: kmovw (%rdi), %k0 5976; AVX512DQ-FAST-NEXT: vpmovm2d %k0, %zmm0 5977; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,3,3,3,3,3,3] 5978; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm1 5979; AVX512DQ-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 5980; AVX512DQ-FAST-NEXT: movw $255, %ax 5981; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 5982; AVX512DQ-FAST-NEXT: vpcmpgtd %zmm1, %zmm2, %k1 {%k1} 5983; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] 5984; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0 5985; AVX512DQ-FAST-NEXT: vpmovd2m %zmm0, %k2 5986; AVX512DQ-FAST-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} 5987; AVX512DQ-FAST-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z} 5988; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rdx) 5989; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, 64(%rdx) 5990; AVX512DQ-FAST-NEXT: vzeroupper 5991; AVX512DQ-FAST-NEXT: retq 5992; 5993; AVX512BW-LABEL: mask_replication_factor6_vf4: 5994; AVX512BW: # %bb.0: 5995; AVX512BW-NEXT: kmovd (%rdi), %k0 5996; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 5997; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,3,0,0,0,0,0,0,0,0] 5998; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 5999; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 6000; AVX512BW-NEXT: movl $16777215, %eax # imm = 0xFFFFFF 6001; AVX512BW-NEXT: kmovd %eax, %k1 6002; AVX512BW-NEXT: vpcmpgtw %zmm0, %zmm1, %k1 {%k1} 6003; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 6004; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} 6005; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} 6006; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx) 6007; AVX512BW-NEXT: vmovdqa %ymm0, 64(%rdx) 6008; AVX512BW-NEXT: vzeroupper 6009; AVX512BW-NEXT: retq 6010 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 6011 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6012 %tgt.mask = shufflevector <4 x i1> %src.mask, <4 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> 6013 %data = call <24 x i32> @llvm.masked.load.v24i32.p0(ptr %in.vec, i32 64, <24 x i1> %tgt.mask, <24 x 
i32> poison)
  %data.padded = shufflevector <24 x i32> %data, <24 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  store <24 x i32> %data, ptr %out.vec, align 64
  ret void
}

define void @mask_replication_factor6_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor6_vf8:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1
; AVX512F-ONLY-NEXT: movw $1, %ax
; AVX512F-ONLY-NEXT: kmovw %eax, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k3
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k3} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k2} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 64(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 128(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor6_vf8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovb (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
; AVX512DQ-NEXT: movw $1, %ax
; AVX512DQ-NEXT: kmovw %eax, %k1
; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k3
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k3} {z}
; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k2} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm2, 64(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm1, 128(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor6_vf8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovw (%rdi), %k1
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7]
; AVX512BW-NEXT: vpermd %zmm0, %zmm2, %zmm2
; AVX512BW-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5]
; AVX512BW-NEXT: vpermd %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %tgt.mask = shufflevector <8 x i1> %src.mask, <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  %data = call <48 x i32> @llvm.masked.load.v48i32.p0(ptr %in.vec, i32 64, <48 x i1> %tgt.mask, <48 x i32> poison)
  store <48 x i32> %data, ptr %out.vec, align 64
  ret void
}

define void @mask_replication_factor6_vf16(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor6_vf16:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1
; AVX512F-ONLY-NEXT: movw $1, %ax
; AVX512F-ONLY-NEXT: kmovw %eax, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k4
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k5
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k6
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k6} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k5} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k4} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm4 {%k3} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm5 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 64(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 128(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 256(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 320(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor6_vf16:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
; AVX512DQ-NEXT: movw $1, %ax
; AVX512DQ-NEXT: kmovw %eax, %k1
; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k4
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k5
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k6
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z}
; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k6} {z}
; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k5} {z}
; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k4} {z}
; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm4 {%k3} {z}
; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm5 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm4, 128(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm2, 256(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm1, 320(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor6_vf16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovw (%rdi), %k1
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15]
; AVX512BW-NEXT: vpermd %zmm0, %zmm2, %zmm2
; AVX512BW-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm2 {%k1} {z}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13]
; AVX512BW-NEXT: vpermd %zmm0, %zmm3, %zmm3
; AVX512BW-NEXT: vptestmd %zmm3, %zmm3, %k1
; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm3 {%k1} {z}
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 =
[8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] 6205; AVX512BW-NEXT: vpermd %zmm0, %zmm4, %zmm4 6206; AVX512BW-NEXT: vptestmd %zmm4, %zmm4, %k1 6207; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm4 {%k1} {z} 6208; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] 6209; AVX512BW-NEXT: vpermd %zmm0, %zmm5, %zmm5 6210; AVX512BW-NEXT: vptestmd %zmm5, %zmm5, %k1 6211; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm5 {%k1} {z} 6212; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] 6213; AVX512BW-NEXT: vpermd %zmm0, %zmm6, %zmm0 6214; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 6215; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} 6216; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx) 6217; AVX512BW-NEXT: vmovdqa64 %zmm5, 128(%rdx) 6218; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%rdx) 6219; AVX512BW-NEXT: vmovdqa64 %zmm3, 256(%rdx) 6220; AVX512BW-NEXT: vmovdqa64 %zmm2, 320(%rdx) 6221; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx) 6222; AVX512BW-NEXT: vzeroupper 6223; AVX512BW-NEXT: retq 6224 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 6225 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 6226 %tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <96 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15> 6227 %data = call <96 x i32> @llvm.masked.load.v96i32.p0(ptr %in.vec, i32 64, <96 x i1> %tgt.mask, <96 x i32> poison) 6228 store <96 x i32> %data, ptr %out.vec, align 64 6229 ret void 6230} 6231 6232define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { 6233; AVX512F-ONLY-LABEL: mask_replication_factor6_vf32: 6234; AVX512F-ONLY: # %bb.0: 6235; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 6236; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 6237; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] 6238; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 6239; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 6240; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1 6241; AVX512F-ONLY-NEXT: movw $1, %ax 6242; AVX512F-ONLY-NEXT: kmovw %eax, %k1 6243; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} 6244; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 6245; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1 6246; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 6247; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] 6248; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm4 6249; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm5 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] 6250; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm6 6251; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm7 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] 6252; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm8 6253; AVX512F-ONLY-NEXT: vpmovsxbd 
{{.*#+}} zmm9 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] 6254; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm10 6255; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm11 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] 6256; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm0 6257; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm1, %zmm1 6258; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm2, %zmm2 6259; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm5, %zmm5 6260; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm7, %zmm7 6261; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm9, %zmm9 6262; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm11, %zmm3 6263; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm11 {%k1} {z} 6264; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 6265; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm3 {%k1} {z} 6266; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1 6267; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm9 {%k1} {z} 6268; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 6269; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm7 {%k1} {z} 6270; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 6271; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm5 {%k1} {z} 6272; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 6273; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm2 {%k1} {z} 6274; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 6275; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm1 {%k1} {z} 6276; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 6277; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm0 {%k1} {z} 6278; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1 6279; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm10 {%k1} {z} 6280; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 6281; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z} 6282; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 6283; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm6 {%k1} {z} 6284; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 6285; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z} 6286; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 64(%rdx) 6287; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 128(%rdx) 6288; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 192(%rdx) 6289; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 256(%rdx) 6290; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 320(%rdx) 6291; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 384(%rdx) 6292; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 448(%rdx) 6293; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 512(%rdx) 6294; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 576(%rdx) 6295; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 640(%rdx) 6296; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 704(%rdx) 6297; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, (%rdx) 6298; AVX512F-ONLY-NEXT: vzeroupper 6299; AVX512F-ONLY-NEXT: retq 6300; 6301; AVX512DQ-LABEL: mask_replication_factor6_vf32: 6302; AVX512DQ: # %bb.0: 6303; AVX512DQ-NEXT: kmovw (%rdi), %k0 6304; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 6305; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] 6306; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 6307; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0 6308; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 6309; AVX512DQ-NEXT: movw $1, %ax 6310; AVX512DQ-NEXT: kmovw %eax, %k1 6311; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} 6312; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 6313; AVX512DQ-NEXT: vpmovm2d %k0, %zmm3 6314; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 6315; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] 6316; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm4 6317; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] 6318; AVX512DQ-NEXT: vpermd %zmm0, %zmm5, %zmm6 6319; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] 
6320; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm8 6321; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] 6322; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm10 6323; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] 6324; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm0 6325; AVX512DQ-NEXT: vpermd %zmm3, %zmm1, %zmm1 6326; AVX512DQ-NEXT: vpermd %zmm3, %zmm2, %zmm2 6327; AVX512DQ-NEXT: vpermd %zmm3, %zmm5, %zmm5 6328; AVX512DQ-NEXT: vpermd %zmm3, %zmm7, %zmm7 6329; AVX512DQ-NEXT: vpermd %zmm3, %zmm9, %zmm9 6330; AVX512DQ-NEXT: vpermd %zmm3, %zmm11, %zmm3 6331; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm11 {%k1} {z} 6332; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 6333; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm3 {%k1} {z} 6334; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1 6335; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm9 {%k1} {z} 6336; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 6337; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm7 {%k1} {z} 6338; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 6339; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm5 {%k1} {z} 6340; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 6341; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm2 {%k1} {z} 6342; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 6343; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm1 {%k1} {z} 6344; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 6345; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm0 {%k1} {z} 6346; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1 6347; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm10 {%k1} {z} 6348; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 6349; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z} 6350; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 6351; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm6 {%k1} {z} 6352; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 6353; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z} 6354; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%rdx) 6355; AVX512DQ-NEXT: vmovdqa64 %zmm6, 128(%rdx) 6356; AVX512DQ-NEXT: vmovdqa64 %zmm8, 192(%rdx) 6357; AVX512DQ-NEXT: vmovdqa64 %zmm10, 256(%rdx) 6358; AVX512DQ-NEXT: vmovdqa64 %zmm0, 320(%rdx) 6359; AVX512DQ-NEXT: vmovdqa64 %zmm1, 384(%rdx) 6360; AVX512DQ-NEXT: vmovdqa64 %zmm2, 448(%rdx) 6361; AVX512DQ-NEXT: vmovdqa64 %zmm5, 512(%rdx) 6362; AVX512DQ-NEXT: vmovdqa64 %zmm7, 576(%rdx) 6363; AVX512DQ-NEXT: vmovdqa64 %zmm9, 640(%rdx) 6364; AVX512DQ-NEXT: vmovdqa64 %zmm3, 704(%rdx) 6365; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%rdx) 6366; AVX512DQ-NEXT: vzeroupper 6367; AVX512DQ-NEXT: retq 6368; 6369; AVX512BW-LABEL: mask_replication_factor6_vf32: 6370; AVX512BW: # %bb.0: 6371; AVX512BW-NEXT: kmovd (%rdi), %k5 6372; AVX512BW-NEXT: movw $-3, %ax 6373; AVX512BW-NEXT: kmovd %eax, %k0 6374; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 6375; AVX512BW-NEXT: kmovw (%rdi), %k1 6376; AVX512BW-NEXT: kandw %k0, %k1, %k2 6377; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 6378; AVX512BW-NEXT: kshiftrw $14, %k1, %k3 6379; AVX512BW-NEXT: korw %k3, %k2, %k2 6380; AVX512BW-NEXT: movw $-5, %ax 6381; AVX512BW-NEXT: kmovd %eax, %k0 6382; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 6383; AVX512BW-NEXT: kandw %k0, %k2, %k2 6384; AVX512BW-NEXT: kshiftrw $13, %k1, %k3 6385; AVX512BW-NEXT: korw %k3, %k2, %k2 6386; AVX512BW-NEXT: movw $-9, %ax 6387; AVX512BW-NEXT: kmovd %eax, %k0 6388; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 6389; AVX512BW-NEXT: kandw %k0, %k2, %k2 6390; AVX512BW-NEXT: kshiftrw $12, %k1, %k3 6391; AVX512BW-NEXT: korw %k3, %k2, %k2 6392; AVX512BW-NEXT: movw $-17, %ax 6393; AVX512BW-NEXT: kmovd %eax, %k7 6394; AVX512BW-NEXT: kandw %k7, %k2, %k2 6395; 
AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 6396; AVX512BW-NEXT: kshiftrw $11, %k1, %k3 6397; AVX512BW-NEXT: korw %k3, %k2, %k2 6398; AVX512BW-NEXT: movw $-33, %ax 6399; AVX512BW-NEXT: kmovd %eax, %k0 6400; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 6401; AVX512BW-NEXT: kandw %k0, %k2, %k2 6402; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 6403; AVX512BW-NEXT: korw %k1, %k2, %k1 6404; AVX512BW-NEXT: movw $-65, %ax 6405; AVX512BW-NEXT: kmovd %eax, %k0 6406; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 6407; AVX512BW-NEXT: kandw %k0, %k1, %k2 6408; AVX512BW-NEXT: kshiftrd $1, %k5, %k1 6409; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 6410; AVX512BW-NEXT: kshiftrw $9, %k1, %k3 6411; AVX512BW-NEXT: korw %k3, %k2, %k2 6412; AVX512BW-NEXT: movw $-129, %ax 6413; AVX512BW-NEXT: kmovd %eax, %k0 6414; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 6415; AVX512BW-NEXT: kandw %k0, %k2, %k2 6416; AVX512BW-NEXT: kshiftrw $8, %k1, %k3 6417; AVX512BW-NEXT: korw %k3, %k2, %k2 6418; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF 6419; AVX512BW-NEXT: kmovd %eax, %k0 6420; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 6421; AVX512BW-NEXT: kandw %k0, %k2, %k2 6422; AVX512BW-NEXT: kshiftrw $7, %k1, %k3 6423; AVX512BW-NEXT: korw %k3, %k2, %k2 6424; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF 6425; AVX512BW-NEXT: kmovd %eax, %k0 6426; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 6427; AVX512BW-NEXT: kandw %k0, %k2, %k2 6428; AVX512BW-NEXT: kshiftrw $6, %k1, %k3 6429; AVX512BW-NEXT: korw %k3, %k2, %k2 6430; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF 6431; AVX512BW-NEXT: kmovd %eax, %k6 6432; AVX512BW-NEXT: kandw %k6, %k2, %k2 6433; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 6434; AVX512BW-NEXT: kshiftrw $5, %k1, %k3 6435; AVX512BW-NEXT: korw %k3, %k2, %k2 6436; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF 6437; AVX512BW-NEXT: kmovd %eax, %k0 6438; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 6439; AVX512BW-NEXT: kandw %k0, %k2, %k2 6440; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 6441; AVX512BW-NEXT: korw %k1, %k2, %k1 6442; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF 6443; AVX512BW-NEXT: kmovd %eax, %k0 6444; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 6445; AVX512BW-NEXT: kandw %k0, %k1, %k1 6446; AVX512BW-NEXT: kshiftrd $2, %k5, %k4 6447; AVX512BW-NEXT: kshiftlw $15, %k4, %k3 6448; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 6449; AVX512BW-NEXT: kshiftrw $3, %k3, %k2 6450; AVX512BW-NEXT: korw %k2, %k1, %k1 6451; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF 6452; AVX512BW-NEXT: kmovd %eax, %k2 6453; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 6454; AVX512BW-NEXT: kandw %k2, %k1, %k1 6455; AVX512BW-NEXT: kshiftrw $2, %k3, %k2 6456; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 6457; AVX512BW-NEXT: korw %k2, %k1, %k1 6458; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF 6459; AVX512BW-NEXT: kmovd %eax, %k2 6460; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 6461; AVX512BW-NEXT: kandw %k2, %k1, %k1 6462; AVX512BW-NEXT: kshiftlw $14, %k4, %k2 6463; AVX512BW-NEXT: korw %k2, %k1, %k1 6464; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 6465; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 6466; AVX512BW-NEXT: korw %k3, %k1, %k1 6467; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} 6468; AVX512BW-NEXT: kshiftrd $29, %k5, %k0 6469; AVX512BW-NEXT: kmovd %k0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 6470; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 6471; AVX512BW-NEXT: kandw %k1, %k0, %k1 6472; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 6473; AVX512BW-NEXT: kshiftrw $14, %k0, %k4 6474; AVX512BW-NEXT: korw %k4, %k1, %k1 6475; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 6476; AVX512BW-NEXT: kandw %k2, %k1, %k1 6477; AVX512BW-NEXT: kshiftrw $13, %k0, %k4 6478; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 6479; AVX512BW-NEXT: korw %k4, %k1, %k1 6480; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 6481; AVX512BW-NEXT: kandw %k3, %k1, %k1 6482; AVX512BW-NEXT: kshiftrw $12, %k0, %k4 6483; AVX512BW-NEXT: korw %k4, %k1, %k1 6484; AVX512BW-NEXT: kandw %k7, %k1, %k1 6485; AVX512BW-NEXT: kshiftrd $30, %k5, %k4 6486; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 6487; AVX512BW-NEXT: kshiftrw $11, %k4, %k7 6488; AVX512BW-NEXT: korw %k7, %k1, %k1 6489; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 6490; AVX512BW-NEXT: kandw %k3, %k1, %k1 6491; AVX512BW-NEXT: kshiftrw $10, %k4, %k7 6492; AVX512BW-NEXT: korw %k7, %k1, %k1 6493; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 6494; AVX512BW-NEXT: kandw %k0, %k1, %k1 6495; AVX512BW-NEXT: kshiftrw $9, %k4, %k7 6496; AVX512BW-NEXT: korw %k7, %k1, %k1 6497; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 6498; AVX512BW-NEXT: kandw %k0, %k1, %k1 6499; AVX512BW-NEXT: kshiftrw $8, %k4, %k7 6500; AVX512BW-NEXT: korw %k7, %k1, %k1 6501; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 6502; AVX512BW-NEXT: kandw %k3, %k1, %k1 6503; AVX512BW-NEXT: kshiftrw $7, %k4, %k7 6504; AVX512BW-NEXT: korw %k7, %k1, %k1 6505; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 6506; AVX512BW-NEXT: kandw %k3, %k1, %k1 6507; AVX512BW-NEXT: kshiftrw $6, %k4, %k4 6508; AVX512BW-NEXT: korw %k4, %k1, %k1 6509; AVX512BW-NEXT: kandw %k6, %k1, %k4 6510; AVX512BW-NEXT: kshiftrd $31, %k5, %k7 6511; AVX512BW-NEXT: kshiftlw $15, %k7, %k1 6512; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 6513; AVX512BW-NEXT: korw %k6, %k4, %k4 6514; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 6515; AVX512BW-NEXT: kandw %k0, %k4, %k4 6516; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 6517; AVX512BW-NEXT: korw %k6, %k4, %k4 6518; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 6519; AVX512BW-NEXT: kandw %k0, %k4, %k4 6520; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 6521; AVX512BW-NEXT: korw %k6, %k4, %k4 6522; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 6523; AVX512BW-NEXT: kandw %k6, %k4, %k4 6524; AVX512BW-NEXT: kshiftrw $2, %k1, %k6 6525; AVX512BW-NEXT: korw %k6, %k4, %k4 6526; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 6527; AVX512BW-NEXT: kandw %k6, %k4, %k4 6528; AVX512BW-NEXT: kshiftlw $14, %k7, %k6 6529; AVX512BW-NEXT: korw %k6, %k4, %k4 6530; AVX512BW-NEXT: kshiftlw $1, %k4, %k4 6531; AVX512BW-NEXT: kshiftrw $1, %k4, %k4 6532; AVX512BW-NEXT: korw %k1, %k4, %k1 6533; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm1 {%k1} {z} 6534; AVX512BW-NEXT: kshiftrd $26, %k5, %k4 6535; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 6536; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 6537; AVX512BW-NEXT: kandw %k1, %k4, %k6 6538; AVX512BW-NEXT: kshiftlw $15, %k4, %k1 6539; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 6540; AVX512BW-NEXT: kshiftrw $14, %k1, %k7 6541; AVX512BW-NEXT: korw %k7, 
%k6, %k6 6542; AVX512BW-NEXT: kandw %k2, %k6, %k6 6543; AVX512BW-NEXT: kshiftrd $27, %k5, %k7 6544; AVX512BW-NEXT: kmovq %k5, %k2 6545; AVX512BW-NEXT: kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 6546; AVX512BW-NEXT: kshiftlw $15, %k7, %k7 6547; AVX512BW-NEXT: kshiftrw $13, %k7, %k5 6548; AVX512BW-NEXT: korw %k5, %k6, %k5 6549; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 6550; AVX512BW-NEXT: kandw %k4, %k5, %k5 6551; AVX512BW-NEXT: kshiftrw $12, %k7, %k6 6552; AVX512BW-NEXT: korw %k6, %k5, %k5 6553; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 6554; AVX512BW-NEXT: kandw %k1, %k5, %k5 6555; AVX512BW-NEXT: kshiftrw $11, %k7, %k6 6556; AVX512BW-NEXT: korw %k6, %k5, %k5 6557; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 6558; AVX512BW-NEXT: kandw %k1, %k5, %k5 6559; AVX512BW-NEXT: kshiftrw $10, %k7, %k6 6560; AVX512BW-NEXT: korw %k6, %k5, %k5 6561; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 6562; AVX512BW-NEXT: kandw %k1, %k5, %k5 6563; AVX512BW-NEXT: kshiftrw $9, %k7, %k6 6564; AVX512BW-NEXT: korw %k6, %k5, %k5 6565; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 6566; AVX512BW-NEXT: kandw %k1, %k5, %k5 6567; AVX512BW-NEXT: kshiftrw $8, %k7, %k6 6568; AVX512BW-NEXT: korw %k6, %k5, %k5 6569; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 6570; AVX512BW-NEXT: kandw %k1, %k5, %k5 6571; AVX512BW-NEXT: kshiftrd $28, %k2, %k6 6572; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 6573; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 6574; AVX512BW-NEXT: korw %k7, %k5, %k5 6575; AVX512BW-NEXT: kandw %k3, %k5, %k5 6576; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 6577; AVX512BW-NEXT: korw %k7, %k5, %k5 6578; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 6579; AVX512BW-NEXT: kandw %k1, %k5, %k5 6580; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 6581; AVX512BW-NEXT: korw %k7, %k5, %k5 6582; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 6583; AVX512BW-NEXT: kandw %k1, %k5, %k5 6584; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 6585; AVX512BW-NEXT: korw %k7, %k5, %k5 6586; AVX512BW-NEXT: kandw %k0, %k5, %k5 6587; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 6588; AVX512BW-NEXT: korw %k7, %k5, %k5 6589; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 6590; AVX512BW-NEXT: kandw %k1, %k5, %k5 6591; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 6592; AVX512BW-NEXT: korw %k6, %k5, %k5 6593; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 6594; AVX512BW-NEXT: kandw %k7, %k5, %k5 6595; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload 6596; AVX512BW-NEXT: kshiftlw $14, %k2, %k3 6597; AVX512BW-NEXT: korw %k3, %k5, %k3 6598; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 6599; AVX512BW-NEXT: kshiftrw $1, %k3, %k3 6600; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 6601; AVX512BW-NEXT: korw %k2, %k3, %k2 6602; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm2 {%k2} {z} 6603; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload 6604; AVX512BW-NEXT: kshiftrd $24, %k0, %k2 6605; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 6606; AVX512BW-NEXT: kandw %k3, %k2, %k3 6607; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 6608; AVX512BW-NEXT: kshiftrw $14, %k2, %k5 6609; AVX512BW-NEXT: korw %k5, %k3, %k3 6610; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 6611; AVX512BW-NEXT: kandw %k5, %k3, %k3 6612; AVX512BW-NEXT: kshiftrw $13, %k2, %k5 6613; AVX512BW-NEXT: korw %k5, %k3, %k3 6614; AVX512BW-NEXT: kandw 
%k4, %k3, %k3 6615; AVX512BW-NEXT: kshiftrw $12, %k2, %k5 6616; AVX512BW-NEXT: korw %k5, %k3, %k3 6617; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 6618; AVX512BW-NEXT: kandw %k6, %k3, %k3 6619; AVX512BW-NEXT: kshiftrw $11, %k2, %k5 6620; AVX512BW-NEXT: korw %k5, %k3, %k3 6621; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 6622; AVX512BW-NEXT: kandw %k4, %k3, %k3 6623; AVX512BW-NEXT: kshiftrw $10, %k2, %k2 6624; AVX512BW-NEXT: korw %k2, %k3, %k2 6625; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 6626; AVX512BW-NEXT: kandw %k3, %k2, %k2 6627; AVX512BW-NEXT: kshiftrd $25, %k0, %k3 6628; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 6629; AVX512BW-NEXT: kshiftrw $9, %k3, %k5 6630; AVX512BW-NEXT: korw %k5, %k2, %k2 6631; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 6632; AVX512BW-NEXT: kandw %k4, %k2, %k2 6633; AVX512BW-NEXT: kshiftrw $8, %k3, %k5 6634; AVX512BW-NEXT: korw %k5, %k2, %k2 6635; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 6636; AVX512BW-NEXT: kandw %k4, %k2, %k2 6637; AVX512BW-NEXT: kshiftrw $7, %k3, %k5 6638; AVX512BW-NEXT: korw %k5, %k2, %k2 6639; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 6640; AVX512BW-NEXT: kandw %k4, %k2, %k2 6641; AVX512BW-NEXT: kshiftrw $6, %k3, %k5 6642; AVX512BW-NEXT: korw %k5, %k2, %k2 6643; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 6644; AVX512BW-NEXT: kandw %k4, %k2, %k2 6645; AVX512BW-NEXT: kshiftrw $5, %k3, %k5 6646; AVX512BW-NEXT: korw %k5, %k2, %k2 6647; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 6648; AVX512BW-NEXT: kandw %k4, %k2, %k2 6649; AVX512BW-NEXT: kshiftrw $4, %k3, %k3 6650; AVX512BW-NEXT: korw %k3, %k2, %k2 6651; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 6652; AVX512BW-NEXT: kandw %k3, %k2, %k2 6653; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 6654; AVX512BW-NEXT: kshiftrw $3, %k4, %k3 6655; AVX512BW-NEXT: korw %k3, %k2, %k2 6656; AVX512BW-NEXT: kandw %k1, %k2, %k2 6657; AVX512BW-NEXT: kshiftrw $2, %k4, %k3 6658; AVX512BW-NEXT: korw %k3, %k2, %k2 6659; AVX512BW-NEXT: kandw %k7, %k2, %k2 6660; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload 6661; AVX512BW-NEXT: kshiftlw $14, %k0, %k3 6662; AVX512BW-NEXT: korw %k3, %k2, %k2 6663; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 6664; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 6665; AVX512BW-NEXT: korw %k4, %k2, %k1 6666; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm3 {%k1} {z} 6667; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 4-byte Reload 6668; AVX512BW-NEXT: kshiftrd $21, %k7, %k2 6669; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 6670; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 6671; AVX512BW-NEXT: kandw %k0, %k2, %k3 6672; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 6673; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 6674; AVX512BW-NEXT: kshiftrw $14, %k2, %k4 6675; AVX512BW-NEXT: korw %k4, %k3, %k3 6676; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 6677; AVX512BW-NEXT: kandw %k0, %k3, %k3 6678; AVX512BW-NEXT: kshiftrw $13, %k2, %k4 6679; AVX512BW-NEXT: korw %k4, %k3, %k3 6680; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 6681; AVX512BW-NEXT: kandw %k0, %k3, %k3 6682; AVX512BW-NEXT: kshiftrw $12, %k2, %k4 6683; AVX512BW-NEXT: korw %k4, %k3, %k3 6684; AVX512BW-NEXT: kandw %k6, %k3, %k3 6685; AVX512BW-NEXT: kshiftrd $22, %k7, %k4 6686; AVX512BW-NEXT: kshiftlw 
$15, %k4, %k4 6687; AVX512BW-NEXT: kshiftrw $11, %k4, %k5 6688; AVX512BW-NEXT: korw %k5, %k3, %k3 6689; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 6690; AVX512BW-NEXT: kandw %k0, %k3, %k3 6691; AVX512BW-NEXT: kshiftrw $10, %k4, %k5 6692; AVX512BW-NEXT: korw %k5, %k3, %k3 6693; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 6694; AVX512BW-NEXT: kandw %k0, %k3, %k3 6695; AVX512BW-NEXT: kshiftrw $9, %k4, %k5 6696; AVX512BW-NEXT: korw %k5, %k3, %k3 6697; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 6698; AVX512BW-NEXT: kandw %k1, %k3, %k3 6699; AVX512BW-NEXT: kshiftrw $8, %k4, %k5 6700; AVX512BW-NEXT: korw %k5, %k3, %k3 6701; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 6702; AVX512BW-NEXT: kandw %k2, %k3, %k3 6703; AVX512BW-NEXT: kshiftrw $7, %k4, %k5 6704; AVX512BW-NEXT: korw %k5, %k3, %k3 6705; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 6706; AVX512BW-NEXT: kandw %k5, %k3, %k3 6707; AVX512BW-NEXT: kshiftrw $6, %k4, %k4 6708; AVX512BW-NEXT: korw %k4, %k3, %k3 6709; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 6710; AVX512BW-NEXT: kandw %k4, %k3, %k4 6711; AVX512BW-NEXT: kshiftrd $23, %k7, %k5 6712; AVX512BW-NEXT: kshiftlw $15, %k5, %k3 6713; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 6714; AVX512BW-NEXT: korw %k6, %k4, %k4 6715; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 6716; AVX512BW-NEXT: kandw %k6, %k4, %k4 6717; AVX512BW-NEXT: kshiftrw $4, %k3, %k6 6718; AVX512BW-NEXT: korw %k6, %k4, %k4 6719; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 6720; AVX512BW-NEXT: kandw %k6, %k4, %k4 6721; AVX512BW-NEXT: kshiftrw $3, %k3, %k6 6722; AVX512BW-NEXT: korw %k6, %k4, %k4 6723; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 6724; AVX512BW-NEXT: kandw %k6, %k4, %k4 6725; AVX512BW-NEXT: kshiftrw $2, %k3, %k6 6726; AVX512BW-NEXT: korw %k6, %k4, %k4 6727; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 6728; AVX512BW-NEXT: kandw %k6, %k4, %k4 6729; AVX512BW-NEXT: kshiftlw $14, %k5, %k5 6730; AVX512BW-NEXT: korw %k5, %k4, %k4 6731; AVX512BW-NEXT: kshiftlw $1, %k4, %k4 6732; AVX512BW-NEXT: kshiftrw $1, %k4, %k4 6733; AVX512BW-NEXT: korw %k3, %k4, %k3 6734; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm4 {%k3} {z} 6735; AVX512BW-NEXT: kmovq %k7, %k4 6736; AVX512BW-NEXT: kshiftrd $18, %k7, %k6 6737; AVX512BW-NEXT: kmovd %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 6738; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 6739; AVX512BW-NEXT: kandw %k3, %k6, %k5 6740; AVX512BW-NEXT: kshiftlw $15, %k6, %k3 6741; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 6742; AVX512BW-NEXT: kshiftrw $14, %k3, %k6 6743; AVX512BW-NEXT: korw %k6, %k5, %k5 6744; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 6745; AVX512BW-NEXT: kandw %k3, %k5, %k5 6746; AVX512BW-NEXT: kshiftrd $19, %k7, %k6 6747; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 6748; AVX512BW-NEXT: kshiftrw $13, %k6, %k7 6749; AVX512BW-NEXT: korw %k7, %k5, %k5 6750; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 6751; AVX512BW-NEXT: kandw %k3, %k5, %k5 6752; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 6753; AVX512BW-NEXT: korw %k7, %k5, %k5 6754; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 6755; AVX512BW-NEXT: kandw %k3, %k5, %k5 6756; AVX512BW-NEXT: kshiftrw $11, %k6, %k7 6757; AVX512BW-NEXT: korw %k7, %k5, %k5 6758; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 
2-byte Reload 6759; AVX512BW-NEXT: kandw %k3, %k5, %k5 6760; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 6761; AVX512BW-NEXT: korw %k7, %k5, %k5 6762; AVX512BW-NEXT: kandw %k0, %k5, %k5 6763; AVX512BW-NEXT: kshiftrw $9, %k6, %k7 6764; AVX512BW-NEXT: korw %k7, %k5, %k5 6765; AVX512BW-NEXT: kandw %k1, %k5, %k5 6766; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 6767; AVX512BW-NEXT: korw %k6, %k5, %k5 6768; AVX512BW-NEXT: kandw %k2, %k5, %k5 6769; AVX512BW-NEXT: kshiftrd $20, %k4, %k6 6770; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 6771; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 6772; AVX512BW-NEXT: korw %k7, %k5, %k5 6773; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 6774; AVX512BW-NEXT: kandw %k0, %k5, %k5 6775; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 6776; AVX512BW-NEXT: korw %k7, %k5, %k5 6777; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 6778; AVX512BW-NEXT: kandw %k0, %k5, %k5 6779; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 6780; AVX512BW-NEXT: korw %k7, %k5, %k5 6781; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 6782; AVX512BW-NEXT: kandw %k3, %k5, %k5 6783; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 6784; AVX512BW-NEXT: korw %k7, %k5, %k5 6785; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 6786; AVX512BW-NEXT: kandw %k1, %k5, %k5 6787; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 6788; AVX512BW-NEXT: korw %k7, %k5, %k5 6789; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 6790; AVX512BW-NEXT: kandw %k1, %k5, %k5 6791; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 6792; AVX512BW-NEXT: korw %k6, %k5, %k5 6793; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 6794; AVX512BW-NEXT: kandw %k1, %k5, %k5 6795; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload 6796; AVX512BW-NEXT: kshiftlw $14, %k1, %k2 6797; AVX512BW-NEXT: korw %k2, %k5, %k2 6798; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 6799; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 6800; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 6801; AVX512BW-NEXT: korw %k0, %k2, %k1 6802; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm5 {%k1} {z} 6803; AVX512BW-NEXT: kmovq %k4, %k0 6804; AVX512BW-NEXT: kshiftrd $16, %k4, %k1 6805; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 6806; AVX512BW-NEXT: kandw %k2, %k1, %k2 6807; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 6808; AVX512BW-NEXT: kshiftrw $14, %k1, %k5 6809; AVX512BW-NEXT: korw %k5, %k2, %k2 6810; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 6811; AVX512BW-NEXT: kandw %k6, %k2, %k2 6812; AVX512BW-NEXT: kshiftrw $13, %k1, %k5 6813; AVX512BW-NEXT: korw %k5, %k2, %k2 6814; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 6815; AVX512BW-NEXT: kandw %k7, %k2, %k2 6816; AVX512BW-NEXT: kshiftrw $12, %k1, %k5 6817; AVX512BW-NEXT: korw %k5, %k2, %k2 6818; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 6819; AVX512BW-NEXT: kandw %k5, %k2, %k2 6820; AVX512BW-NEXT: kshiftrw $11, %k1, %k5 6821; AVX512BW-NEXT: korw %k5, %k2, %k2 6822; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 6823; AVX512BW-NEXT: kandw %k4, %k2, %k2 6824; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 6825; AVX512BW-NEXT: korw %k1, %k2, %k1 6826; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 6827; AVX512BW-NEXT: kandw %k4, %k1, %k1 6828; AVX512BW-NEXT: kshiftrd $17, %k0, %k2 6829; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 6830; AVX512BW-NEXT: kshiftrw $9, %k2, %k5 6831; AVX512BW-NEXT: korw %k5, %k1, %k1 6832; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 6833; AVX512BW-NEXT: kandw %k5, %k1, %k1 6834; AVX512BW-NEXT: kshiftrw $8, %k2, %k5 6835; AVX512BW-NEXT: korw %k5, %k1, %k1 6836; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 6837; AVX512BW-NEXT: kandw %k5, %k1, %k1 6838; AVX512BW-NEXT: kshiftrw $7, %k2, %k5 6839; AVX512BW-NEXT: korw %k5, %k1, %k1 6840; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 6841; AVX512BW-NEXT: kandw %k5, %k1, %k1 6842; AVX512BW-NEXT: kshiftrw $6, %k2, %k5 6843; AVX512BW-NEXT: korw %k5, %k1, %k1 6844; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 6845; AVX512BW-NEXT: kandw %k0, %k1, %k1 6846; AVX512BW-NEXT: kshiftrw $5, %k2, %k5 6847; AVX512BW-NEXT: korw %k5, %k1, %k1 6848; AVX512BW-NEXT: kandw %k3, %k1, %k1 6849; AVX512BW-NEXT: kshiftrw $4, %k2, %k2 6850; AVX512BW-NEXT: korw %k2, %k1, %k1 6851; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 6852; AVX512BW-NEXT: kandw %k0, %k1, %k1 6853; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 6854; AVX512BW-NEXT: kshiftrw $3, %k3, %k2 6855; AVX512BW-NEXT: korw %k2, %k1, %k1 6856; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 6857; AVX512BW-NEXT: kandw %k0, %k1, %k1 6858; AVX512BW-NEXT: kshiftrw $2, %k3, %k2 6859; AVX512BW-NEXT: korw %k2, %k1, %k1 6860; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 6861; AVX512BW-NEXT: kandw %k0, %k1, %k1 6862; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload 6863; AVX512BW-NEXT: kshiftlw $14, %k0, %k2 6864; AVX512BW-NEXT: korw %k2, %k1, %k1 6865; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 6866; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 6867; AVX512BW-NEXT: korw %k3, %k1, %k1 6868; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} 6869; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload 6870; AVX512BW-NEXT: kshiftrd $13, %k0, %k3 6871; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 6872; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 6873; AVX512BW-NEXT: kandw %k1, %k3, %k2 6874; AVX512BW-NEXT: kshiftlw $15, %k3, %k5 6875; AVX512BW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 6876; AVX512BW-NEXT: kshiftrw $14, %k5, %k3 6877; AVX512BW-NEXT: korw %k3, %k2, %k2 6878; AVX512BW-NEXT: kandw %k6, %k2, %k2 6879; AVX512BW-NEXT: kshiftrw $13, %k5, %k3 6880; AVX512BW-NEXT: korw %k3, %k2, %k2 6881; AVX512BW-NEXT: kandw %k7, %k2, %k2 6882; AVX512BW-NEXT: kshiftrw $12, %k5, %k3 6883; AVX512BW-NEXT: korw %k3, %k2, %k2 6884; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 6885; AVX512BW-NEXT: kandw %k3, %k2, %k2 6886; AVX512BW-NEXT: kshiftrd $14, %k0, %k3 6887; AVX512BW-NEXT: kmovq %k0, %k7 6888; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 6889; AVX512BW-NEXT: kshiftrw $11, %k3, %k5 6890; AVX512BW-NEXT: korw %k5, %k2, %k2 6891; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 6892; AVX512BW-NEXT: kandw %k0, %k2, %k2 6893; AVX512BW-NEXT: kshiftrw $10, %k3, %k5 6894; AVX512BW-NEXT: korw %k5, %k2, %k2 6895; AVX512BW-NEXT: kandw %k4, %k2, %k2 6896; AVX512BW-NEXT: kshiftrw $9, %k3, %k5 6897; AVX512BW-NEXT: korw %k5, %k2, %k2 6898; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 6899; AVX512BW-NEXT: kandw %k0, %k2, %k2 6900; AVX512BW-NEXT: kshiftrw $8, %k3, %k5 6901; AVX512BW-NEXT: korw %k5, %k2, %k2 6902; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 6903; AVX512BW-NEXT: kandw %k4, %k2, %k2 6904; AVX512BW-NEXT: kshiftrw $7, %k3, 
%k5 6905; AVX512BW-NEXT: korw %k5, %k2, %k2 6906; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 6907; AVX512BW-NEXT: kandw %k0, %k2, %k2 6908; AVX512BW-NEXT: kshiftrw $6, %k3, %k3 6909; AVX512BW-NEXT: korw %k3, %k2, %k2 6910; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 6911; AVX512BW-NEXT: kandw %k0, %k2, %k3 6912; AVX512BW-NEXT: kshiftrd $15, %k7, %k5 6913; AVX512BW-NEXT: kshiftlw $15, %k5, %k2 6914; AVX512BW-NEXT: kshiftrw $5, %k2, %k6 6915; AVX512BW-NEXT: korw %k6, %k3, %k3 6916; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 6917; AVX512BW-NEXT: kandw %k0, %k3, %k3 6918; AVX512BW-NEXT: kshiftrw $4, %k2, %k6 6919; AVX512BW-NEXT: korw %k6, %k3, %k3 6920; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 6921; AVX512BW-NEXT: kandw %k0, %k3, %k3 6922; AVX512BW-NEXT: kshiftrw $3, %k2, %k6 6923; AVX512BW-NEXT: korw %k6, %k3, %k3 6924; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 6925; AVX512BW-NEXT: kandw %k0, %k3, %k3 6926; AVX512BW-NEXT: kshiftrw $2, %k2, %k6 6927; AVX512BW-NEXT: korw %k6, %k3, %k3 6928; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 6929; AVX512BW-NEXT: kandw %k0, %k3, %k3 6930; AVX512BW-NEXT: kshiftlw $14, %k5, %k5 6931; AVX512BW-NEXT: korw %k5, %k3, %k3 6932; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 6933; AVX512BW-NEXT: kshiftrw $1, %k3, %k3 6934; AVX512BW-NEXT: korw %k2, %k3, %k2 6935; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm7 {%k2} {z} 6936; AVX512BW-NEXT: kmovq %k7, %k2 6937; AVX512BW-NEXT: kshiftrd $10, %k7, %k0 6938; AVX512BW-NEXT: kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 6939; AVX512BW-NEXT: kandw %k1, %k0, %k5 6940; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 6941; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 6942; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 6943; AVX512BW-NEXT: korw %k6, %k5, %k5 6944; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 6945; AVX512BW-NEXT: kandw %k0, %k5, %k5 6946; AVX512BW-NEXT: kshiftrd $11, %k7, %k6 6947; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 6948; AVX512BW-NEXT: kshiftrw $13, %k6, %k7 6949; AVX512BW-NEXT: korw %k7, %k5, %k5 6950; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 6951; AVX512BW-NEXT: kandw %k0, %k5, %k5 6952; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 6953; AVX512BW-NEXT: korw %k7, %k5, %k5 6954; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 6955; AVX512BW-NEXT: kandw %k0, %k5, %k5 6956; AVX512BW-NEXT: kshiftrw $11, %k6, %k7 6957; AVX512BW-NEXT: korw %k7, %k5, %k5 6958; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 6959; AVX512BW-NEXT: kandw %k1, %k5, %k5 6960; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 6961; AVX512BW-NEXT: korw %k7, %k5, %k5 6962; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 6963; AVX512BW-NEXT: kandw %k1, %k5, %k5 6964; AVX512BW-NEXT: kshiftrw $9, %k6, %k7 6965; AVX512BW-NEXT: korw %k7, %k5, %k5 6966; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 6967; AVX512BW-NEXT: kandw %k1, %k5, %k5 6968; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 6969; AVX512BW-NEXT: korw %k6, %k5, %k5 6970; AVX512BW-NEXT: kandw %k4, %k5, %k5 6971; AVX512BW-NEXT: kshiftrd $12, %k2, %k6 6972; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 6973; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 6974; AVX512BW-NEXT: korw %k7, %k5, %k5 6975; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 6976; AVX512BW-NEXT: kandw %k1, %k5, %k5 6977; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 6978; 
AVX512BW-NEXT: korw %k7, %k5, %k5 6979; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 6980; AVX512BW-NEXT: kandw %k1, %k5, %k5 6981; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 6982; AVX512BW-NEXT: korw %k7, %k5, %k5 6983; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 6984; AVX512BW-NEXT: kandw %k1, %k5, %k5 6985; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 6986; AVX512BW-NEXT: korw %k7, %k5, %k5 6987; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 6988; AVX512BW-NEXT: kandw %k1, %k5, %k5 6989; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 6990; AVX512BW-NEXT: korw %k7, %k5, %k5 6991; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 6992; AVX512BW-NEXT: kandw %k7, %k5, %k5 6993; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 6994; AVX512BW-NEXT: korw %k6, %k5, %k5 6995; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 6996; AVX512BW-NEXT: kandw %k3, %k5, %k5 6997; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload 6998; AVX512BW-NEXT: kshiftlw $14, %k1, %k4 6999; AVX512BW-NEXT: korw %k4, %k5, %k4 7000; AVX512BW-NEXT: kshiftlw $1, %k4, %k4 7001; AVX512BW-NEXT: kshiftrw $1, %k4, %k4 7002; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 7003; AVX512BW-NEXT: korw %k1, %k4, %k1 7004; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm8 {%k1} {z} 7005; AVX512BW-NEXT: kshiftrd $8, %k2, %k1 7006; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 7007; AVX512BW-NEXT: kandw %k6, %k1, %k4 7008; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 7009; AVX512BW-NEXT: kshiftrw $14, %k1, %k5 7010; AVX512BW-NEXT: korw %k5, %k4, %k4 7011; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 7012; AVX512BW-NEXT: kandw %k5, %k4, %k4 7013; AVX512BW-NEXT: kshiftrw $13, %k1, %k5 7014; AVX512BW-NEXT: korw %k5, %k4, %k4 7015; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 7016; AVX512BW-NEXT: kandw %k5, %k4, %k4 7017; AVX512BW-NEXT: kshiftrw $12, %k1, %k5 7018; AVX512BW-NEXT: korw %k5, %k4, %k4 7019; AVX512BW-NEXT: kandw %k0, %k4, %k4 7020; AVX512BW-NEXT: kshiftrw $11, %k1, %k5 7021; AVX512BW-NEXT: korw %k5, %k4, %k4 7022; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 7023; AVX512BW-NEXT: kandw %k0, %k4, %k4 7024; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 7025; AVX512BW-NEXT: korw %k1, %k4, %k1 7026; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 7027; AVX512BW-NEXT: kandw %k0, %k1, %k1 7028; AVX512BW-NEXT: kshiftrd $9, %k2, %k4 7029; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 7030; AVX512BW-NEXT: kshiftrw $9, %k4, %k5 7031; AVX512BW-NEXT: korw %k5, %k1, %k1 7032; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 7033; AVX512BW-NEXT: kandw %k0, %k1, %k1 7034; AVX512BW-NEXT: kshiftrw $8, %k4, %k5 7035; AVX512BW-NEXT: korw %k5, %k1, %k1 7036; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 7037; AVX512BW-NEXT: kandw %k0, %k1, %k1 7038; AVX512BW-NEXT: kshiftrw $7, %k4, %k5 7039; AVX512BW-NEXT: korw %k5, %k1, %k1 7040; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 7041; AVX512BW-NEXT: kandw %k5, %k1, %k1 7042; AVX512BW-NEXT: kshiftrw $6, %k4, %k5 7043; AVX512BW-NEXT: korw %k5, %k1, %k1 7044; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 7045; AVX512BW-NEXT: kandw %k5, %k1, %k1 7046; AVX512BW-NEXT: kshiftrw $5, %k4, %k5 7047; AVX512BW-NEXT: korw %k5, %k1, %k1 7048; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 7049; AVX512BW-NEXT: kandw %k5, %k1, %k1 7050; 
AVX512BW-NEXT: kshiftrw $4, %k4, %k4 7051; AVX512BW-NEXT: korw %k4, %k1, %k1 7052; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 7053; AVX512BW-NEXT: kandw %k2, %k1, %k1 7054; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 7055; AVX512BW-NEXT: kshiftrw $3, %k5, %k4 7056; AVX512BW-NEXT: korw %k4, %k1, %k1 7057; AVX512BW-NEXT: kandw %k7, %k1, %k1 7058; AVX512BW-NEXT: kshiftrw $2, %k5, %k4 7059; AVX512BW-NEXT: korw %k4, %k1, %k1 7060; AVX512BW-NEXT: kandw %k3, %k1, %k1 7061; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload 7062; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 7063; AVX512BW-NEXT: korw %k2, %k1, %k1 7064; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 7065; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 7066; AVX512BW-NEXT: korw %k5, %k1, %k1 7067; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm9 {%k1} {z} 7068; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload 7069; AVX512BW-NEXT: kshiftrd $5, %k1, %k2 7070; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 7071; AVX512BW-NEXT: kandw %k6, %k2, %k3 7072; AVX512BW-NEXT: kshiftlw $15, %k2, %k7 7073; AVX512BW-NEXT: kshiftrw $14, %k7, %k4 7074; AVX512BW-NEXT: korw %k4, %k3, %k3 7075; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 7076; AVX512BW-NEXT: kandw %k2, %k3, %k3 7077; AVX512BW-NEXT: kshiftrw $13, %k7, %k4 7078; AVX512BW-NEXT: korw %k4, %k3, %k3 7079; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 7080; AVX512BW-NEXT: kandw %k2, %k3, %k3 7081; AVX512BW-NEXT: kshiftrw $12, %k7, %k4 7082; AVX512BW-NEXT: korw %k4, %k3, %k3 7083; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 7084; AVX512BW-NEXT: kandw %k2, %k3, %k3 7085; AVX512BW-NEXT: kshiftrd $6, %k1, %k4 7086; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 7087; AVX512BW-NEXT: kshiftrw $11, %k4, %k5 7088; AVX512BW-NEXT: korw %k5, %k3, %k3 7089; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 7090; AVX512BW-NEXT: kandw %k2, %k3, %k3 7091; AVX512BW-NEXT: kshiftrw $10, %k4, %k5 7092; AVX512BW-NEXT: korw %k5, %k3, %k3 7093; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 7094; AVX512BW-NEXT: kandw %k2, %k3, %k3 7095; AVX512BW-NEXT: kshiftrw $9, %k4, %k5 7096; AVX512BW-NEXT: korw %k5, %k3, %k3 7097; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 7098; AVX512BW-NEXT: kandw %k2, %k3, %k3 7099; AVX512BW-NEXT: kshiftrw $8, %k4, %k5 7100; AVX512BW-NEXT: korw %k5, %k3, %k3 7101; AVX512BW-NEXT: kandw %k0, %k3, %k3 7102; AVX512BW-NEXT: kshiftrw $7, %k4, %k5 7103; AVX512BW-NEXT: korw %k5, %k3, %k3 7104; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 7105; AVX512BW-NEXT: kandw %k5, %k3, %k3 7106; AVX512BW-NEXT: kshiftrw $6, %k4, %k4 7107; AVX512BW-NEXT: korw %k4, %k3, %k3 7108; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 7109; AVX512BW-NEXT: kandw %k4, %k3, %k4 7110; AVX512BW-NEXT: kshiftrd $7, %k1, %k5 7111; AVX512BW-NEXT: kshiftlw $15, %k5, %k3 7112; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 7113; AVX512BW-NEXT: korw %k6, %k4, %k4 7114; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 7115; AVX512BW-NEXT: kandw %k6, %k4, %k4 7116; AVX512BW-NEXT: kshiftrw $4, %k3, %k6 7117; AVX512BW-NEXT: korw %k6, %k4, %k4 7118; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 7119; AVX512BW-NEXT: kandw %k6, %k4, %k4 7120; AVX512BW-NEXT: kshiftrw $3, %k3, %k6 7121; AVX512BW-NEXT: korw %k6, %k4, %k4 7122; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 
7123; AVX512BW-NEXT: kandw %k6, %k4, %k4 7124; AVX512BW-NEXT: kshiftrw $2, %k3, %k6 7125; AVX512BW-NEXT: korw %k6, %k4, %k4 7126; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 7127; AVX512BW-NEXT: kandw %k6, %k4, %k4 7128; AVX512BW-NEXT: kshiftlw $14, %k5, %k5 7129; AVX512BW-NEXT: korw %k5, %k4, %k4 7130; AVX512BW-NEXT: kshiftlw $1, %k4, %k4 7131; AVX512BW-NEXT: kshiftrw $1, %k4, %k4 7132; AVX512BW-NEXT: korw %k3, %k4, %k3 7133; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm10 {%k3} {z} 7134; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload 7135; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 7136; AVX512BW-NEXT: kandw %k4, %k3, %k3 7137; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 7138; AVX512BW-NEXT: kshiftrw $14, %k4, %k4 7139; AVX512BW-NEXT: korw %k4, %k3, %k3 7140; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 7141; AVX512BW-NEXT: kandw %k4, %k3, %k3 7142; AVX512BW-NEXT: kshiftrd $3, %k1, %k4 7143; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 7144; AVX512BW-NEXT: kshiftrw $13, %k4, %k5 7145; AVX512BW-NEXT: korw %k5, %k3, %k3 7146; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 7147; AVX512BW-NEXT: kandw %k5, %k3, %k3 7148; AVX512BW-NEXT: kshiftrw $12, %k4, %k5 7149; AVX512BW-NEXT: korw %k5, %k3, %k3 7150; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 7151; AVX512BW-NEXT: kandw %k5, %k3, %k3 7152; AVX512BW-NEXT: kshiftrw $11, %k4, %k5 7153; AVX512BW-NEXT: korw %k5, %k3, %k3 7154; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 7155; AVX512BW-NEXT: kandw %k5, %k3, %k3 7156; AVX512BW-NEXT: kshiftrw $10, %k4, %k5 7157; AVX512BW-NEXT: korw %k5, %k3, %k3 7158; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 7159; AVX512BW-NEXT: kandw %k5, %k3, %k3 7160; AVX512BW-NEXT: kshiftrw $9, %k4, %k5 7161; AVX512BW-NEXT: korw %k5, %k3, %k3 7162; AVX512BW-NEXT: kandw %k2, %k3, %k3 7163; AVX512BW-NEXT: kshiftrw $8, %k4, %k4 7164; AVX512BW-NEXT: korw %k4, %k3, %k3 7165; AVX512BW-NEXT: kandw %k0, %k3, %k3 7166; AVX512BW-NEXT: kshiftrd $4, %k1, %k0 7167; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 7168; AVX512BW-NEXT: kshiftrw $7, %k0, %k4 7169; AVX512BW-NEXT: korw %k4, %k3, %k3 7170; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 7171; AVX512BW-NEXT: kandw %k1, %k3, %k3 7172; AVX512BW-NEXT: kshiftrw $6, %k0, %k4 7173; AVX512BW-NEXT: korw %k4, %k3, %k3 7174; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 7175; AVX512BW-NEXT: kandw %k1, %k3, %k3 7176; AVX512BW-NEXT: kshiftrw $5, %k0, %k4 7177; AVX512BW-NEXT: korw %k4, %k3, %k3 7178; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 7179; AVX512BW-NEXT: kandw %k1, %k3, %k3 7180; AVX512BW-NEXT: kshiftrw $4, %k0, %k4 7181; AVX512BW-NEXT: korw %k4, %k3, %k3 7182; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 7183; AVX512BW-NEXT: kandw %k1, %k3, %k3 7184; AVX512BW-NEXT: kshiftrw $3, %k0, %k4 7185; AVX512BW-NEXT: korw %k4, %k3, %k3 7186; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 7187; AVX512BW-NEXT: kandw %k1, %k3, %k3 7188; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 7189; AVX512BW-NEXT: korw %k0, %k3, %k0 7190; AVX512BW-NEXT: kandw %k6, %k0, %k0 7191; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload 7192; AVX512BW-NEXT: kshiftlw $14, %k1, %k2 7193; AVX512BW-NEXT: korw %k2, %k0, %k0 7194; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 7195; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 7196; AVX512BW-NEXT: 
korw %k7, %k0, %k1 7197; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm11 {%k1} {z} 7198; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%rdx) 7199; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%rdx) 7200; AVX512BW-NEXT: vmovdqa64 %zmm9, 192(%rdx) 7201; AVX512BW-NEXT: vmovdqa64 %zmm8, 256(%rdx) 7202; AVX512BW-NEXT: vmovdqa64 %zmm7, 320(%rdx) 7203; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdx) 7204; AVX512BW-NEXT: vmovdqa64 %zmm5, 448(%rdx) 7205; AVX512BW-NEXT: vmovdqa64 %zmm4, 512(%rdx) 7206; AVX512BW-NEXT: vmovdqa64 %zmm3, 576(%rdx) 7207; AVX512BW-NEXT: vmovdqa64 %zmm2, 640(%rdx) 7208; AVX512BW-NEXT: vmovdqa64 %zmm1, 704(%rdx) 7209; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) 7210; AVX512BW-NEXT: vzeroupper 7211; AVX512BW-NEXT: retq 7212 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 7213 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 7214 %tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <192 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31> 7215 %data = call <192 x i32> @llvm.masked.load.v192i32.p0(ptr %in.vec, i32 64, <192 x i1> %tgt.mask, <192 x i32> poison) 7216 store <192 x i32> %data, ptr %out.vec, align 64 7217 ret void 7218} 7219 7220define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { 7221; AVX512F-ONLY-LABEL: mask_replication_factor6_vf64: 7222; AVX512F-ONLY: # %bb.0: 7223; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 7224; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 7225; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] 7226; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm4, %zmm1 7227; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 7228; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 7229; AVX512F-ONLY-NEXT: movw $1, %ax 7230; AVX512F-ONLY-NEXT: kmovw %eax, %k1 7231; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} 
7232; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1 7233; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm7 {%k1} {z} = -1 7234; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k1 7235; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm8 {%k1} {z} = -1 7236; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 7237; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm9 {%k1} {z} = -1 7238; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 7239; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm10 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] 7240; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm10, %zmm1 7241; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm11 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] 7242; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm11, %zmm2 7243; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm12 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] 7244; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm12, %zmm3 7245; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm13 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] 7246; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm13, %zmm5 7247; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm14 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] 7248; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm14, %zmm6 7249; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm4, %zmm7 7250; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm10, %zmm15 7251; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm11, %zmm16 7252; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm12, %zmm17 7253; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm13, %zmm18 7254; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm14, %zmm19 7255; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm4, %zmm8 7256; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm10, %zmm20 7257; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm11, %zmm21 7258; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm12, %zmm22 7259; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm13, %zmm23 7260; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm4, %zmm24 7261; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm14, %zmm9 7262; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm10, %zmm10 7263; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm11 7264; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm12, %zmm12 7265; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm13, %zmm13 7266; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm14, %zmm4 7267; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} 7268; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 7269; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z} 7270; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k1 7271; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm13 {%k1} {z} 7272; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1 7273; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm12 {%k1} {z} 7274; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1 7275; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm11 {%k1} {z} 7276; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1 7277; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm10 {%k1} {z} 7278; AVX512F-ONLY-NEXT: vptestmd %zmm24, %zmm24, %k1 7279; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm14 {%k1} {z} 7280; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1 7281; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm9 {%k1} {z} 7282; AVX512F-ONLY-NEXT: vptestmd %zmm23, %zmm23, %k1 7283; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm23 {%k1} {z} 7284; AVX512F-ONLY-NEXT: vptestmd %zmm22, %zmm22, %k1 7285; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm22 {%k1} {z} 7286; AVX512F-ONLY-NEXT: vptestmd %zmm21, %zmm21, %k1 7287; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm21 {%k1} {z} 7288; AVX512F-ONLY-NEXT: vptestmd %zmm20, %zmm20, %k1 7289; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm20 {%k1} {z} 7290; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 7291; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm8 {%k1} {z} 7292; AVX512F-ONLY-NEXT: 
vptestmd %zmm19, %zmm19, %k1 7293; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm19 {%k1} {z} 7294; AVX512F-ONLY-NEXT: vptestmd %zmm18, %zmm18, %k1 7295; AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm18 {%k1} {z} 7296; AVX512F-ONLY-NEXT: vptestmd %zmm17, %zmm17, %k1 7297; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm17 {%k1} {z} 7298; AVX512F-ONLY-NEXT: vptestmd %zmm16, %zmm16, %k1 7299; AVX512F-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k1} {z} 7300; AVX512F-ONLY-NEXT: vptestmd %zmm15, %zmm15, %k1 7301; AVX512F-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm15 {%k1} {z} 7302; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 7303; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm7 {%k1} {z} 7304; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 7305; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm6 {%k1} {z} 7306; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 7307; AVX512F-ONLY-NEXT: vmovdqa32 1280(%rsi), %zmm5 {%k1} {z} 7308; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 7309; AVX512F-ONLY-NEXT: vmovdqa32 1344(%rsi), %zmm3 {%k1} {z} 7310; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 7311; AVX512F-ONLY-NEXT: vmovdqa32 1408(%rsi), %zmm2 {%k1} {z} 7312; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 7313; AVX512F-ONLY-NEXT: vmovdqa32 1472(%rsi), %zmm1 {%k1} {z} 7314; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 1472(%rdx) 7315; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 1408(%rdx) 7316; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 1344(%rdx) 7317; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 1280(%rdx) 7318; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 1216(%rdx) 7319; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 1152(%rdx) 7320; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, 1088(%rdx) 7321; AVX512F-ONLY-NEXT: vmovdqa64 %zmm16, 1024(%rdx) 7322; AVX512F-ONLY-NEXT: vmovdqa64 %zmm17, 960(%rdx) 7323; AVX512F-ONLY-NEXT: vmovdqa64 %zmm18, 896(%rdx) 7324; AVX512F-ONLY-NEXT: vmovdqa64 %zmm19, 832(%rdx) 7325; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 768(%rdx) 7326; AVX512F-ONLY-NEXT: vmovdqa64 %zmm20, 704(%rdx) 7327; AVX512F-ONLY-NEXT: vmovdqa64 %zmm21, 640(%rdx) 7328; AVX512F-ONLY-NEXT: vmovdqa64 %zmm22, 576(%rdx) 7329; AVX512F-ONLY-NEXT: vmovdqa64 %zmm23, 512(%rdx) 7330; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 448(%rdx) 7331; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 384(%rdx) 7332; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 320(%rdx) 7333; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 256(%rdx) 7334; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 192(%rdx) 7335; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 128(%rdx) 7336; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 64(%rdx) 7337; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) 7338; AVX512F-ONLY-NEXT: vzeroupper 7339; AVX512F-ONLY-NEXT: retq 7340; 7341; AVX512DQ-LABEL: mask_replication_factor6_vf64: 7342; AVX512DQ: # %bb.0: 7343; AVX512DQ-NEXT: kmovw (%rdi), %k0 7344; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 7345; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] 7346; AVX512DQ-NEXT: vpermd %zmm0, %zmm4, %zmm1 7347; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 7348; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 7349; AVX512DQ-NEXT: movw $1, %ax 7350; AVX512DQ-NEXT: kmovw %eax, %k1 7351; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} 7352; AVX512DQ-NEXT: kmovw 6(%rdi), %k0 7353; AVX512DQ-NEXT: vpmovm2d %k0, %zmm7 7354; AVX512DQ-NEXT: kmovw 4(%rdi), %k0 7355; AVX512DQ-NEXT: vpmovm2d %k0, %zmm8 7356; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 7357; AVX512DQ-NEXT: vpmovm2d %k0, %zmm9 7358; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 7359; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] 7360; AVX512DQ-NEXT: vpermd %zmm7, %zmm10, %zmm1 7361; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} 
zmm11 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] 7362; AVX512DQ-NEXT: vpermd %zmm7, %zmm11, %zmm2 7363; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] 7364; AVX512DQ-NEXT: vpermd %zmm7, %zmm12, %zmm3 7365; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm13 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] 7366; AVX512DQ-NEXT: vpermd %zmm7, %zmm13, %zmm5 7367; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm14 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] 7368; AVX512DQ-NEXT: vpermd %zmm7, %zmm14, %zmm6 7369; AVX512DQ-NEXT: vpermd %zmm7, %zmm4, %zmm7 7370; AVX512DQ-NEXT: vpermd %zmm8, %zmm10, %zmm15 7371; AVX512DQ-NEXT: vpermd %zmm8, %zmm11, %zmm16 7372; AVX512DQ-NEXT: vpermd %zmm8, %zmm12, %zmm17 7373; AVX512DQ-NEXT: vpermd %zmm8, %zmm13, %zmm18 7374; AVX512DQ-NEXT: vpermd %zmm8, %zmm14, %zmm19 7375; AVX512DQ-NEXT: vpermd %zmm8, %zmm4, %zmm8 7376; AVX512DQ-NEXT: vpermd %zmm9, %zmm10, %zmm20 7377; AVX512DQ-NEXT: vpermd %zmm9, %zmm11, %zmm21 7378; AVX512DQ-NEXT: vpermd %zmm9, %zmm12, %zmm22 7379; AVX512DQ-NEXT: vpermd %zmm9, %zmm13, %zmm23 7380; AVX512DQ-NEXT: vpermd %zmm9, %zmm4, %zmm24 7381; AVX512DQ-NEXT: vpermd %zmm9, %zmm14, %zmm9 7382; AVX512DQ-NEXT: vpermd %zmm0, %zmm10, %zmm10 7383; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm11 7384; AVX512DQ-NEXT: vpermd %zmm0, %zmm12, %zmm12 7385; AVX512DQ-NEXT: vpermd %zmm0, %zmm13, %zmm13 7386; AVX512DQ-NEXT: vpermd %zmm0, %zmm14, %zmm4 7387; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} 7388; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 7389; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z} 7390; AVX512DQ-NEXT: vpmovd2m %zmm13, %k1 7391; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm13 {%k1} {z} 7392; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1 7393; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm12 {%k1} {z} 7394; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1 7395; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm11 {%k1} {z} 7396; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1 7397; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm10 {%k1} {z} 7398; AVX512DQ-NEXT: vpmovd2m %zmm24, %k1 7399; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm14 {%k1} {z} 7400; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1 7401; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm9 {%k1} {z} 7402; AVX512DQ-NEXT: vpmovd2m %zmm23, %k1 7403; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm23 {%k1} {z} 7404; AVX512DQ-NEXT: vpmovd2m %zmm22, %k1 7405; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm22 {%k1} {z} 7406; AVX512DQ-NEXT: vpmovd2m %zmm21, %k1 7407; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm21 {%k1} {z} 7408; AVX512DQ-NEXT: vpmovd2m %zmm20, %k1 7409; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm20 {%k1} {z} 7410; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 7411; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm8 {%k1} {z} 7412; AVX512DQ-NEXT: vpmovd2m %zmm19, %k1 7413; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm19 {%k1} {z} 7414; AVX512DQ-NEXT: vpmovd2m %zmm18, %k1 7415; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm18 {%k1} {z} 7416; AVX512DQ-NEXT: vpmovd2m %zmm17, %k1 7417; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm17 {%k1} {z} 7418; AVX512DQ-NEXT: vpmovd2m %zmm16, %k1 7419; AVX512DQ-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k1} {z} 7420; AVX512DQ-NEXT: vpmovd2m %zmm15, %k1 7421; AVX512DQ-NEXT: vmovdqa32 1088(%rsi), %zmm15 {%k1} {z} 7422; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 7423; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm7 {%k1} {z} 7424; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 7425; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm6 {%k1} {z} 7426; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 7427; AVX512DQ-NEXT: vmovdqa32 1280(%rsi), %zmm5 {%k1} {z} 7428; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 7429; AVX512DQ-NEXT: vmovdqa32 1344(%rsi), %zmm3 
{%k1} {z} 7430; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 7431; AVX512DQ-NEXT: vmovdqa32 1408(%rsi), %zmm2 {%k1} {z} 7432; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 7433; AVX512DQ-NEXT: vmovdqa32 1472(%rsi), %zmm1 {%k1} {z} 7434; AVX512DQ-NEXT: vmovdqa64 %zmm1, 1472(%rdx) 7435; AVX512DQ-NEXT: vmovdqa64 %zmm2, 1408(%rdx) 7436; AVX512DQ-NEXT: vmovdqa64 %zmm3, 1344(%rdx) 7437; AVX512DQ-NEXT: vmovdqa64 %zmm5, 1280(%rdx) 7438; AVX512DQ-NEXT: vmovdqa64 %zmm6, 1216(%rdx) 7439; AVX512DQ-NEXT: vmovdqa64 %zmm7, 1152(%rdx) 7440; AVX512DQ-NEXT: vmovdqa64 %zmm15, 1088(%rdx) 7441; AVX512DQ-NEXT: vmovdqa64 %zmm16, 1024(%rdx) 7442; AVX512DQ-NEXT: vmovdqa64 %zmm17, 960(%rdx) 7443; AVX512DQ-NEXT: vmovdqa64 %zmm18, 896(%rdx) 7444; AVX512DQ-NEXT: vmovdqa64 %zmm19, 832(%rdx) 7445; AVX512DQ-NEXT: vmovdqa64 %zmm8, 768(%rdx) 7446; AVX512DQ-NEXT: vmovdqa64 %zmm20, 704(%rdx) 7447; AVX512DQ-NEXT: vmovdqa64 %zmm21, 640(%rdx) 7448; AVX512DQ-NEXT: vmovdqa64 %zmm22, 576(%rdx) 7449; AVX512DQ-NEXT: vmovdqa64 %zmm23, 512(%rdx) 7450; AVX512DQ-NEXT: vmovdqa64 %zmm9, 448(%rdx) 7451; AVX512DQ-NEXT: vmovdqa64 %zmm14, 384(%rdx) 7452; AVX512DQ-NEXT: vmovdqa64 %zmm10, 320(%rdx) 7453; AVX512DQ-NEXT: vmovdqa64 %zmm11, 256(%rdx) 7454; AVX512DQ-NEXT: vmovdqa64 %zmm12, 192(%rdx) 7455; AVX512DQ-NEXT: vmovdqa64 %zmm13, 128(%rdx) 7456; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%rdx) 7457; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) 7458; AVX512DQ-NEXT: vzeroupper 7459; AVX512DQ-NEXT: retq 7460; 7461; AVX512BW-LABEL: mask_replication_factor6_vf64: 7462; AVX512BW: # %bb.0: 7463; AVX512BW-NEXT: kmovq (%rdi), %k5 7464; AVX512BW-NEXT: movw $-3, %ax 7465; AVX512BW-NEXT: kmovd %eax, %k1 7466; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 7467; AVX512BW-NEXT: kmovw (%rdi), %k0 7468; AVX512BW-NEXT: kandw %k1, %k0, %k3 7469; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 7470; AVX512BW-NEXT: kshiftrw $14, %k1, %k0 7471; AVX512BW-NEXT: korw %k0, %k3, %k0 7472; AVX512BW-NEXT: movw $-5, %ax 7473; AVX512BW-NEXT: kmovd %eax, %k2 7474; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 7475; AVX512BW-NEXT: kandw %k2, %k0, %k0 7476; AVX512BW-NEXT: kshiftrw $13, %k1, %k3 7477; AVX512BW-NEXT: korw %k3, %k0, %k0 7478; AVX512BW-NEXT: movw $-9, %ax 7479; AVX512BW-NEXT: kmovd %eax, %k2 7480; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 7481; AVX512BW-NEXT: kandw %k2, %k0, %k0 7482; AVX512BW-NEXT: kshiftrw $12, %k1, %k3 7483; AVX512BW-NEXT: korw %k3, %k0, %k0 7484; AVX512BW-NEXT: movw $-17, %ax 7485; AVX512BW-NEXT: kmovd %eax, %k2 7486; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 7487; AVX512BW-NEXT: kandw %k2, %k0, %k0 7488; AVX512BW-NEXT: kshiftrw $11, %k1, %k3 7489; AVX512BW-NEXT: korw %k3, %k0, %k0 7490; AVX512BW-NEXT: movw $-33, %ax 7491; AVX512BW-NEXT: kmovd %eax, %k2 7492; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 7493; AVX512BW-NEXT: kandw %k2, %k0, %k0 7494; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 7495; AVX512BW-NEXT: korw %k1, %k0, %k0 7496; AVX512BW-NEXT: movw $-65, %ax 7497; AVX512BW-NEXT: kmovd %eax, %k1 7498; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 7499; AVX512BW-NEXT: kandw %k1, %k0, %k0 7500; AVX512BW-NEXT: kshiftrq $1, %k5, %k1 7501; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 7502; AVX512BW-NEXT: kshiftrw $9, %k1, %k3 7503; AVX512BW-NEXT: korw %k3, %k0, %k0 7504; AVX512BW-NEXT: movw $-129, %ax 7505; AVX512BW-NEXT: kmovd %eax, %k2 7506; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 7507; AVX512BW-NEXT: kandw %k2, %k0, %k0 7508; AVX512BW-NEXT: 
kshiftrw $8, %k1, %k3 7509; AVX512BW-NEXT: korw %k3, %k0, %k0 7510; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF 7511; AVX512BW-NEXT: kmovd %eax, %k2 7512; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 7513; AVX512BW-NEXT: kandw %k2, %k0, %k0 7514; AVX512BW-NEXT: kshiftrw $7, %k1, %k3 7515; AVX512BW-NEXT: korw %k3, %k0, %k0 7516; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF 7517; AVX512BW-NEXT: kmovd %eax, %k2 7518; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 7519; AVX512BW-NEXT: kandw %k2, %k0, %k0 7520; AVX512BW-NEXT: kshiftrw $6, %k1, %k3 7521; AVX512BW-NEXT: korw %k3, %k0, %k0 7522; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF 7523; AVX512BW-NEXT: kmovd %eax, %k2 7524; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 7525; AVX512BW-NEXT: kandw %k2, %k0, %k0 7526; AVX512BW-NEXT: kshiftrw $5, %k1, %k3 7527; AVX512BW-NEXT: korw %k3, %k0, %k0 7528; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF 7529; AVX512BW-NEXT: kmovd %eax, %k2 7530; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 7531; AVX512BW-NEXT: kandw %k2, %k0, %k0 7532; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 7533; AVX512BW-NEXT: korw %k1, %k0, %k0 7534; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF 7535; AVX512BW-NEXT: kmovd %eax, %k1 7536; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 7537; AVX512BW-NEXT: kandw %k1, %k0, %k3 7538; AVX512BW-NEXT: kshiftrq $2, %k5, %k1 7539; AVX512BW-NEXT: kshiftlw $15, %k1, %k0 7540; AVX512BW-NEXT: kshiftrw $3, %k0, %k4 7541; AVX512BW-NEXT: korw %k4, %k3, %k3 7542; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF 7543; AVX512BW-NEXT: kmovd %eax, %k2 7544; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 7545; AVX512BW-NEXT: kandw %k2, %k3, %k3 7546; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 7547; AVX512BW-NEXT: korw %k7, %k3, %k7 7548; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF 7549; AVX512BW-NEXT: kmovd %eax, %k2 7550; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 7551; AVX512BW-NEXT: kandw %k2, %k7, %k7 7552; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 7553; AVX512BW-NEXT: korw %k6, %k7, %k6 7554; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 7555; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 7556; AVX512BW-NEXT: korw %k0, %k6, %k6 7557; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k6} {z} 7558; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 7559; AVX512BW-NEXT: kandw %k4, %k1, %k1 7560; AVX512BW-NEXT: kshiftrw $14, %k0, %k0 7561; AVX512BW-NEXT: korw %k0, %k1, %k0 7562; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 7563; AVX512BW-NEXT: kandw %k1, %k0, %k0 7564; AVX512BW-NEXT: kmovq %k5, %k3 7565; AVX512BW-NEXT: kshiftrq $3, %k5, %k1 7566; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 7567; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 7568; AVX512BW-NEXT: korw %k6, %k0, %k0 7569; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 7570; AVX512BW-NEXT: kandw %k2, %k0, %k0 7571; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 7572; AVX512BW-NEXT: korw %k6, %k0, %k0 7573; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 7574; AVX512BW-NEXT: kandw %k6, %k0, %k0 7575; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 7576; AVX512BW-NEXT: korw %k6, %k0, %k0 7577; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 7578; AVX512BW-NEXT: kandw %k5, %k0, %k0 7579; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 7580; AVX512BW-NEXT: korw %k6, %k0, %k0 7581; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 7582; AVX512BW-NEXT: kandw %k6, %k0, 
%k0 7583; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 7584; AVX512BW-NEXT: korw %k6, %k0, %k0 7585; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 7586; AVX512BW-NEXT: kandw %k6, %k0, %k0 7587; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 7588; AVX512BW-NEXT: korw %k1, %k0, %k0 7589; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 7590; AVX512BW-NEXT: kandw %k1, %k0, %k0 7591; AVX512BW-NEXT: kshiftrq $4, %k3, %k1 7592; AVX512BW-NEXT: kmovq %k3, %k7 7593; AVX512BW-NEXT: kmovq %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 7594; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 7595; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 7596; AVX512BW-NEXT: korw %k6, %k0, %k0 7597; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 7598; AVX512BW-NEXT: kandw %k3, %k0, %k0 7599; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 7600; AVX512BW-NEXT: korw %k6, %k0, %k0 7601; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 7602; AVX512BW-NEXT: kandw %k6, %k0, %k0 7603; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 7604; AVX512BW-NEXT: korw %k6, %k0, %k0 7605; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 7606; AVX512BW-NEXT: kandw %k5, %k0, %k0 7607; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 7608; AVX512BW-NEXT: korw %k6, %k0, %k0 7609; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 7610; AVX512BW-NEXT: kandw %k5, %k0, %k0 7611; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 7612; AVX512BW-NEXT: korw %k6, %k0, %k0 7613; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 7614; AVX512BW-NEXT: kandw %k5, %k0, %k0 7615; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 7616; AVX512BW-NEXT: korw %k1, %k0, %k0 7617; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 7618; AVX512BW-NEXT: kandw %k5, %k0, %k0 7619; AVX512BW-NEXT: kshiftrq $5, %k7, %k1 7620; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 7621; AVX512BW-NEXT: korw %k6, %k0, %k0 7622; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 7623; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 7624; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 7625; AVX512BW-NEXT: korw %k6, %k0, %k7 7626; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k7} {z} 7627; AVX512BW-NEXT: kandw %k4, %k1, %k0 7628; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 7629; AVX512BW-NEXT: korw %k1, %k0, %k0 7630; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 7631; AVX512BW-NEXT: kandw %k4, %k0, %k0 7632; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 7633; AVX512BW-NEXT: korw %k1, %k0, %k0 7634; AVX512BW-NEXT: kandw %k2, %k0, %k0 7635; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 7636; AVX512BW-NEXT: korw %k1, %k0, %k0 7637; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 7638; AVX512BW-NEXT: kandw %k1, %k0, %k0 7639; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload 7640; AVX512BW-NEXT: kshiftrq $6, %k7, %k1 7641; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 7642; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 7643; AVX512BW-NEXT: korw %k6, %k0, %k0 7644; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 7645; AVX512BW-NEXT: kandw %k2, %k0, %k0 7646; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 7647; AVX512BW-NEXT: korw %k6, %k0, %k0 7648; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 7649; AVX512BW-NEXT: kandw %k6, %k0, %k0 7650; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 7651; AVX512BW-NEXT: korw %k6, %k0, %k0 7652; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 7653; AVX512BW-NEXT: kandw %k6, %k0, %k0 7654; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 7655; AVX512BW-NEXT: korw %k6, %k0, %k0 7656; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 7657; AVX512BW-NEXT: kandw %k6, %k0, %k0 7658; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 7659; AVX512BW-NEXT: korw %k6, %k0, %k0 7660; AVX512BW-NEXT: kandw %k3, %k0, %k0 7661; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 7662; AVX512BW-NEXT: korw %k1, %k0, %k0 7663; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 7664; AVX512BW-NEXT: kandw %k1, %k0, %k1 7665; AVX512BW-NEXT: kshiftrq $7, %k7, %k6 7666; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 7667; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 7668; AVX512BW-NEXT: korw %k7, %k1, %k1 7669; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 7670; AVX512BW-NEXT: kandw %k3, %k1, %k1 7671; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 7672; AVX512BW-NEXT: korw %k7, %k1, %k1 7673; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 7674; AVX512BW-NEXT: kandw %k3, %k1, %k1 7675; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 7676; AVX512BW-NEXT: korw %k7, %k1, %k1 7677; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 7678; AVX512BW-NEXT: kandw %k3, %k1, %k1 7679; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 7680; AVX512BW-NEXT: korw %k7, %k1, %k1 7681; AVX512BW-NEXT: kandw %k5, %k1, %k1 7682; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 7683; AVX512BW-NEXT: korw %k6, %k1, %k1 7684; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 7685; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 7686; AVX512BW-NEXT: korw %k0, %k1, %k1 7687; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} 7688; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload 7689; AVX512BW-NEXT: kshiftrq $8, %k7, %k0 7690; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 7691; AVX512BW-NEXT: kandw %k1, %k0, %k1 7692; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 7693; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 7694; AVX512BW-NEXT: korw %k6, %k1, %k1 7695; AVX512BW-NEXT: kandw %k4, %k1, %k1 7696; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 7697; AVX512BW-NEXT: korw %k6, %k1, %k1 7698; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 7699; AVX512BW-NEXT: kandw %k3, %k1, %k1 7700; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 7701; AVX512BW-NEXT: korw %k6, %k1, %k1 7702; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 7703; AVX512BW-NEXT: kandw %k3, %k1, %k1 7704; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 7705; AVX512BW-NEXT: korw %k6, %k1, %k1 7706; AVX512BW-NEXT: kandw %k2, %k1, %k1 7707; AVX512BW-NEXT: kshiftrw $10, %k0, %k0 7708; AVX512BW-NEXT: korw %k0, %k1, %k0 7709; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 7710; AVX512BW-NEXT: kandw %k5, %k0, %k0 7711; AVX512BW-NEXT: kshiftrq $9, %k7, %k1 7712; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 7713; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 7714; AVX512BW-NEXT: korw %k6, %k0, %k0 7715; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 7716; AVX512BW-NEXT: kandw %k2, %k0, %k0 7717; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 7718; AVX512BW-NEXT: korw %k6, %k0, %k0 7719; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 7720; AVX512BW-NEXT: kandw %k2, %k0, %k0 7721; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 7722; AVX512BW-NEXT: korw %k6, %k0, %k0 7723; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 7724; AVX512BW-NEXT: kandw %k2, %k0, %k0 7725; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 7726; AVX512BW-NEXT: korw %k6, %k0, %k0 7727; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 7728; AVX512BW-NEXT: kandw %k4, %k0, %k0 7729; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 7730; AVX512BW-NEXT: korw %k6, %k0, %k0 
7731; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 7732; AVX512BW-NEXT: kandw %k2, %k0, %k0 7733; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 7734; AVX512BW-NEXT: korw %k1, %k0, %k0 7735; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 7736; AVX512BW-NEXT: kandw %k1, %k0, %k0 7737; AVX512BW-NEXT: kshiftrq $10, %k7, %k1 7738; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 7739; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 7740; AVX512BW-NEXT: korw %k7, %k0, %k0 7741; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 7742; AVX512BW-NEXT: kandw %k2, %k0, %k0 7743; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 7744; AVX512BW-NEXT: korw %k7, %k0, %k0 7745; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 7746; AVX512BW-NEXT: kandw %k7, %k0, %k0 7747; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 7748; AVX512BW-NEXT: korw %k7, %k0, %k0 7749; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 7750; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 7751; AVX512BW-NEXT: korw %k6, %k0, %k7 7752; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k7} {z} 7753; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 7754; AVX512BW-NEXT: kandw %k0, %k1, %k0 7755; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 7756; AVX512BW-NEXT: korw %k1, %k0, %k0 7757; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 7758; AVX512BW-NEXT: kandw %k1, %k0, %k0 7759; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload 7760; AVX512BW-NEXT: kshiftrq $11, %k7, %k1 7761; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 7762; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 7763; AVX512BW-NEXT: korw %k6, %k0, %k0 7764; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 7765; AVX512BW-NEXT: kandw %k6, %k0, %k0 7766; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 7767; AVX512BW-NEXT: korw %k6, %k0, %k0 7768; AVX512BW-NEXT: kandw %k3, %k0, %k0 7769; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 7770; AVX512BW-NEXT: korw %k6, %k0, %k0 7771; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 7772; AVX512BW-NEXT: kandw %k3, %k0, %k0 7773; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 7774; AVX512BW-NEXT: korw %k6, %k0, %k0 7775; AVX512BW-NEXT: kandw %k5, %k0, %k0 7776; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 7777; AVX512BW-NEXT: korw %k6, %k0, %k0 7778; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 7779; AVX512BW-NEXT: kandw %k5, %k0, %k0 7780; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 7781; AVX512BW-NEXT: korw %k1, %k0, %k0 7782; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 7783; AVX512BW-NEXT: kandw %k3, %k0, %k0 7784; AVX512BW-NEXT: kshiftrq $12, %k7, %k1 7785; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 7786; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 7787; AVX512BW-NEXT: korw %k6, %k0, %k0 7788; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 7789; AVX512BW-NEXT: kandw %k6, %k0, %k0 7790; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 7791; AVX512BW-NEXT: korw %k6, %k0, %k0 7792; AVX512BW-NEXT: kandw %k4, %k0, %k0 7793; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 7794; AVX512BW-NEXT: korw %k6, %k0, %k0 7795; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 7796; AVX512BW-NEXT: kandw %k4, %k0, %k0 7797; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 7798; AVX512BW-NEXT: korw %k6, %k0, %k0 7799; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 7800; AVX512BW-NEXT: kandw %k4, %k0, %k0 7801; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 7802; AVX512BW-NEXT: korw %k6, %k0, %k0 7803; AVX512BW-NEXT: kandw %k2, %k0, %k0 7804; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 7805; 
AVX512BW-NEXT: korw %k1, %k0, %k0 7806; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 7807; AVX512BW-NEXT: kandw %k1, %k0, %k0 7808; AVX512BW-NEXT: kshiftrq $13, %k7, %k1 7809; AVX512BW-NEXT: kmovq %k7, %k2 7810; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 7811; AVX512BW-NEXT: korw %k6, %k0, %k0 7812; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 7813; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 7814; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 7815; AVX512BW-NEXT: korw %k6, %k0, %k7 7816; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k7} {z} 7817; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 7818; AVX512BW-NEXT: kandw %k0, %k1, %k0 7819; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 7820; AVX512BW-NEXT: korw %k1, %k0, %k0 7821; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 7822; AVX512BW-NEXT: kandw %k1, %k0, %k0 7823; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 7824; AVX512BW-NEXT: korw %k1, %k0, %k0 7825; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 7826; AVX512BW-NEXT: kandw %k1, %k0, %k0 7827; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 7828; AVX512BW-NEXT: korw %k1, %k0, %k0 7829; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 7830; AVX512BW-NEXT: kandw %k1, %k0, %k0 7831; AVX512BW-NEXT: kmovq %k2, %k7 7832; AVX512BW-NEXT: kshiftrq $14, %k2, %k1 7833; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 7834; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 7835; AVX512BW-NEXT: korw %k6, %k0, %k0 7836; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 7837; AVX512BW-NEXT: kandw %k2, %k0, %k0 7838; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 7839; AVX512BW-NEXT: korw %k6, %k0, %k0 7840; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 7841; AVX512BW-NEXT: kandw %k2, %k0, %k0 7842; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 7843; AVX512BW-NEXT: korw %k6, %k0, %k0 7844; AVX512BW-NEXT: kandw %k5, %k0, %k0 7845; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 7846; AVX512BW-NEXT: korw %k6, %k0, %k0 7847; AVX512BW-NEXT: kandw %k3, %k0, %k0 7848; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 7849; AVX512BW-NEXT: korw %k6, %k0, %k0 7850; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 7851; AVX512BW-NEXT: kandw %k2, %k0, %k0 7852; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 7853; AVX512BW-NEXT: korw %k1, %k0, %k0 7854; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 7855; AVX512BW-NEXT: kandw %k2, %k0, %k1 7856; AVX512BW-NEXT: kshiftrq $15, %k7, %k6 7857; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 7858; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 7859; AVX512BW-NEXT: korw %k7, %k1, %k1 7860; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 7861; AVX512BW-NEXT: kandw %k3, %k1, %k1 7862; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 7863; AVX512BW-NEXT: korw %k7, %k1, %k1 7864; AVX512BW-NEXT: kandw %k4, %k1, %k1 7865; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 7866; AVX512BW-NEXT: korw %k7, %k1, %k1 7867; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 7868; AVX512BW-NEXT: kandw %k5, %k1, %k1 7869; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 7870; AVX512BW-NEXT: korw %k7, %k1, %k1 7871; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 7872; AVX512BW-NEXT: kandw %k5, %k1, %k1 7873; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 7874; AVX512BW-NEXT: korw %k6, %k1, %k1 7875; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 7876; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 7877; AVX512BW-NEXT: korw %k0, %k1, %k1 7878; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z} 7879; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload 
7880; AVX512BW-NEXT: kshiftrq $16, %k7, %k0 7881; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 7882; AVX512BW-NEXT: kandw %k1, %k0, %k1 7883; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 7884; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 7885; AVX512BW-NEXT: korw %k6, %k1, %k1 7886; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 7887; AVX512BW-NEXT: kandw %k6, %k1, %k1 7888; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 7889; AVX512BW-NEXT: korw %k6, %k1, %k1 7890; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 7891; AVX512BW-NEXT: kandw %k6, %k1, %k1 7892; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 7893; AVX512BW-NEXT: korw %k6, %k1, %k1 7894; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 7895; AVX512BW-NEXT: kandw %k6, %k1, %k1 7896; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 7897; AVX512BW-NEXT: korw %k6, %k1, %k1 7898; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 7899; AVX512BW-NEXT: kandw %k6, %k1, %k1 7900; AVX512BW-NEXT: kshiftrw $10, %k0, %k0 7901; AVX512BW-NEXT: korw %k0, %k1, %k0 7902; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 7903; AVX512BW-NEXT: kandw %k1, %k0, %k0 7904; AVX512BW-NEXT: kshiftrq $17, %k7, %k1 7905; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 7906; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 7907; AVX512BW-NEXT: korw %k6, %k0, %k0 7908; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 7909; AVX512BW-NEXT: kandw %k5, %k0, %k0 7910; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 7911; AVX512BW-NEXT: korw %k6, %k0, %k0 7912; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 7913; AVX512BW-NEXT: kandw %k5, %k0, %k0 7914; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 7915; AVX512BW-NEXT: korw %k6, %k0, %k0 7916; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 7917; AVX512BW-NEXT: kandw %k6, %k0, %k0 7918; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 7919; AVX512BW-NEXT: korw %k6, %k0, %k0 7920; AVX512BW-NEXT: kandw %k2, %k0, %k0 7921; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 7922; AVX512BW-NEXT: korw %k6, %k0, %k0 7923; AVX512BW-NEXT: kandw %k3, %k0, %k0 7924; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 7925; AVX512BW-NEXT: korw %k1, %k0, %k0 7926; AVX512BW-NEXT: kandw %k4, %k0, %k0 7927; AVX512BW-NEXT: kmovq %k7, %k4 7928; AVX512BW-NEXT: kshiftrq $18, %k7, %k1 7929; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 7930; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 7931; AVX512BW-NEXT: korw %k7, %k0, %k0 7932; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 7933; AVX512BW-NEXT: kandw %k3, %k0, %k0 7934; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 7935; AVX512BW-NEXT: korw %k7, %k0, %k0 7936; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 7937; AVX512BW-NEXT: kandw %k2, %k0, %k0 7938; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 7939; AVX512BW-NEXT: korw %k7, %k0, %k0 7940; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 7941; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 7942; AVX512BW-NEXT: korw %k6, %k0, %k7 7943; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k7} {z} 7944; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 7945; AVX512BW-NEXT: kandw %k0, %k1, %k0 7946; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 7947; AVX512BW-NEXT: korw %k1, %k0, %k0 7948; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 7949; AVX512BW-NEXT: kandw %k1, %k0, %k0 7950; AVX512BW-NEXT: kshiftrq $19, %k4, %k1 7951; AVX512BW-NEXT: kmovq %k4, %k7 7952; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 7953; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 7954; AVX512BW-NEXT: korw %k6, %k0, %k0 7955; 
AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 7956; AVX512BW-NEXT: kandw %k4, %k0, %k0 7957; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 7958; AVX512BW-NEXT: korw %k6, %k0, %k0 7959; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 7960; AVX512BW-NEXT: kandw %k5, %k0, %k0 7961; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 7962; AVX512BW-NEXT: korw %k6, %k0, %k0 7963; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 7964; AVX512BW-NEXT: kandw %k6, %k0, %k0 7965; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 7966; AVX512BW-NEXT: korw %k6, %k0, %k0 7967; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 7968; AVX512BW-NEXT: kandw %k6, %k0, %k0 7969; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 7970; AVX512BW-NEXT: korw %k6, %k0, %k0 7971; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 7972; AVX512BW-NEXT: kandw %k6, %k0, %k0 7973; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 7974; AVX512BW-NEXT: korw %k1, %k0, %k0 7975; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 7976; AVX512BW-NEXT: kandw %k1, %k0, %k0 7977; AVX512BW-NEXT: kshiftrq $20, %k7, %k1 7978; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 7979; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 7980; AVX512BW-NEXT: korw %k6, %k0, %k0 7981; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 7982; AVX512BW-NEXT: kandw %k6, %k0, %k0 7983; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 7984; AVX512BW-NEXT: korw %k6, %k0, %k0 7985; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 7986; AVX512BW-NEXT: kandw %k6, %k0, %k0 7987; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 7988; AVX512BW-NEXT: korw %k6, %k0, %k0 7989; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 7990; AVX512BW-NEXT: kandw %k6, %k0, %k0 7991; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 7992; AVX512BW-NEXT: korw %k6, %k0, %k0 7993; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 7994; AVX512BW-NEXT: kandw %k6, %k0, %k0 7995; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 7996; AVX512BW-NEXT: korw %k6, %k0, %k0 7997; AVX512BW-NEXT: kandw %k3, %k0, %k0 7998; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 7999; AVX512BW-NEXT: korw %k1, %k0, %k0 8000; AVX512BW-NEXT: kandw %k2, %k0, %k0 8001; AVX512BW-NEXT: kshiftrq $21, %k7, %k1 8002; AVX512BW-NEXT: kmovq %k7, %k3 8003; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 8004; AVX512BW-NEXT: korw %k6, %k0, %k0 8005; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 8006; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 8007; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 8008; AVX512BW-NEXT: korw %k6, %k0, %k7 8009; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k7} {z} 8010; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 8011; AVX512BW-NEXT: kandw %k2, %k1, %k0 8012; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 8013; AVX512BW-NEXT: korw %k1, %k0, %k0 8014; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 8015; AVX512BW-NEXT: kandw %k1, %k0, %k0 8016; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 8017; AVX512BW-NEXT: korw %k1, %k0, %k0 8018; AVX512BW-NEXT: kandw %k4, %k0, %k0 8019; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 8020; AVX512BW-NEXT: korw %k1, %k0, %k0 8021; AVX512BW-NEXT: kandw %k5, %k0, %k0 8022; AVX512BW-NEXT: kmovq %k3, %k5 8023; AVX512BW-NEXT: kshiftrq $22, %k3, %k1 8024; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 8025; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 8026; AVX512BW-NEXT: korw %k6, %k0, %k0 8027; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 8028; AVX512BW-NEXT: kandw %k3, %k0, %k0 8029; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 8030; 
AVX512BW-NEXT: korw %k6, %k0, %k0 8031; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 8032; AVX512BW-NEXT: kandw %k4, %k0, %k0 8033; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 8034; AVX512BW-NEXT: korw %k6, %k0, %k0 8035; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 8036; AVX512BW-NEXT: kandw %k4, %k0, %k0 8037; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 8038; AVX512BW-NEXT: korw %k6, %k0, %k0 8039; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 8040; AVX512BW-NEXT: kandw %k4, %k0, %k0 8041; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 8042; AVX512BW-NEXT: korw %k6, %k0, %k0 8043; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 8044; AVX512BW-NEXT: kandw %k4, %k0, %k0 8045; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 8046; AVX512BW-NEXT: korw %k1, %k0, %k0 8047; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 8048; AVX512BW-NEXT: kandw %k1, %k0, %k1 8049; AVX512BW-NEXT: kshiftrq $23, %k5, %k6 8050; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 8051; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 8052; AVX512BW-NEXT: korw %k7, %k1, %k1 8053; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 8054; AVX512BW-NEXT: kandw %k5, %k1, %k1 8055; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 8056; AVX512BW-NEXT: korw %k7, %k1, %k1 8057; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 8058; AVX512BW-NEXT: kandw %k5, %k1, %k1 8059; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 8060; AVX512BW-NEXT: korw %k7, %k1, %k1 8061; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 8062; AVX512BW-NEXT: kandw %k5, %k1, %k1 8063; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 8064; AVX512BW-NEXT: korw %k7, %k1, %k1 8065; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 8066; AVX512BW-NEXT: kandw %k5, %k1, %k1 8067; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 8068; AVX512BW-NEXT: korw %k6, %k1, %k1 8069; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 8070; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 8071; AVX512BW-NEXT: korw %k0, %k1, %k1 8072; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k1} {z} 8073; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload 8074; AVX512BW-NEXT: kshiftrq $24, %k5, %k0 8075; AVX512BW-NEXT: kandw %k2, %k0, %k1 8076; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 8077; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 8078; AVX512BW-NEXT: korw %k6, %k1, %k1 8079; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 8080; AVX512BW-NEXT: kandw %k2, %k1, %k1 8081; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 8082; AVX512BW-NEXT: korw %k6, %k1, %k1 8083; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 8084; AVX512BW-NEXT: kandw %k6, %k1, %k1 8085; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 8086; AVX512BW-NEXT: korw %k6, %k1, %k1 8087; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 8088; AVX512BW-NEXT: kandw %k6, %k1, %k1 8089; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 8090; AVX512BW-NEXT: korw %k6, %k1, %k1 8091; AVX512BW-NEXT: kandw %k3, %k1, %k1 8092; AVX512BW-NEXT: kshiftrw $10, %k0, %k0 8093; AVX512BW-NEXT: korw %k0, %k1, %k0 8094; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 8095; AVX512BW-NEXT: kandw %k1, %k0, %k0 8096; AVX512BW-NEXT: kshiftrq $25, %k5, %k1 8097; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 8098; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 8099; AVX512BW-NEXT: korw %k6, %k0, %k0 8100; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 8101; AVX512BW-NEXT: kandw %k3, %k0, %k0 8102; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 8103; AVX512BW-NEXT: korw 
%k6, %k0, %k0 8104; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 8105; AVX512BW-NEXT: kandw %k3, %k0, %k0 8106; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 8107; AVX512BW-NEXT: korw %k6, %k0, %k0 8108; AVX512BW-NEXT: kandw %k4, %k0, %k0 8109; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 8110; AVX512BW-NEXT: korw %k6, %k0, %k0 8111; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 8112; AVX512BW-NEXT: kandw %k3, %k0, %k0 8113; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 8114; AVX512BW-NEXT: korw %k6, %k0, %k0 8115; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 8116; AVX512BW-NEXT: kandw %k3, %k0, %k0 8117; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 8118; AVX512BW-NEXT: korw %k1, %k0, %k0 8119; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 8120; AVX512BW-NEXT: kandw %k1, %k0, %k0 8121; AVX512BW-NEXT: kshiftrq $26, %k5, %k1 8122; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 8123; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 8124; AVX512BW-NEXT: korw %k7, %k0, %k0 8125; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 8126; AVX512BW-NEXT: kandw %k3, %k0, %k0 8127; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 8128; AVX512BW-NEXT: korw %k7, %k0, %k0 8129; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 8130; AVX512BW-NEXT: kandw %k3, %k0, %k0 8131; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 8132; AVX512BW-NEXT: korw %k7, %k0, %k0 8133; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 8134; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 8135; AVX512BW-NEXT: korw %k6, %k0, %k7 8136; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k7} {z} 8137; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 8138; AVX512BW-NEXT: kandw %k0, %k1, %k0 8139; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 8140; AVX512BW-NEXT: korw %k1, %k0, %k0 8141; AVX512BW-NEXT: kandw %k2, %k0, %k0 8142; AVX512BW-NEXT: kmovq %k5, %k7 8143; AVX512BW-NEXT: kshiftrq $27, %k5, %k1 8144; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 8145; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 8146; AVX512BW-NEXT: korw %k6, %k0, %k0 8147; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 8148; AVX512BW-NEXT: kandw %k2, %k0, %k0 8149; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 8150; AVX512BW-NEXT: korw %k6, %k0, %k0 8151; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 8152; AVX512BW-NEXT: kandw %k3, %k0, %k0 8153; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 8154; AVX512BW-NEXT: korw %k6, %k0, %k0 8155; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 8156; AVX512BW-NEXT: kandw %k2, %k0, %k0 8157; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 8158; AVX512BW-NEXT: korw %k6, %k0, %k0 8159; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 8160; AVX512BW-NEXT: kandw %k5, %k0, %k0 8161; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 8162; AVX512BW-NEXT: korw %k6, %k0, %k0 8163; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 8164; AVX512BW-NEXT: kandw %k4, %k0, %k0 8165; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 8166; AVX512BW-NEXT: korw %k1, %k0, %k0 8167; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 8168; AVX512BW-NEXT: kandw %k1, %k0, %k0 8169; AVX512BW-NEXT: kshiftrq $28, %k7, %k1 8170; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 8171; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 8172; AVX512BW-NEXT: korw %k6, %k0, %k0 8173; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 8174; AVX512BW-NEXT: kandw %k2, %k0, %k0 8175; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 8176; AVX512BW-NEXT: korw %k6, %k0, %k0 8177; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 8178; AVX512BW-NEXT: kandw %k2, %k0, %k0 8179; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 8180; AVX512BW-NEXT: korw %k6, %k0, %k0 8181; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 8182; AVX512BW-NEXT: kandw %k6, %k0, %k0 8183; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 8184; AVX512BW-NEXT: korw %k6, %k0, %k0 8185; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 8186; AVX512BW-NEXT: kandw %k6, %k0, %k0 8187; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 8188; AVX512BW-NEXT: korw %k6, %k0, %k0 8189; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 8190; AVX512BW-NEXT: kandw %k6, %k0, %k0 8191; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 8192; AVX512BW-NEXT: korw %k1, %k0, %k0 8193; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 8194; AVX512BW-NEXT: kandw %k1, %k0, %k0 8195; AVX512BW-NEXT: kshiftrq $29, %k7, %k1 8196; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 8197; AVX512BW-NEXT: korw %k6, %k0, %k0 8198; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 8199; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 8200; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 8201; AVX512BW-NEXT: korw %k6, %k0, %k7 8202; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k7} {z} 8203; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 8204; AVX512BW-NEXT: kandw %k0, %k1, %k0 8205; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 8206; AVX512BW-NEXT: korw %k1, %k0, %k0 8207; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 8208; AVX512BW-NEXT: kandw %k1, %k0, %k0 8209; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 8210; AVX512BW-NEXT: korw %k1, %k0, %k0 8211; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 8212; AVX512BW-NEXT: kandw %k1, %k0, %k0 8213; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 8214; AVX512BW-NEXT: korw %k1, %k0, %k0 8215; AVX512BW-NEXT: kandw %k3, %k0, %k0 8216; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload 8217; AVX512BW-NEXT: kshiftrq $30, %k7, %k1 8218; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 8219; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 8220; AVX512BW-NEXT: korw %k6, %k0, %k0 8221; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 8222; AVX512BW-NEXT: kandw %k3, %k0, %k0 8223; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 8224; AVX512BW-NEXT: korw %k6, %k0, %k0 8225; AVX512BW-NEXT: kandw %k5, %k0, %k0 8226; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 8227; AVX512BW-NEXT: korw %k6, %k0, %k0 8228; AVX512BW-NEXT: kandw %k4, %k0, %k0 8229; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 8230; AVX512BW-NEXT: korw %k6, %k0, %k0 8231; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 8232; AVX512BW-NEXT: kandw %k5, %k0, %k0 8233; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 8234; AVX512BW-NEXT: korw %k6, %k0, %k0 8235; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 8236; AVX512BW-NEXT: kandw %k3, %k0, %k0 8237; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 8238; AVX512BW-NEXT: korw %k1, %k0, %k0 8239; AVX512BW-NEXT: kandw %k2, %k0, %k1 8240; AVX512BW-NEXT: kshiftrq $31, %k7, %k6 8241; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 8242; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 8243; AVX512BW-NEXT: korw %k7, %k1, %k1 8244; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 8245; AVX512BW-NEXT: kandw %k3, %k1, %k1 8246; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 8247; AVX512BW-NEXT: korw %k7, %k1, %k1 8248; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 8249; AVX512BW-NEXT: kandw %k4, %k1, %k1 8250; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 8251; AVX512BW-NEXT: korw %k7, %k1, 
%k1 8252; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 8253; AVX512BW-NEXT: kandw %k2, %k1, %k1 8254; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 8255; AVX512BW-NEXT: korw %k7, %k1, %k1 8256; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 8257; AVX512BW-NEXT: kandw %k2, %k1, %k1 8258; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 8259; AVX512BW-NEXT: korw %k6, %k1, %k1 8260; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 8261; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 8262; AVX512BW-NEXT: korw %k0, %k1, %k1 8263; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k1} {z} 8264; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload 8265; AVX512BW-NEXT: kshiftrq $32, %k7, %k0 8266; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 8267; AVX512BW-NEXT: kandw %k1, %k0, %k1 8268; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 8269; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 8270; AVX512BW-NEXT: korw %k6, %k1, %k1 8271; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 8272; AVX512BW-NEXT: kandw %k6, %k1, %k1 8273; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 8274; AVX512BW-NEXT: korw %k6, %k1, %k1 8275; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 8276; AVX512BW-NEXT: kandw %k6, %k1, %k1 8277; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 8278; AVX512BW-NEXT: korw %k6, %k1, %k1 8279; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 8280; AVX512BW-NEXT: kandw %k6, %k1, %k1 8281; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 8282; AVX512BW-NEXT: korw %k6, %k1, %k1 8283; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 8284; AVX512BW-NEXT: kandw %k6, %k1, %k1 8285; AVX512BW-NEXT: kshiftrw $10, %k0, %k0 8286; AVX512BW-NEXT: korw %k0, %k1, %k0 8287; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 8288; AVX512BW-NEXT: kandw %k1, %k0, %k0 8289; AVX512BW-NEXT: kshiftrq $33, %k7, %k1 8290; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 8291; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 8292; AVX512BW-NEXT: korw %k6, %k0, %k0 8293; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 8294; AVX512BW-NEXT: kandw %k2, %k0, %k0 8295; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 8296; AVX512BW-NEXT: korw %k6, %k0, %k0 8297; AVX512BW-NEXT: kandw %k5, %k0, %k0 8298; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 8299; AVX512BW-NEXT: korw %k6, %k0, %k0 8300; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 8301; AVX512BW-NEXT: kandw %k2, %k0, %k0 8302; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 8303; AVX512BW-NEXT: korw %k6, %k0, %k0 8304; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 8305; AVX512BW-NEXT: kandw %k5, %k0, %k0 8306; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 8307; AVX512BW-NEXT: korw %k6, %k0, %k0 8308; AVX512BW-NEXT: kandw %k3, %k0, %k0 8309; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 8310; AVX512BW-NEXT: korw %k1, %k0, %k0 8311; AVX512BW-NEXT: kandw %k4, %k0, %k0 8312; AVX512BW-NEXT: kmovq %k7, %k5 8313; AVX512BW-NEXT: kshiftrq $34, %k7, %k1 8314; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 8315; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 8316; AVX512BW-NEXT: korw %k7, %k0, %k0 8317; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 8318; AVX512BW-NEXT: kandw %k3, %k0, %k0 8319; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 8320; AVX512BW-NEXT: korw %k7, %k0, %k0 8321; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 8322; AVX512BW-NEXT: kandw %k4, %k0, %k0 8323; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 8324; AVX512BW-NEXT: korw %k7, %k0, %k0 8325; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 8326; 
AVX512BW-NEXT: kshiftrw $1, %k0, %k0 8327; AVX512BW-NEXT: korw %k6, %k0, %k7 8328; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k7} {z} 8329; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 8330; AVX512BW-NEXT: kandw %k0, %k1, %k0 8331; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 8332; AVX512BW-NEXT: korw %k1, %k0, %k0 8333; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 8334; AVX512BW-NEXT: kandw %k1, %k0, %k0 8335; AVX512BW-NEXT: kshiftrq $35, %k5, %k1 8336; AVX512BW-NEXT: kmovq %k5, %k7 8337; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 8338; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 8339; AVX512BW-NEXT: korw %k6, %k0, %k0 8340; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 8341; AVX512BW-NEXT: kandw %k5, %k0, %k0 8342; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 8343; AVX512BW-NEXT: korw %k6, %k0, %k0 8344; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 8345; AVX512BW-NEXT: kandw %k5, %k0, %k0 8346; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 8347; AVX512BW-NEXT: korw %k6, %k0, %k0 8348; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 8349; AVX512BW-NEXT: kandw %k5, %k0, %k0 8350; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 8351; AVX512BW-NEXT: korw %k6, %k0, %k0 8352; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 8353; AVX512BW-NEXT: kandw %k5, %k0, %k0 8354; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 8355; AVX512BW-NEXT: korw %k6, %k0, %k0 8356; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 8357; AVX512BW-NEXT: kandw %k5, %k0, %k0 8358; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 8359; AVX512BW-NEXT: korw %k1, %k0, %k0 8360; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 8361; AVX512BW-NEXT: kandw %k1, %k0, %k0 8362; AVX512BW-NEXT: kshiftrq $36, %k7, %k1 8363; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 8364; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 8365; AVX512BW-NEXT: korw %k6, %k0, %k0 8366; AVX512BW-NEXT: kandw %k2, %k0, %k0 8367; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 8368; AVX512BW-NEXT: korw %k6, %k0, %k0 8369; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 8370; AVX512BW-NEXT: kandw %k2, %k0, %k0 8371; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 8372; AVX512BW-NEXT: korw %k6, %k0, %k0 8373; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 8374; AVX512BW-NEXT: kandw %k6, %k0, %k0 8375; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 8376; AVX512BW-NEXT: korw %k6, %k0, %k0 8377; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 8378; AVX512BW-NEXT: kandw %k6, %k0, %k0 8379; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 8380; AVX512BW-NEXT: korw %k6, %k0, %k0 8381; AVX512BW-NEXT: kandw %k3, %k0, %k0 8382; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 8383; AVX512BW-NEXT: korw %k1, %k0, %k0 8384; AVX512BW-NEXT: kandw %k4, %k0, %k0 8385; AVX512BW-NEXT: kshiftrq $37, %k7, %k1 8386; AVX512BW-NEXT: kmovq %k7, %k3 8387; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 8388; AVX512BW-NEXT: korw %k6, %k0, %k0 8389; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 8390; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 8391; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 8392; AVX512BW-NEXT: korw %k6, %k0, %k7 8393; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k7} {z} 8394; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 8395; AVX512BW-NEXT: kandw %k0, %k1, %k0 8396; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 8397; AVX512BW-NEXT: korw %k1, %k0, %k0 8398; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 8399; AVX512BW-NEXT: kandw %k1, %k0, %k0 8400; AVX512BW-NEXT: kshiftrw $13, %k6, 
%k1 8401; AVX512BW-NEXT: korw %k1, %k0, %k0 8402; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 8403; AVX512BW-NEXT: kandw %k4, %k0, %k0 8404; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 8405; AVX512BW-NEXT: korw %k1, %k0, %k0 8406; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 8407; AVX512BW-NEXT: kandw %k1, %k0, %k0 8408; AVX512BW-NEXT: kmovq %k3, %k7 8409; AVX512BW-NEXT: kshiftrq $38, %k3, %k1 8410; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 8411; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 8412; AVX512BW-NEXT: korw %k6, %k0, %k0 8413; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 8414; AVX512BW-NEXT: kandw %k3, %k0, %k0 8415; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 8416; AVX512BW-NEXT: korw %k6, %k0, %k0 8417; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 8418; AVX512BW-NEXT: kandw %k6, %k0, %k0 8419; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 8420; AVX512BW-NEXT: korw %k6, %k0, %k0 8421; AVX512BW-NEXT: kandw %k5, %k0, %k0 8422; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 8423; AVX512BW-NEXT: korw %k6, %k0, %k0 8424; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 8425; AVX512BW-NEXT: kandw %k5, %k0, %k0 8426; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 8427; AVX512BW-NEXT: korw %k6, %k0, %k0 8428; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 8429; AVX512BW-NEXT: kandw %k5, %k0, %k0 8430; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 8431; AVX512BW-NEXT: korw %k1, %k0, %k0 8432; AVX512BW-NEXT: kandw %k2, %k0, %k1 8433; AVX512BW-NEXT: kshiftrq $39, %k7, %k6 8434; AVX512BW-NEXT: kmovq %k7, %k5 8435; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 8436; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 8437; AVX512BW-NEXT: korw %k7, %k1, %k1 8438; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 8439; AVX512BW-NEXT: kandw %k2, %k1, %k1 8440; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 8441; AVX512BW-NEXT: korw %k7, %k1, %k1 8442; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 8443; AVX512BW-NEXT: kandw %k2, %k1, %k1 8444; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 8445; AVX512BW-NEXT: korw %k7, %k1, %k1 8446; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 8447; AVX512BW-NEXT: kandw %k2, %k1, %k1 8448; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 8449; AVX512BW-NEXT: korw %k7, %k1, %k1 8450; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 8451; AVX512BW-NEXT: kandw %k2, %k1, %k1 8452; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 8453; AVX512BW-NEXT: korw %k6, %k1, %k1 8454; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 8455; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 8456; AVX512BW-NEXT: korw %k0, %k1, %k1 8457; AVX512BW-NEXT: vmovdqa32 896(%rsi), %zmm14 {%k1} {z} 8458; AVX512BW-NEXT: kshiftrq $40, %k5, %k0 8459; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 8460; AVX512BW-NEXT: kandw %k1, %k0, %k1 8461; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 8462; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 8463; AVX512BW-NEXT: korw %k6, %k1, %k1 8464; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 8465; AVX512BW-NEXT: kandw %k2, %k1, %k1 8466; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 8467; AVX512BW-NEXT: korw %k6, %k1, %k1 8468; AVX512BW-NEXT: kandw %k4, %k1, %k1 8469; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 8470; AVX512BW-NEXT: korw %k6, %k1, %k1 8471; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 8472; AVX512BW-NEXT: kandw %k6, %k1, %k1 8473; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 8474; AVX512BW-NEXT: korw %k6, %k1, %k1 8475; AVX512BW-NEXT: kandw %k3, %k1, %k1 8476; 
AVX512BW-NEXT: kshiftrw $10, %k0, %k0 8477; AVX512BW-NEXT: korw %k0, %k1, %k0 8478; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 8479; AVX512BW-NEXT: kandw %k1, %k0, %k0 8480; AVX512BW-NEXT: kshiftrq $41, %k5, %k1 8481; AVX512BW-NEXT: kmovq %k5, %k4 8482; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 8483; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 8484; AVX512BW-NEXT: korw %k6, %k0, %k0 8485; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 8486; AVX512BW-NEXT: kandw %k3, %k0, %k0 8487; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 8488; AVX512BW-NEXT: korw %k6, %k0, %k0 8489; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 8490; AVX512BW-NEXT: kandw %k3, %k0, %k0 8491; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 8492; AVX512BW-NEXT: korw %k6, %k0, %k0 8493; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 8494; AVX512BW-NEXT: kandw %k5, %k0, %k0 8495; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 8496; AVX512BW-NEXT: korw %k6, %k0, %k0 8497; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 8498; AVX512BW-NEXT: kandw %k5, %k0, %k0 8499; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 8500; AVX512BW-NEXT: korw %k6, %k0, %k0 8501; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 8502; AVX512BW-NEXT: kandw %k5, %k0, %k0 8503; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 8504; AVX512BW-NEXT: korw %k1, %k0, %k0 8505; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 8506; AVX512BW-NEXT: kandw %k1, %k0, %k0 8507; AVX512BW-NEXT: kshiftrq $42, %k4, %k1 8508; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 8509; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 8510; AVX512BW-NEXT: korw %k7, %k0, %k0 8511; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 8512; AVX512BW-NEXT: kandw %k5, %k0, %k0 8513; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 8514; AVX512BW-NEXT: korw %k7, %k0, %k0 8515; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 8516; AVX512BW-NEXT: kandw %k5, %k0, %k0 8517; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 8518; AVX512BW-NEXT: korw %k7, %k0, %k0 8519; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 8520; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 8521; AVX512BW-NEXT: korw %k6, %k0, %k7 8522; AVX512BW-NEXT: vmovdqa32 960(%rsi), %zmm15 {%k7} {z} 8523; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 8524; AVX512BW-NEXT: kandw %k5, %k1, %k0 8525; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 8526; AVX512BW-NEXT: korw %k1, %k0, %k0 8527; AVX512BW-NEXT: kandw %k2, %k0, %k0 8528; AVX512BW-NEXT: kmovq %k4, %k7 8529; AVX512BW-NEXT: kshiftrq $43, %k4, %k1 8530; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 8531; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 8532; AVX512BW-NEXT: korw %k6, %k0, %k0 8533; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 8534; AVX512BW-NEXT: kandw %k2, %k0, %k0 8535; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 8536; AVX512BW-NEXT: korw %k6, %k0, %k0 8537; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 8538; AVX512BW-NEXT: kandw %k4, %k0, %k0 8539; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 8540; AVX512BW-NEXT: korw %k6, %k0, %k0 8541; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 8542; AVX512BW-NEXT: kandw %k2, %k0, %k0 8543; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 8544; AVX512BW-NEXT: korw %k6, %k0, %k0 8545; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 8546; AVX512BW-NEXT: kandw %k6, %k0, %k0 8547; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 8548; AVX512BW-NEXT: korw %k6, %k0, %k0 8549; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte 
Reload 8550; AVX512BW-NEXT: kandw %k6, %k0, %k0 8551; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 8552; AVX512BW-NEXT: korw %k1, %k0, %k0 8553; AVX512BW-NEXT: kandw %k3, %k0, %k0 8554; AVX512BW-NEXT: kshiftrq $44, %k7, %k1 8555; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 8556; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 8557; AVX512BW-NEXT: korw %k6, %k0, %k0 8558; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 8559; AVX512BW-NEXT: kandw %k3, %k0, %k0 8560; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 8561; AVX512BW-NEXT: korw %k6, %k0, %k0 8562; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 8563; AVX512BW-NEXT: kandw %k6, %k0, %k0 8564; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 8565; AVX512BW-NEXT: korw %k6, %k0, %k0 8566; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 8567; AVX512BW-NEXT: kandw %k6, %k0, %k0 8568; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 8569; AVX512BW-NEXT: korw %k6, %k0, %k0 8570; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 8571; AVX512BW-NEXT: kandw %k6, %k0, %k0 8572; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 8573; AVX512BW-NEXT: korw %k6, %k0, %k0 8574; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 8575; AVX512BW-NEXT: kandw %k6, %k0, %k0 8576; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 8577; AVX512BW-NEXT: korw %k1, %k0, %k0 8578; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 8579; AVX512BW-NEXT: kandw %k1, %k0, %k0 8580; AVX512BW-NEXT: kshiftrq $45, %k7, %k1 8581; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 8582; AVX512BW-NEXT: korw %k6, %k0, %k0 8583; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 8584; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 8585; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 8586; AVX512BW-NEXT: korw %k6, %k0, %k7 8587; AVX512BW-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k7} {z} 8588; AVX512BW-NEXT: kandw %k5, %k1, %k0 8589; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 8590; AVX512BW-NEXT: korw %k1, %k0, %k0 8591; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 8592; AVX512BW-NEXT: kandw %k1, %k0, %k0 8593; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 8594; AVX512BW-NEXT: korw %k1, %k0, %k0 8595; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 8596; AVX512BW-NEXT: kandw %k1, %k0, %k0 8597; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 8598; AVX512BW-NEXT: korw %k1, %k0, %k0 8599; AVX512BW-NEXT: kandw %k4, %k0, %k0 8600; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload 8601; AVX512BW-NEXT: kshiftrq $46, %k5, %k1 8602; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 8603; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 8604; AVX512BW-NEXT: korw %k6, %k0, %k0 8605; AVX512BW-NEXT: kandw %k2, %k0, %k0 8606; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 8607; AVX512BW-NEXT: korw %k6, %k0, %k0 8608; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 8609; AVX512BW-NEXT: kandw %k2, %k0, %k0 8610; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 8611; AVX512BW-NEXT: korw %k6, %k0, %k0 8612; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 8613; AVX512BW-NEXT: kandw %k4, %k0, %k0 8614; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 8615; AVX512BW-NEXT: korw %k6, %k0, %k0 8616; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 8617; AVX512BW-NEXT: kandw %k4, %k0, %k0 8618; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 8619; AVX512BW-NEXT: korw %k6, %k0, %k0 8620; AVX512BW-NEXT: kandw %k3, %k0, %k0 8621; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 8622; AVX512BW-NEXT: korw %k1, %k0, %k0 8623; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 8624; AVX512BW-NEXT: kandw %k1, 
%k0, %k1 8625; AVX512BW-NEXT: kshiftrq $47, %k5, %k6 8626; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 8627; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 8628; AVX512BW-NEXT: korw %k7, %k1, %k1 8629; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 8630; AVX512BW-NEXT: kandw %k4, %k1, %k1 8631; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 8632; AVX512BW-NEXT: korw %k7, %k1, %k1 8633; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 8634; AVX512BW-NEXT: kandw %k3, %k1, %k1 8635; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 8636; AVX512BW-NEXT: korw %k7, %k1, %k1 8637; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 8638; AVX512BW-NEXT: kandw %k5, %k1, %k1 8639; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 8640; AVX512BW-NEXT: korw %k7, %k1, %k1 8641; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 8642; AVX512BW-NEXT: kandw %k5, %k1, %k1 8643; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 8644; AVX512BW-NEXT: korw %k6, %k1, %k1 8645; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 8646; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 8647; AVX512BW-NEXT: korw %k0, %k1, %k1 8648; AVX512BW-NEXT: vmovdqa32 1088(%rsi), %zmm17 {%k1} {z} 8649; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload 8650; AVX512BW-NEXT: kshiftrq $48, %k7, %k0 8651; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 8652; AVX512BW-NEXT: kandw %k1, %k0, %k1 8653; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 8654; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 8655; AVX512BW-NEXT: korw %k6, %k1, %k1 8656; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 8657; AVX512BW-NEXT: kandw %k6, %k1, %k1 8658; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 8659; AVX512BW-NEXT: korw %k6, %k1, %k1 8660; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 8661; AVX512BW-NEXT: kandw %k6, %k1, %k1 8662; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 8663; AVX512BW-NEXT: korw %k6, %k1, %k1 8664; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 8665; AVX512BW-NEXT: kandw %k6, %k1, %k1 8666; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 8667; AVX512BW-NEXT: korw %k6, %k1, %k1 8668; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 8669; AVX512BW-NEXT: kandw %k6, %k1, %k1 8670; AVX512BW-NEXT: kshiftrw $10, %k0, %k0 8671; AVX512BW-NEXT: korw %k0, %k1, %k0 8672; AVX512BW-NEXT: kandw %k2, %k0, %k0 8673; AVX512BW-NEXT: kshiftrq $49, %k7, %k1 8674; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 8675; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 8676; AVX512BW-NEXT: korw %k6, %k0, %k0 8677; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 8678; AVX512BW-NEXT: kandw %k2, %k0, %k0 8679; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 8680; AVX512BW-NEXT: korw %k6, %k0, %k0 8681; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 8682; AVX512BW-NEXT: kandw %k5, %k0, %k0 8683; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 8684; AVX512BW-NEXT: korw %k6, %k0, %k0 8685; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 8686; AVX512BW-NEXT: kandw %k5, %k0, %k0 8687; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 8688; AVX512BW-NEXT: korw %k6, %k0, %k0 8689; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 8690; AVX512BW-NEXT: kandw %k6, %k0, %k0 8691; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 8692; AVX512BW-NEXT: korw %k6, %k0, %k0 8693; AVX512BW-NEXT: kandw %k4, %k0, %k0 8694; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 8695; AVX512BW-NEXT: korw %k1, %k0, %k0 8696; AVX512BW-NEXT: kandw %k3, %k0, %k0 8697; AVX512BW-NEXT: kmovq %k7, %k5 8698; AVX512BW-NEXT: kshiftrq $50, %k7, %k1 
8699; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 8700; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 8701; AVX512BW-NEXT: korw %k7, %k0, %k0 8702; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 8703; AVX512BW-NEXT: kandw %k4, %k0, %k0 8704; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 8705; AVX512BW-NEXT: korw %k7, %k0, %k0 8706; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 8707; AVX512BW-NEXT: kandw %k3, %k0, %k0 8708; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 8709; AVX512BW-NEXT: korw %k7, %k0, %k0 8710; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 8711; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 8712; AVX512BW-NEXT: korw %k6, %k0, %k7 8713; AVX512BW-NEXT: vmovdqa32 1152(%rsi), %zmm18 {%k7} {z} 8714; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 8715; AVX512BW-NEXT: kandw %k0, %k1, %k0 8716; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 8717; AVX512BW-NEXT: korw %k1, %k0, %k0 8718; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 8719; AVX512BW-NEXT: kandw %k1, %k0, %k0 8720; AVX512BW-NEXT: kshiftrq $51, %k5, %k1 8721; AVX512BW-NEXT: kmovq %k5, %k7 8722; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 8723; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 8724; AVX512BW-NEXT: korw %k6, %k0, %k0 8725; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 8726; AVX512BW-NEXT: kandw %k5, %k0, %k0 8727; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 8728; AVX512BW-NEXT: korw %k6, %k0, %k0 8729; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 8730; AVX512BW-NEXT: kandw %k3, %k0, %k0 8731; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 8732; AVX512BW-NEXT: korw %k6, %k0, %k0 8733; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 8734; AVX512BW-NEXT: kandw %k3, %k0, %k0 8735; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 8736; AVX512BW-NEXT: korw %k6, %k0, %k0 8737; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 8738; AVX512BW-NEXT: kandw %k3, %k0, %k0 8739; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 8740; AVX512BW-NEXT: korw %k6, %k0, %k0 8741; AVX512BW-NEXT: kandw %k2, %k0, %k0 8742; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 8743; AVX512BW-NEXT: korw %k1, %k0, %k0 8744; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 8745; AVX512BW-NEXT: kandw %k3, %k0, %k0 8746; AVX512BW-NEXT: kshiftrq $52, %k7, %k1 8747; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 8748; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 8749; AVX512BW-NEXT: korw %k6, %k0, %k0 8750; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 8751; AVX512BW-NEXT: kandw %k2, %k0, %k0 8752; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 8753; AVX512BW-NEXT: korw %k6, %k0, %k0 8754; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 8755; AVX512BW-NEXT: kandw %k2, %k0, %k0 8756; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 8757; AVX512BW-NEXT: korw %k6, %k0, %k0 8758; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 8759; AVX512BW-NEXT: kandw %k2, %k0, %k0 8760; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 8761; AVX512BW-NEXT: korw %k6, %k0, %k0 8762; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 8763; AVX512BW-NEXT: kandw %k2, %k0, %k0 8764; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 8765; AVX512BW-NEXT: korw %k6, %k0, %k0 8766; AVX512BW-NEXT: kandw %k4, %k0, %k0 8767; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 8768; AVX512BW-NEXT: korw %k1, %k0, %k0 8769; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 8770; AVX512BW-NEXT: kandw %k2, %k0, %k0 8771; AVX512BW-NEXT: kshiftrq $53, %k7, %k1 8772; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 8773; 
AVX512BW-NEXT: korw %k6, %k0, %k0 8774; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 8775; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 8776; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 8777; AVX512BW-NEXT: korw %k6, %k0, %k7 8778; AVX512BW-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k7} {z} 8779; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 8780; AVX512BW-NEXT: kandw %k0, %k1, %k0 8781; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 8782; AVX512BW-NEXT: korw %k1, %k0, %k0 8783; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 8784; AVX512BW-NEXT: kandw %k1, %k0, %k0 8785; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 8786; AVX512BW-NEXT: korw %k1, %k0, %k0 8787; AVX512BW-NEXT: kandw %k5, %k0, %k0 8788; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 8789; AVX512BW-NEXT: korw %k1, %k0, %k0 8790; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 8791; AVX512BW-NEXT: kandw %k4, %k0, %k0 8792; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload 8793; AVX512BW-NEXT: kshiftrq $54, %k7, %k1 8794; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 8795; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 8796; AVX512BW-NEXT: korw %k6, %k0, %k0 8797; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 8798; AVX512BW-NEXT: kandw %k6, %k0, %k0 8799; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 8800; AVX512BW-NEXT: korw %k6, %k0, %k0 8801; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 8802; AVX512BW-NEXT: kandw %k5, %k0, %k0 8803; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 8804; AVX512BW-NEXT: korw %k6, %k0, %k0 8805; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 8806; AVX512BW-NEXT: kandw %k5, %k0, %k0 8807; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 8808; AVX512BW-NEXT: korw %k6, %k0, %k0 8809; AVX512BW-NEXT: kandw %k3, %k0, %k0 8810; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 8811; AVX512BW-NEXT: korw %k6, %k0, %k0 8812; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 8813; AVX512BW-NEXT: kandw %k3, %k0, %k0 8814; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 8815; AVX512BW-NEXT: korw %k1, %k0, %k0 8816; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 8817; AVX512BW-NEXT: kandw %k3, %k0, %k1 8818; AVX512BW-NEXT: kshiftrq $55, %k7, %k6 8819; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 8820; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 8821; AVX512BW-NEXT: korw %k7, %k1, %k1 8822; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 8823; AVX512BW-NEXT: kandw %k5, %k1, %k1 8824; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 8825; AVX512BW-NEXT: korw %k7, %k1, %k1 8826; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 8827; AVX512BW-NEXT: kandw %k5, %k1, %k1 8828; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 8829; AVX512BW-NEXT: korw %k7, %k1, %k1 8830; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 8831; AVX512BW-NEXT: kandw %k5, %k1, %k1 8832; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 8833; AVX512BW-NEXT: korw %k7, %k1, %k1 8834; AVX512BW-NEXT: kandw %k2, %k1, %k1 8835; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 8836; AVX512BW-NEXT: korw %k6, %k1, %k1 8837; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 8838; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 8839; AVX512BW-NEXT: korw %k0, %k1, %k1 8840; AVX512BW-NEXT: vmovdqa32 1280(%rsi), %zmm20 {%k1} {z} 8841; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload 8842; AVX512BW-NEXT: kshiftrq $56, %k5, %k0 8843; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 8844; AVX512BW-NEXT: kandw %k2, %k0, %k1 8845; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 8846; AVX512BW-NEXT: kshiftrw $14, 
%k0, %k6 8847; AVX512BW-NEXT: korw %k6, %k1, %k1 8848; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 8849; AVX512BW-NEXT: kandw %k6, %k1, %k1 8850; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 8851; AVX512BW-NEXT: korw %k6, %k1, %k1 8852; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 8853; AVX512BW-NEXT: kandw %k6, %k1, %k1 8854; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 8855; AVX512BW-NEXT: korw %k6, %k1, %k1 8856; AVX512BW-NEXT: kandw %k4, %k1, %k1 8857; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 8858; AVX512BW-NEXT: korw %k6, %k1, %k1 8859; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 8860; AVX512BW-NEXT: kandw %k4, %k1, %k1 8861; AVX512BW-NEXT: kshiftrw $10, %k0, %k0 8862; AVX512BW-NEXT: korw %k0, %k1, %k0 8863; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 8864; AVX512BW-NEXT: kandw %k1, %k0, %k0 8865; AVX512BW-NEXT: kshiftrq $57, %k5, %k1 8866; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 8867; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 8868; AVX512BW-NEXT: korw %k6, %k0, %k0 8869; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 8870; AVX512BW-NEXT: kandw %k4, %k0, %k0 8871; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 8872; AVX512BW-NEXT: korw %k6, %k0, %k0 8873; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 8874; AVX512BW-NEXT: kandw %k4, %k0, %k0 8875; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 8876; AVX512BW-NEXT: korw %k6, %k0, %k0 8877; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 8878; AVX512BW-NEXT: kandw %k4, %k0, %k0 8879; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 8880; AVX512BW-NEXT: korw %k6, %k0, %k0 8881; AVX512BW-NEXT: kandw %k3, %k0, %k0 8882; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 8883; AVX512BW-NEXT: korw %k6, %k0, %k0 8884; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 8885; AVX512BW-NEXT: kandw %k3, %k0, %k0 8886; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 8887; AVX512BW-NEXT: korw %k1, %k0, %k0 8888; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 8889; AVX512BW-NEXT: kandw %k3, %k0, %k0 8890; AVX512BW-NEXT: kshiftrq $58, %k5, %k1 8891; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 8892; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 8893; AVX512BW-NEXT: korw %k7, %k0, %k0 8894; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 8895; AVX512BW-NEXT: kandw %k7, %k0, %k0 8896; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 8897; AVX512BW-NEXT: korw %k7, %k0, %k0 8898; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 8899; AVX512BW-NEXT: kandw %k7, %k0, %k0 8900; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 8901; AVX512BW-NEXT: korw %k7, %k0, %k0 8902; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 8903; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 8904; AVX512BW-NEXT: korw %k6, %k0, %k7 8905; AVX512BW-NEXT: vmovdqa32 1344(%rsi), %zmm21 {%k7} {z} 8906; AVX512BW-NEXT: kandw %k2, %k1, %k0 8907; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 8908; AVX512BW-NEXT: korw %k1, %k0, %k0 8909; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 8910; AVX512BW-NEXT: kandw %k1, %k0, %k0 8911; AVX512BW-NEXT: kshiftrq $59, %k5, %k1 8912; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 8913; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 8914; AVX512BW-NEXT: korw %k6, %k0, %k0 8915; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 8916; AVX512BW-NEXT: kandw %k2, %k0, %k0 8917; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 8918; AVX512BW-NEXT: korw %k6, %k0, %k0 8919; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 8920; AVX512BW-NEXT: kandw %k6, %k0, %k0 
8921; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 8922; AVX512BW-NEXT: korw %k6, %k0, %k0 8923; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 8924; AVX512BW-NEXT: kandw %k6, %k0, %k0 8925; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 8926; AVX512BW-NEXT: korw %k6, %k0, %k0 8927; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 8928; AVX512BW-NEXT: kandw %k6, %k0, %k0 8929; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 8930; AVX512BW-NEXT: korw %k6, %k0, %k0 8931; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 8932; AVX512BW-NEXT: kandw %k6, %k0, %k0 8933; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 8934; AVX512BW-NEXT: korw %k1, %k0, %k0 8935; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 8936; AVX512BW-NEXT: kandw %k1, %k0, %k0 8937; AVX512BW-NEXT: kshiftrq $60, %k5, %k1 8938; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 8939; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 8940; AVX512BW-NEXT: korw %k6, %k0, %k0 8941; AVX512BW-NEXT: kandw %k4, %k0, %k0 8942; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 8943; AVX512BW-NEXT: korw %k6, %k0, %k0 8944; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 8945; AVX512BW-NEXT: kandw %k4, %k0, %k0 8946; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 8947; AVX512BW-NEXT: korw %k6, %k0, %k0 8948; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 8949; AVX512BW-NEXT: kandw %k4, %k0, %k0 8950; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 8951; AVX512BW-NEXT: korw %k6, %k0, %k0 8952; AVX512BW-NEXT: kandw %k3, %k0, %k0 8953; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 8954; AVX512BW-NEXT: korw %k6, %k0, %k0 8955; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 8956; AVX512BW-NEXT: kandw %k3, %k0, %k0 8957; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 8958; AVX512BW-NEXT: korw %k1, %k0, %k0 8959; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 8960; AVX512BW-NEXT: kandw %k1, %k0, %k0 8961; AVX512BW-NEXT: kshiftrq $61, %k5, %k1 8962; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 8963; AVX512BW-NEXT: korw %k6, %k0, %k0 8964; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 8965; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 8966; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 8967; AVX512BW-NEXT: korw %k6, %k0, %k7 8968; AVX512BW-NEXT: vmovdqa32 1408(%rsi), %zmm22 {%k7} {z} 8969; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 8970; AVX512BW-NEXT: kandw %k0, %k1, %k0 8971; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 8972; AVX512BW-NEXT: korw %k1, %k0, %k0 8973; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 8974; AVX512BW-NEXT: kandw %k1, %k0, %k0 8975; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 8976; AVX512BW-NEXT: korw %k1, %k0, %k0 8977; AVX512BW-NEXT: kandw %k2, %k0, %k0 8978; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 8979; AVX512BW-NEXT: korw %k1, %k0, %k0 8980; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 8981; AVX512BW-NEXT: kandw %k1, %k0, %k0 8982; AVX512BW-NEXT: kshiftrq $62, %k5, %k1 8983; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 8984; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 8985; AVX512BW-NEXT: korw %k6, %k0, %k0 8986; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 8987; AVX512BW-NEXT: kandw %k2, %k0, %k0 8988; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 8989; AVX512BW-NEXT: korw %k6, %k0, %k0 8990; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 8991; AVX512BW-NEXT: kandw %k2, %k0, %k0 8992; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 8993; AVX512BW-NEXT: korw %k6, %k0, %k0 8994; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 8995; 
AVX512BW-NEXT: kandw %k2, %k0, %k0 8996; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 8997; AVX512BW-NEXT: korw %k6, %k0, %k0 8998; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 8999; AVX512BW-NEXT: kandw %k2, %k0, %k0 9000; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 9001; AVX512BW-NEXT: korw %k6, %k0, %k0 9002; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 9003; AVX512BW-NEXT: kandw %k2, %k0, %k6 9004; AVX512BW-NEXT: kshiftrq $63, %k5, %k0 9005; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 9006; AVX512BW-NEXT: korw %k1, %k6, %k1 9007; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 9008; AVX512BW-NEXT: kandw %k2, %k1, %k2 9009; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 9010; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 9011; AVX512BW-NEXT: korw %k6, %k2, %k2 9012; AVX512BW-NEXT: kandw %k4, %k2, %k2 9013; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 9014; AVX512BW-NEXT: korw %k6, %k2, %k2 9015; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 9016; AVX512BW-NEXT: kandw %k4, %k2, %k2 9017; AVX512BW-NEXT: kshiftrw $3, %k1, %k5 9018; AVX512BW-NEXT: korw %k5, %k2, %k2 9019; AVX512BW-NEXT: kandw %k3, %k2, %k2 9020; AVX512BW-NEXT: kshiftrw $2, %k1, %k4 9021; AVX512BW-NEXT: korw %k4, %k2, %k2 9022; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 9023; AVX512BW-NEXT: kandw %k3, %k2, %k2 9024; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 9025; AVX512BW-NEXT: korw %k0, %k2, %k0 9026; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 9027; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 9028; AVX512BW-NEXT: korw %k1, %k0, %k1 9029; AVX512BW-NEXT: vmovdqa32 1472(%rsi), %zmm23 {%k1} {z} 9030; AVX512BW-NEXT: vmovdqa64 %zmm23, 1472(%rdx) 9031; AVX512BW-NEXT: vmovdqa64 %zmm22, 1408(%rdx) 9032; AVX512BW-NEXT: vmovdqa64 %zmm21, 1344(%rdx) 9033; AVX512BW-NEXT: vmovdqa64 %zmm20, 1280(%rdx) 9034; AVX512BW-NEXT: vmovdqa64 %zmm19, 1216(%rdx) 9035; AVX512BW-NEXT: vmovdqa64 %zmm18, 1152(%rdx) 9036; AVX512BW-NEXT: vmovdqa64 %zmm17, 1088(%rdx) 9037; AVX512BW-NEXT: vmovdqa64 %zmm16, 1024(%rdx) 9038; AVX512BW-NEXT: vmovdqa64 %zmm15, 960(%rdx) 9039; AVX512BW-NEXT: vmovdqa64 %zmm14, 896(%rdx) 9040; AVX512BW-NEXT: vmovdqa64 %zmm13, 832(%rdx) 9041; AVX512BW-NEXT: vmovdqa64 %zmm12, 768(%rdx) 9042; AVX512BW-NEXT: vmovdqa64 %zmm11, 704(%rdx) 9043; AVX512BW-NEXT: vmovdqa64 %zmm10, 640(%rdx) 9044; AVX512BW-NEXT: vmovdqa64 %zmm9, 576(%rdx) 9045; AVX512BW-NEXT: vmovdqa64 %zmm8, 512(%rdx) 9046; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%rdx) 9047; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdx) 9048; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rdx) 9049; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rdx) 9050; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx) 9051; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rdx) 9052; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdx) 9053; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) 9054; AVX512BW-NEXT: vzeroupper 9055; AVX512BW-NEXT: retq 9056 %src.mask = load <64 x i1>, ptr %in.maskvec, align 64 9057 %tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <384 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, 
i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63> 9058 %data = call <384 x i32> @llvm.masked.load.v384i32.p0(ptr %in.vec, i32 64, <384 x i1> %tgt.mask, <384 x i32> poison) 9059 store <384 x i32> %data, ptr %out.vec, align 64 9060 ret void 9061} 9062 9063define void @mask_replication_factor7_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { 9064; AVX512F-ONLY-LABEL: mask_replication_factor7_vf2: 9065; AVX512F-ONLY: # %bb.0: 9066; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 9067; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 9068; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,0,0] 9069; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 9070; AVX512F-ONLY-NEXT: vpslld $31, %zmm0, %zmm0 9071; AVX512F-ONLY-NEXT: movw $16383, %ax # imm = 0x3FFF 9072; AVX512F-ONLY-NEXT: kmovw %eax, %k1 9073; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} 9074; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} 9075; AVX512F-ONLY-NEXT: vextracti32x4 $2, %zmm0, 32(%rdx) 9076; AVX512F-ONLY-NEXT: vextracti32x4 $3, %zmm0, %xmm1 9077; AVX512F-ONLY-NEXT: vmovq %xmm1, 48(%rdx) 9078; AVX512F-ONLY-NEXT: vmovdqa %ymm0, 
(%rdx)
; AVX512F-ONLY-NEXT:    vzeroupper
; AVX512F-ONLY-NEXT:    retq
;
; AVX512DQ-LABEL: mask_replication_factor7_vf2:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    kmovw (%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,0,0]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512DQ-NEXT:    movw $16383, %ax # imm = 0x3FFF
; AVX512DQ-NEXT:    kmovw %eax, %k1
; AVX512DQ-NEXT:    vpcmpgtd %zmm0, %zmm1, %k1 {%k1}
; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT:    vextracti32x4 $2, %zmm0, 32(%rdx)
; AVX512DQ-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
; AVX512DQ-NEXT:    vmovq %xmm1, 48(%rdx)
; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rdx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: mask_replication_factor7_vf2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    kmovw (%rdi), %k1
; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,0,0]
; AVX512BW-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vpslld $31, %zmm0, %zmm0
; AVX512BW-NEXT:    movw $16383, %ax # imm = 0x3FFF
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT:    vextracti32x4 $2, %zmm0, 32(%rdx)
; AVX512BW-NEXT:    vmovdqa %ymm0, (%rdx)
; AVX512BW-NEXT:    vextracti32x4 $3, %zmm0, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, 48(%rdx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <2 x i32> <i32 0, i32 1>
  %tgt.mask = shufflevector <2 x i1> %src.mask, <2 x i1> poison, <14 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %data = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr %in.vec, i32 64, <14 x i1> %tgt.mask, <14 x i32> poison)
  %data.padded = shufflevector <14 x i32> %data, <14 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 undef, i32 undef>
  store <14 x i32> %data, ptr %out.vec, align 64
  ret void
}

define void @mask_replication_factor7_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor7_vf4:
; AVX512F-ONLY:       # %bb.0:
; AVX512F-ONLY-NEXT:    kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,0,0,0,0]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    vpslld $31, %zmm1, %zmm1
; AVX512F-ONLY-NEXT:    movw $4095, %ax # imm = 0xFFF
; AVX512F-ONLY-NEXT:    kmovw %eax, %k1
; AVX512F-ONLY-NEXT:    vptestmd %zmm1, %zmm1, %k1 {%k1}
; AVX512F-ONLY-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
; AVX512F-ONLY-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT:    vptestmd %zmm0, %zmm0, %k2
; AVX512F-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k2} {z}
; AVX512F-ONLY-NEXT:    vextracti32x4 $2, %zmm0, 96(%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
; AVX512F-ONLY-NEXT:    vmovdqa %ymm0, 64(%rdx)
; AVX512F-ONLY-NEXT:    vzeroupper
; AVX512F-ONLY-NEXT:    retq
;
; AVX512DQ-LABEL: mask_replication_factor7_vf4:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    kmovw (%rdi), %k0
; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,0,0,0,0]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512DQ-NEXT:    movw $4095, %ax # imm = 0xFFF
; AVX512DQ-NEXT:    kmovw %eax, %k1
; AVX512DQ-NEXT:    vpcmpgtd %zmm1, %zmm2, %k1 {%k1}
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
; AVX512DQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k2
; AVX512DQ-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k2} {z}
; AVX512DQ-NEXT:    vextracti32x4 $2, %zmm0, 96(%rdx)
; AVX512DQ-NEXT:    vmovdqa64 %zmm1, (%rdx)
; AVX512DQ-NEXT:    vmovdqa %ymm0, 64(%rdx)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: mask_replication_factor7_vf4:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    kmovd (%rdi), %k0
; AVX512BW-NEXT:    vpmovm2w %k0, %zmm0
; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2,2,2,2,2,2,3,3,3,3,3,3,3,0,0,0,0]
; AVX512BW-NEXT:    vpermw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT:    movl $268435455, %eax # imm = 0xFFFFFFF
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    vpcmpgtw %zmm0, %zmm1, %k1 {%k1}
; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT:    kshiftrd $16, %k1, %k1
; AVX512BW-NEXT:    vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
; AVX512BW-NEXT:    vextracti32x4 $2, %zmm1, 96(%rdx)
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT:    vmovdqa %ymm1, 64(%rdx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %tgt.mask = shufflevector <4 x i1> %src.mask, <4 x i1> poison, <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  %data = call <28 x i32> @llvm.masked.load.v28i32.p0(ptr %in.vec, i32 64, <28 x i1> %tgt.mask, <28 x i32> poison)
  %data.padded = shufflevector <28 x i32> %data, <28 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 undef, i32 undef, i32 undef, i32 undef>
  store <28 x i32> %data, ptr %out.vec, align 64
  ret void
}

define void @mask_replication_factor7_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-SLOW-LABEL: mask_replication_factor7_vf8:
; AVX512F-SLOW:       # %bb.0:
; AVX512F-SLOW-NEXT:    kmovw (%rdi), %k1
; AVX512F-SLOW-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512F-SLOW-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
; AVX512F-SLOW-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-SLOW-NEXT:    vptestmd %zmm1, %zmm1, %k2
; AVX512F-SLOW-NEXT:    vpternlogd {{.*#+}} zmm1 {%k2} {z} = -1
; AVX512F-SLOW-NEXT:    movw $1, %ax
; AVX512F-SLOW-NEXT:    kmovw %eax, %k2
; AVX512F-SLOW-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k2}
; AVX512F-SLOW-NEXT:    vptestmd %zmm1, %zmm1, %k2
; AVX512F-SLOW-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
; AVX512F-SLOW-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-SLOW-NEXT:    vptestmd %zmm1, %zmm1, %k3
; AVX512F-SLOW-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
; AVX512F-SLOW-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512F-SLOW-NEXT:    vptestmd %zmm0, %zmm0, %k4
; AVX512F-SLOW-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512F-SLOW-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512F-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,3,3,6,7,7,7]
; AVX512F-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,3,3]
; AVX512F-SLOW-NEXT:    vptestmd %ymm0, %ymm0, %k1
; AVX512F-SLOW-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k2} {z}
; AVX512F-SLOW-NEXT:    vmovdqa32 192(%rsi), %zmm1 {%k1} {z}
; AVX512F-SLOW-NEXT:    vmovdqa32 128(%rsi), %zmm2 {%k4} {z}
; AVX512F-SLOW-NEXT:    vmovdqa32 64(%rsi), %zmm3 {%k3} {z}
; AVX512F-SLOW-NEXT:    vmovdqa64 %zmm3, 64(%rdx)
; AVX512F-SLOW-NEXT:    vmovdqa64 %zmm2, 128(%rdx)
; AVX512F-SLOW-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512F-SLOW-NEXT:    vmovdqa %ymm1, 192(%rdx)
; AVX512F-SLOW-NEXT:    vzeroupper
; AVX512F-SLOW-NEXT:    retq
;
; AVX512F-FAST-LABEL: mask_replication_factor7_vf8:
; AVX512F-FAST:       # %bb.0:
; AVX512F-FAST-NEXT:    kmovw (%rdi), %k1
; AVX512F-FAST-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512F-FAST-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
; AVX512F-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-FAST-NEXT:    vptestmd %zmm1, %zmm1, %k2
; AVX512F-FAST-NEXT:    vpternlogd {{.*#+}} zmm1 {%k2} {z} = -1
; AVX512F-FAST-NEXT:    movw $1, %ax
; AVX512F-FAST-NEXT:    kmovw %eax, %k2
; AVX512F-FAST-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k2}
; AVX512F-FAST-NEXT:    vptestmd %zmm1, %zmm1, %k2
; AVX512F-FAST-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
; AVX512F-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512F-FAST-NEXT:    vptestmd %zmm1, %zmm1, %k3
; AVX512F-FAST-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
; AVX512F-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512F-FAST-NEXT:    vptestmd %zmm0, %zmm0, %k4
; AVX512F-FAST-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512F-FAST-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512F-FAST-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [6,7,7,7,7,7,7,7]
; AVX512F-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX512F-FAST-NEXT:    vptestmd %ymm0, %ymm0, %k1
; AVX512F-FAST-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k2} {z}
; AVX512F-FAST-NEXT:    vmovdqa32 192(%rsi), %zmm1 {%k1} {z}
; AVX512F-FAST-NEXT:    vmovdqa32 128(%rsi), %zmm2 {%k4} {z}
; AVX512F-FAST-NEXT:    vmovdqa32 64(%rsi), %zmm3 {%k3} {z}
; AVX512F-FAST-NEXT:    vmovdqa64 %zmm3, 64(%rdx)
; AVX512F-FAST-NEXT:    vmovdqa64 %zmm2, 128(%rdx)
; AVX512F-FAST-NEXT:    vmovdqa %ymm1, 192(%rdx)
; AVX512F-FAST-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512F-FAST-NEXT:    vzeroupper
; AVX512F-FAST-NEXT:    retq
;
; AVX512DQ-SLOW-LABEL: mask_replication_factor7_vf8:
; AVX512DQ-SLOW:       # %bb.0:
; AVX512DQ-SLOW-NEXT:    kmovb (%rdi), %k0
; AVX512DQ-SLOW-NEXT:    vpmovm2d %k0, %zmm0
; AVX512DQ-SLOW-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
; AVX512DQ-SLOW-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-SLOW-NEXT:    vpmovd2m %zmm1, %k1
; AVX512DQ-SLOW-NEXT:    vpmovm2d %k1, %zmm1
; AVX512DQ-SLOW-NEXT:    movw $1, %ax
; AVX512DQ-SLOW-NEXT:    kmovw %eax, %k1
; AVX512DQ-SLOW-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512DQ-SLOW-NEXT:    vpmovd2m %zmm1, %k1
; AVX512DQ-SLOW-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
; AVX512DQ-SLOW-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-SLOW-NEXT:    vpmovd2m %zmm1, %k2
; AVX512DQ-SLOW-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
; AVX512DQ-SLOW-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-SLOW-NEXT:    vpmovd2m %zmm0, %k3
; AVX512DQ-SLOW-NEXT:    vpmovm2d %k0, %ymm0
; AVX512DQ-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,3,3,6,7,7,7]
; AVX512DQ-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,3,3]
; AVX512DQ-SLOW-NEXT:    vpmovd2m %ymm0, %k4
; AVX512DQ-SLOW-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512DQ-SLOW-NEXT:    vmovdqa32 192(%rsi), %zmm1 {%k4} {z}
; AVX512DQ-SLOW-NEXT:    vmovdqa32 128(%rsi), %zmm2 {%k3} {z}
; AVX512DQ-SLOW-NEXT:    vmovdqa32 64(%rsi), %zmm3 {%k2} {z}
; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm3, 64(%rdx)
; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm2, 128(%rdx)
; AVX512DQ-SLOW-NEXT:    vmovdqa %ymm1, 192(%rdx)
; AVX512DQ-SLOW-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-SLOW-NEXT:    vzeroupper
; AVX512DQ-SLOW-NEXT:    retq
;
; AVX512DQ-FAST-LABEL: mask_replication_factor7_vf8:
; AVX512DQ-FAST:       # %bb.0:
; AVX512DQ-FAST-NEXT:    kmovb (%rdi), %k0
; AVX512DQ-FAST-NEXT:    vpmovm2d %k0, %zmm0
; AVX512DQ-FAST-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2]
; AVX512DQ-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-FAST-NEXT:    vpmovd2m %zmm1, %k1
; AVX512DQ-FAST-NEXT:    vpmovm2d %k1, %zmm1
; AVX512DQ-FAST-NEXT:    movw $1, %ax
; AVX512DQ-FAST-NEXT:    kmovw %eax, %k1
; AVX512DQ-FAST-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512DQ-FAST-NEXT:    vpmovd2m %zmm1, %k1
; AVX512DQ-FAST-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4]
; AVX512DQ-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-FAST-NEXT:    vpmovd2m %zmm1, %k2
; AVX512DQ-FAST-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6]
; AVX512DQ-FAST-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-FAST-NEXT:    vpmovd2m %zmm0, %k3
; AVX512DQ-FAST-NEXT:    vpmovm2d %k0, %ymm0
; AVX512DQ-FAST-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [6,7,7,7,7,7,7,7]
; AVX512DQ-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX512DQ-FAST-NEXT:    vpmovd2m %ymm0, %k4
; AVX512DQ-FAST-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512DQ-FAST-NEXT:    vmovdqa32 192(%rsi), %zmm1 {%k4} {z}
; AVX512DQ-FAST-NEXT:    vmovdqa32 128(%rsi), %zmm2 {%k3} {z}
; AVX512DQ-FAST-NEXT:    vmovdqa32 64(%rsi), %zmm3 {%k2} {z}
; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm3, 64(%rdx)
; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm2, 128(%rdx)
; AVX512DQ-FAST-NEXT:    vmovdqa %ymm1, 192(%rdx)
; AVX512DQ-FAST-NEXT:    vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-FAST-NEXT:    vzeroupper
; AVX512DQ-FAST-NEXT:    retq
;
; AVX512BW-ONLY-LABEL: mask_replication_factor7_vf8:
; AVX512BW-ONLY:       # %bb.0:
; AVX512BW-ONLY-NEXT:    kmovw (%rdi), %k0
; AVX512BW-ONLY-NEXT:    vpmovm2b %k0, %zmm0
; AVX512BW-ONLY-NEXT:    vpbroadcastq %xmm0, %zmm0
; AVX512BW-ONLY-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2,18,18,18,18,18,19,19,19,19,19,19,19,20,20,20,20,36,36,36,37,37,37,37,37,37,37,38,38,38,38,38,38,54,55,55,55,55,55,55,55,u,u,u,u,u,u,u,u]
; AVX512BW-ONLY-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-ONLY-NEXT:    movabsq $72057594037927935, %rax # imm = 0xFFFFFFFFFFFFFF
; AVX512BW-ONLY-NEXT:    kmovq %rax, %k1
9340; AVX512BW-ONLY-NEXT: vpcmpgtb %zmm0, %zmm1, %k1 {%k1} 9341; AVX512BW-ONLY-NEXT: kshiftrq $48, %k1, %k2 9342; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm0 {%k2} {z} 9343; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} 9344; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k2 9345; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z} 9346; AVX512BW-ONLY-NEXT: kshiftrq $16, %k1, %k1 9347; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm3 {%k1} {z} 9348; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 64(%rdx) 9349; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) 9350; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) 9351; AVX512BW-ONLY-NEXT: vmovdqa %ymm0, 192(%rdx) 9352; AVX512BW-ONLY-NEXT: vzeroupper 9353; AVX512BW-ONLY-NEXT: retq 9354; 9355; AVX512VBMI-ONLY-LABEL: mask_replication_factor7_vf8: 9356; AVX512VBMI-ONLY: # %bb.0: 9357; AVX512VBMI-ONLY-NEXT: kmovw (%rdi), %k0 9358; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 9359; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2,2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4,4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6,6,7,7,7,7,7,7,7,u,u,u,u,u,u,u,u] 9360; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 9361; AVX512VBMI-ONLY-NEXT: vpxor %xmm1, %xmm1, %xmm1 9362; AVX512VBMI-ONLY-NEXT: movabsq $72057594037927935, %rax # imm = 0xFFFFFFFFFFFFFF 9363; AVX512VBMI-ONLY-NEXT: kmovq %rax, %k1 9364; AVX512VBMI-ONLY-NEXT: vpcmpgtb %zmm0, %zmm1, %k1 {%k1} 9365; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k1, %k2 9366; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm0 {%k2} {z} 9367; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} 9368; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k2 9369; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z} 9370; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k1 9371; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm3 {%k1} {z} 9372; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 64(%rdx) 9373; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) 9374; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) 9375; AVX512VBMI-ONLY-NEXT: vmovdqa %ymm0, 192(%rdx) 9376; AVX512VBMI-ONLY-NEXT: vzeroupper 9377; AVX512VBMI-ONLY-NEXT: retq 9378 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 9379 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 9380 %tgt.mask = shufflevector <8 x i1> %src.mask, <8 x i1> poison, <56 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> 9381 %data = call <56 x i32> @llvm.masked.load.v56i32.p0(ptr %in.vec, i32 64, <56 x i1> %tgt.mask, <56 x i32> poison) 9382 %data.padded = shufflevector <56 x i32> %data, <56 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 9383 store <56 x i32> %data, ptr %out.vec, 
align 64 9384 ret void 9385} 9386 9387define void @mask_replication_factor7_vf16(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { 9388; AVX512F-ONLY-LABEL: mask_replication_factor7_vf16: 9389; AVX512F-ONLY: # %bb.0: 9390; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 9391; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 9392; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] 9393; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 9394; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 9395; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 9396; AVX512F-ONLY-NEXT: movw $1, %ax 9397; AVX512F-ONLY-NEXT: kmovw %eax, %k1 9398; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} 9399; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 9400; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] 9401; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 9402; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 9403; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] 9404; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 9405; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3 9406; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] 9407; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 9408; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k4 9409; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] 9410; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 9411; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k5 9412; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] 9413; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 9414; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k6 9415; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] 9416; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 9417; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k7 9418; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} 9419; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm1 {%k7} {z} 9420; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm2 {%k6} {z} 9421; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm3 {%k5} {z} 9422; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm4 {%k4} {z} 9423; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm5 {%k3} {z} 9424; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm6 {%k1} {z} 9425; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 64(%rdx) 9426; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 128(%rdx) 9427; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 192(%rdx) 9428; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 256(%rdx) 9429; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 320(%rdx) 9430; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 384(%rdx) 9431; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) 9432; AVX512F-ONLY-NEXT: vzeroupper 9433; AVX512F-ONLY-NEXT: retq 9434; 9435; AVX512DQ-LABEL: mask_replication_factor7_vf16: 9436; AVX512DQ: # %bb.0: 9437; AVX512DQ-NEXT: kmovw (%rdi), %k0 9438; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 9439; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] 9440; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 9441; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 9442; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 9443; AVX512DQ-NEXT: movw $1, %ax 9444; AVX512DQ-NEXT: kmovw %eax, %k1 9445; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} 9446; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 9447; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] 9448; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 9449; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 9450; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = 
[4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] 9451; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 9452; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3 9453; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] 9454; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 9455; AVX512DQ-NEXT: vpmovd2m %zmm1, %k4 9456; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] 9457; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 9458; AVX512DQ-NEXT: vpmovd2m %zmm1, %k5 9459; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] 9460; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 9461; AVX512DQ-NEXT: vpmovd2m %zmm1, %k6 9462; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] 9463; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 9464; AVX512DQ-NEXT: vpmovd2m %zmm0, %k7 9465; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} 9466; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm1 {%k7} {z} 9467; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm2 {%k6} {z} 9468; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm3 {%k5} {z} 9469; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm4 {%k4} {z} 9470; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm5 {%k3} {z} 9471; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm6 {%k1} {z} 9472; AVX512DQ-NEXT: vmovdqa64 %zmm6, 64(%rdx) 9473; AVX512DQ-NEXT: vmovdqa64 %zmm5, 128(%rdx) 9474; AVX512DQ-NEXT: vmovdqa64 %zmm4, 192(%rdx) 9475; AVX512DQ-NEXT: vmovdqa64 %zmm3, 256(%rdx) 9476; AVX512DQ-NEXT: vmovdqa64 %zmm2, 320(%rdx) 9477; AVX512DQ-NEXT: vmovdqa64 %zmm1, 384(%rdx) 9478; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) 9479; AVX512DQ-NEXT: vzeroupper 9480; AVX512DQ-NEXT: retq 9481; 9482; AVX512BW-LABEL: mask_replication_factor7_vf16: 9483; AVX512BW: # %bb.0: 9484; AVX512BW-NEXT: kmovw (%rdi), %k1 9485; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 9486; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] 9487; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1 9488; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 9489; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} 9490; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] 9491; AVX512BW-NEXT: vpermd %zmm0, %zmm2, %zmm2 9492; AVX512BW-NEXT: vptestmd %zmm2, %zmm2, %k1 9493; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm2 {%k1} {z} 9494; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] 9495; AVX512BW-NEXT: vpermd %zmm0, %zmm3, %zmm3 9496; AVX512BW-NEXT: vptestmd %zmm3, %zmm3, %k1 9497; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm3 {%k1} {z} 9498; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] 9499; AVX512BW-NEXT: vpermd %zmm0, %zmm4, %zmm4 9500; AVX512BW-NEXT: vptestmd %zmm4, %zmm4, %k1 9501; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z} 9502; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] 9503; AVX512BW-NEXT: vpermd %zmm0, %zmm5, %zmm5 9504; AVX512BW-NEXT: vptestmd %zmm5, %zmm5, %k1 9505; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm5 {%k1} {z} 9506; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] 9507; AVX512BW-NEXT: vpermd %zmm0, %zmm6, %zmm6 9508; AVX512BW-NEXT: vptestmd %zmm6, %zmm6, %k1 9509; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm6 {%k1} {z} 9510; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm7 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] 9511; AVX512BW-NEXT: vpermd %zmm0, %zmm7, %zmm0 9512; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 9513; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} 9514; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx) 
9515; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rdx) 9516; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%rdx) 9517; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rdx) 9518; AVX512BW-NEXT: vmovdqa64 %zmm3, 320(%rdx) 9519; AVX512BW-NEXT: vmovdqa64 %zmm2, 384(%rdx) 9520; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx) 9521; AVX512BW-NEXT: vzeroupper 9522; AVX512BW-NEXT: retq 9523 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 9524 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 9525 %tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <112 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15> 9526 %data = call <112 x i32> @llvm.masked.load.v112i32.p0(ptr %in.vec, i32 64, <112 x i1> %tgt.mask, <112 x i32> poison) 9527 store <112 x i32> %data, ptr %out.vec, align 64 9528 ret void 9529} 9530 9531define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { 9532; AVX512F-ONLY-LABEL: mask_replication_factor7_vf32: 9533; AVX512F-ONLY: # %bb.0: 9534; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 9535; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 9536; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] 9537; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 9538; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 9539; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1 9540; AVX512F-ONLY-NEXT: movw $1, %ax 9541; AVX512F-ONLY-NEXT: kmovw %eax, %k1 9542; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} 9543; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 9544; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1 9545; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 9546; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] 9547; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm4 9548; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm5 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] 9549; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm6 9550; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm7 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] 9551; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm8 9552; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm9 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] 9553; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm10 9554; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm11 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] 9555; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm12 9556; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm13 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] 9557; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm13, %zmm0 9558; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm1, %zmm1 9559; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm2, %zmm2 9560; 
AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm5, %zmm5 9561; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm7, %zmm7 9562; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm9, %zmm9 9563; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm11, %zmm11 9564; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm13, %zmm3 9565; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm13 {%k1} {z} 9566; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 9567; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm3 {%k1} {z} 9568; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1 9569; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm11 {%k1} {z} 9570; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1 9571; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm9 {%k1} {z} 9572; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 9573; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm7 {%k1} {z} 9574; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 9575; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm5 {%k1} {z} 9576; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 9577; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm2 {%k1} {z} 9578; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 9579; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm1 {%k1} {z} 9580; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 9581; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm0 {%k1} {z} 9582; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1 9583; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm12 {%k1} {z} 9584; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1 9585; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm10 {%k1} {z} 9586; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 9587; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z} 9588; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 9589; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm6 {%k1} {z} 9590; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 9591; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z} 9592; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 64(%rdx) 9593; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 128(%rdx) 9594; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 192(%rdx) 9595; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 256(%rdx) 9596; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 320(%rdx) 9597; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 384(%rdx) 9598; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 448(%rdx) 9599; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 512(%rdx) 9600; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 576(%rdx) 9601; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 640(%rdx) 9602; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 704(%rdx) 9603; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 768(%rdx) 9604; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 832(%rdx) 9605; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, (%rdx) 9606; AVX512F-ONLY-NEXT: vzeroupper 9607; AVX512F-ONLY-NEXT: retq 9608; 9609; AVX512DQ-LABEL: mask_replication_factor7_vf32: 9610; AVX512DQ: # %bb.0: 9611; AVX512DQ-NEXT: kmovw (%rdi), %k0 9612; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 9613; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] 9614; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 9615; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0 9616; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 9617; AVX512DQ-NEXT: movw $1, %ax 9618; AVX512DQ-NEXT: kmovw %eax, %k1 9619; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} 9620; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 9621; AVX512DQ-NEXT: vpmovm2d %k0, %zmm3 9622; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 9623; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] 9624; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm4 9625; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] 9626; AVX512DQ-NEXT: vpermd %zmm0, %zmm5, %zmm6 9627; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = 
[6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] 9628; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm8 9629; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] 9630; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm10 9631; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] 9632; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm12 9633; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm13 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] 9634; AVX512DQ-NEXT: vpermd %zmm0, %zmm13, %zmm0 9635; AVX512DQ-NEXT: vpermd %zmm3, %zmm1, %zmm1 9636; AVX512DQ-NEXT: vpermd %zmm3, %zmm2, %zmm2 9637; AVX512DQ-NEXT: vpermd %zmm3, %zmm5, %zmm5 9638; AVX512DQ-NEXT: vpermd %zmm3, %zmm7, %zmm7 9639; AVX512DQ-NEXT: vpermd %zmm3, %zmm9, %zmm9 9640; AVX512DQ-NEXT: vpermd %zmm3, %zmm11, %zmm11 9641; AVX512DQ-NEXT: vpermd %zmm3, %zmm13, %zmm3 9642; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm13 {%k1} {z} 9643; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 9644; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm3 {%k1} {z} 9645; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1 9646; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm11 {%k1} {z} 9647; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1 9648; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm9 {%k1} {z} 9649; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 9650; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm7 {%k1} {z} 9651; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 9652; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm5 {%k1} {z} 9653; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 9654; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm2 {%k1} {z} 9655; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 9656; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm1 {%k1} {z} 9657; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 9658; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm0 {%k1} {z} 9659; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1 9660; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm12 {%k1} {z} 9661; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1 9662; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm10 {%k1} {z} 9663; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 9664; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z} 9665; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 9666; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm6 {%k1} {z} 9667; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 9668; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z} 9669; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%rdx) 9670; AVX512DQ-NEXT: vmovdqa64 %zmm6, 128(%rdx) 9671; AVX512DQ-NEXT: vmovdqa64 %zmm8, 192(%rdx) 9672; AVX512DQ-NEXT: vmovdqa64 %zmm10, 256(%rdx) 9673; AVX512DQ-NEXT: vmovdqa64 %zmm12, 320(%rdx) 9674; AVX512DQ-NEXT: vmovdqa64 %zmm0, 384(%rdx) 9675; AVX512DQ-NEXT: vmovdqa64 %zmm1, 448(%rdx) 9676; AVX512DQ-NEXT: vmovdqa64 %zmm2, 512(%rdx) 9677; AVX512DQ-NEXT: vmovdqa64 %zmm5, 576(%rdx) 9678; AVX512DQ-NEXT: vmovdqa64 %zmm7, 640(%rdx) 9679; AVX512DQ-NEXT: vmovdqa64 %zmm9, 704(%rdx) 9680; AVX512DQ-NEXT: vmovdqa64 %zmm11, 768(%rdx) 9681; AVX512DQ-NEXT: vmovdqa64 %zmm3, 832(%rdx) 9682; AVX512DQ-NEXT: vmovdqa64 %zmm13, (%rdx) 9683; AVX512DQ-NEXT: vzeroupper 9684; AVX512DQ-NEXT: retq 9685; 9686; AVX512BW-LABEL: mask_replication_factor7_vf32: 9687; AVX512BW: # %bb.0: 9688; AVX512BW-NEXT: movw $-3, %ax 9689; AVX512BW-NEXT: kmovd %eax, %k2 9690; AVX512BW-NEXT: kmovw (%rdi), %k0 9691; AVX512BW-NEXT: kandw %k2, %k0, %k1 9692; AVX512BW-NEXT: kmovq %k2, %k3 9693; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 9694; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 9695; AVX512BW-NEXT: kshiftrw $14, %k0, %k2 9696; AVX512BW-NEXT: korw %k2, %k1, %k1 9697; AVX512BW-NEXT: movw $-5, %ax 9698; AVX512BW-NEXT: kmovd %eax, %k2 9699; AVX512BW-NEXT: kandw %k2, %k1, %k1 9700; AVX512BW-NEXT: kmovq %k2, %k4 
9701; AVX512BW-NEXT: kshiftrw $13, %k0, %k2 9702; AVX512BW-NEXT: korw %k2, %k1, %k1 9703; AVX512BW-NEXT: movw $-9, %ax 9704; AVX512BW-NEXT: kmovd %eax, %k2 9705; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 9706; AVX512BW-NEXT: kandw %k2, %k1, %k1 9707; AVX512BW-NEXT: kshiftrw $12, %k0, %k2 9708; AVX512BW-NEXT: korw %k2, %k1, %k1 9709; AVX512BW-NEXT: movw $-17, %ax 9710; AVX512BW-NEXT: kmovd %eax, %k2 9711; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 9712; AVX512BW-NEXT: kandw %k2, %k1, %k1 9713; AVX512BW-NEXT: kshiftrw $11, %k0, %k2 9714; AVX512BW-NEXT: korw %k2, %k1, %k1 9715; AVX512BW-NEXT: movw $-33, %ax 9716; AVX512BW-NEXT: kmovd %eax, %k2 9717; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 9718; AVX512BW-NEXT: kandw %k2, %k1, %k1 9719; AVX512BW-NEXT: kshiftrw $10, %k0, %k2 9720; AVX512BW-NEXT: korw %k2, %k1, %k1 9721; AVX512BW-NEXT: movw $-65, %ax 9722; AVX512BW-NEXT: kmovd %eax, %k2 9723; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 9724; AVX512BW-NEXT: kandw %k2, %k1, %k1 9725; AVX512BW-NEXT: kshiftrw $9, %k0, %k0 9726; AVX512BW-NEXT: korw %k0, %k1, %k0 9727; AVX512BW-NEXT: movw $-129, %ax 9728; AVX512BW-NEXT: kmovd %eax, %k1 9729; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 9730; AVX512BW-NEXT: kandw %k1, %k0, %k1 9731; AVX512BW-NEXT: kmovd (%rdi), %k6 9732; AVX512BW-NEXT: kshiftrd $1, %k6, %k0 9733; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 9734; AVX512BW-NEXT: kshiftrw $8, %k0, %k2 9735; AVX512BW-NEXT: korw %k2, %k1, %k1 9736; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF 9737; AVX512BW-NEXT: kmovd %eax, %k2 9738; AVX512BW-NEXT: kandw %k2, %k1, %k1 9739; AVX512BW-NEXT: kmovq %k2, %k7 9740; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 9741; AVX512BW-NEXT: kshiftrw $7, %k0, %k2 9742; AVX512BW-NEXT: korw %k2, %k1, %k1 9743; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF 9744; AVX512BW-NEXT: kmovd %eax, %k5 9745; AVX512BW-NEXT: kandw %k5, %k1, %k1 9746; AVX512BW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 9747; AVX512BW-NEXT: kshiftrw $6, %k0, %k2 9748; AVX512BW-NEXT: korw %k2, %k1, %k1 9749; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF 9750; AVX512BW-NEXT: kmovd %eax, %k2 9751; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 9752; AVX512BW-NEXT: kandw %k2, %k1, %k1 9753; AVX512BW-NEXT: kshiftrw $5, %k0, %k2 9754; AVX512BW-NEXT: korw %k2, %k1, %k1 9755; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF 9756; AVX512BW-NEXT: kmovd %eax, %k2 9757; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 9758; AVX512BW-NEXT: kandw %k2, %k1, %k1 9759; AVX512BW-NEXT: kshiftrw $4, %k0, %k2 9760; AVX512BW-NEXT: korw %k2, %k1, %k1 9761; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF 9762; AVX512BW-NEXT: kmovd %eax, %k2 9763; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 9764; AVX512BW-NEXT: kandw %k2, %k1, %k1 9765; AVX512BW-NEXT: kshiftrw $3, %k0, %k2 9766; AVX512BW-NEXT: korw %k2, %k1, %k1 9767; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF 9768; AVX512BW-NEXT: kmovd %eax, %k2 9769; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 9770; AVX512BW-NEXT: kandw %k2, %k1, %k1 9771; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 9772; AVX512BW-NEXT: korw %k0, %k1, %k0 9773; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF 9774; AVX512BW-NEXT: kmovd %eax, %k1 9775; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 9776; AVX512BW-NEXT: kandw %k1, %k0, %k0 9777; AVX512BW-NEXT: kshiftrd $2, %k6, %k2 
9778; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 9779; AVX512BW-NEXT: kshiftlw $14, %k2, %k1 9780; AVX512BW-NEXT: korw %k1, %k0, %k0 9781; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 9782; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 9783; AVX512BW-NEXT: kshiftlw $15, %k2, %k1 9784; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 9785; AVX512BW-NEXT: korw %k1, %k0, %k1 9786; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} 9787; AVX512BW-NEXT: kmovq %k6, %k2 9788; AVX512BW-NEXT: kshiftrd $29, %k6, %k1 9789; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 9790; AVX512BW-NEXT: kandw %k3, %k1, %k0 9791; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 9792; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 9793; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 9794; AVX512BW-NEXT: korw %k1, %k0, %k0 9795; AVX512BW-NEXT: kmovq %k4, %k6 9796; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 9797; AVX512BW-NEXT: kandw %k4, %k0, %k0 9798; AVX512BW-NEXT: kshiftrd $30, %k2, %k1 9799; AVX512BW-NEXT: kmovq %k2, %k4 9800; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 9801; AVX512BW-NEXT: kshiftrw $13, %k1, %k3 9802; AVX512BW-NEXT: korw %k3, %k0, %k0 9803; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 9804; AVX512BW-NEXT: kandw %k2, %k0, %k0 9805; AVX512BW-NEXT: kshiftrw $12, %k1, %k3 9806; AVX512BW-NEXT: korw %k3, %k0, %k0 9807; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 9808; AVX512BW-NEXT: kandw %k2, %k0, %k0 9809; AVX512BW-NEXT: kshiftrw $11, %k1, %k3 9810; AVX512BW-NEXT: korw %k3, %k0, %k0 9811; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 9812; AVX512BW-NEXT: kandw %k2, %k0, %k0 9813; AVX512BW-NEXT: kshiftrw $10, %k1, %k3 9814; AVX512BW-NEXT: korw %k3, %k0, %k0 9815; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 9816; AVX512BW-NEXT: kandw %k2, %k0, %k0 9817; AVX512BW-NEXT: kshiftrw $9, %k1, %k3 9818; AVX512BW-NEXT: korw %k3, %k0, %k0 9819; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 9820; AVX512BW-NEXT: kandw %k2, %k0, %k0 9821; AVX512BW-NEXT: kshiftrw $8, %k1, %k3 9822; AVX512BW-NEXT: korw %k3, %k0, %k0 9823; AVX512BW-NEXT: kandw %k7, %k0, %k0 9824; AVX512BW-NEXT: kshiftrw $7, %k1, %k1 9825; AVX512BW-NEXT: korw %k1, %k0, %k0 9826; AVX512BW-NEXT: kandw %k5, %k0, %k3 9827; AVX512BW-NEXT: kshiftrd $31, %k4, %k0 9828; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 9829; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 9830; AVX512BW-NEXT: kshiftrw $6, %k1, %k7 9831; AVX512BW-NEXT: korw %k7, %k3, %k3 9832; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 9833; AVX512BW-NEXT: kandw %k5, %k3, %k3 9834; AVX512BW-NEXT: kshiftrw $5, %k1, %k7 9835; AVX512BW-NEXT: korw %k7, %k3, %k3 9836; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 9837; AVX512BW-NEXT: kandw %k2, %k3, %k3 9838; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 9839; AVX512BW-NEXT: korw %k7, %k3, %k3 9840; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 9841; AVX512BW-NEXT: kandw %k2, %k3, %k3 9842; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 9843; AVX512BW-NEXT: korw %k7, %k3, %k3 9844; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 9845; AVX512BW-NEXT: kandw %k7, %k3, %k3 9846; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 9847; AVX512BW-NEXT: korw %k7, %k3, %k3 9848; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 9849; AVX512BW-NEXT: kandw %k7, %k3, %k3 9850; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 9851; 
AVX512BW-NEXT: korw %k0, %k3, %k0 9852; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 9853; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 9854; AVX512BW-NEXT: korw %k1, %k0, %k1 9855; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm1 {%k1} {z} 9856; AVX512BW-NEXT: kshiftrd $27, %k4, %k1 9857; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 9858; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 9859; AVX512BW-NEXT: kandw %k0, %k1, %k0 9860; AVX512BW-NEXT: kshiftlw $15, %k1, %k3 9861; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 9862; AVX512BW-NEXT: kshiftrw $14, %k3, %k7 9863; AVX512BW-NEXT: korw %k7, %k0, %k0 9864; AVX512BW-NEXT: kandw %k6, %k0, %k0 9865; AVX512BW-NEXT: kshiftrw $13, %k3, %k7 9866; AVX512BW-NEXT: korw %k7, %k0, %k0 9867; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 9868; AVX512BW-NEXT: kandw %k1, %k0, %k0 9869; AVX512BW-NEXT: kshiftrw $12, %k3, %k7 9870; AVX512BW-NEXT: korw %k7, %k0, %k0 9871; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 9872; AVX512BW-NEXT: kandw %k3, %k0, %k7 9873; AVX512BW-NEXT: kshiftrd $28, %k4, %k0 9874; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 9875; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 9876; AVX512BW-NEXT: korw %k6, %k7, %k6 9877; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 9878; AVX512BW-NEXT: kandw %k1, %k6, %k6 9879; AVX512BW-NEXT: kshiftrw $10, %k0, %k7 9880; AVX512BW-NEXT: korw %k7, %k6, %k6 9881; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 9882; AVX512BW-NEXT: kandw %k1, %k6, %k6 9883; AVX512BW-NEXT: kshiftrw $9, %k0, %k7 9884; AVX512BW-NEXT: korw %k7, %k6, %k6 9885; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 9886; AVX512BW-NEXT: kandw %k1, %k6, %k6 9887; AVX512BW-NEXT: kshiftrw $8, %k0, %k7 9888; AVX512BW-NEXT: korw %k7, %k6, %k6 9889; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 9890; AVX512BW-NEXT: kandw %k1, %k6, %k6 9891; AVX512BW-NEXT: kshiftrw $7, %k0, %k7 9892; AVX512BW-NEXT: korw %k7, %k6, %k6 9893; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 9894; AVX512BW-NEXT: kandw %k1, %k6, %k6 9895; AVX512BW-NEXT: kshiftrw $6, %k0, %k7 9896; AVX512BW-NEXT: korw %k7, %k6, %k6 9897; AVX512BW-NEXT: kandw %k5, %k6, %k6 9898; AVX512BW-NEXT: kshiftrw $5, %k0, %k0 9899; AVX512BW-NEXT: korw %k0, %k6, %k0 9900; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 9901; AVX512BW-NEXT: kandw %k4, %k0, %k0 9902; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 9903; AVX512BW-NEXT: kshiftrw $4, %k7, %k6 9904; AVX512BW-NEXT: korw %k6, %k0, %k0 9905; AVX512BW-NEXT: kandw %k2, %k0, %k0 9906; AVX512BW-NEXT: kmovq %k2, %k4 9907; AVX512BW-NEXT: kshiftrw $3, %k7, %k6 9908; AVX512BW-NEXT: korw %k6, %k0, %k0 9909; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 9910; AVX512BW-NEXT: kandw %k2, %k0, %k0 9911; AVX512BW-NEXT: kshiftrw $2, %k7, %k6 9912; AVX512BW-NEXT: korw %k6, %k0, %k0 9913; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 9914; AVX512BW-NEXT: kandw %k2, %k0, %k0 9915; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload 9916; AVX512BW-NEXT: kshiftlw $14, %k2, %k5 9917; AVX512BW-NEXT: korw %k5, %k0, %k0 9918; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 9919; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 9920; AVX512BW-NEXT: korw %k7, %k0, %k2 9921; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm2 {%k2} {z} 9922; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 4-byte Reload 9923; AVX512BW-NEXT: kshiftrd 
$25, %k6, %k0 9924; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 9925; AVX512BW-NEXT: kandw %k2, %k0, %k2 9926; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 9927; AVX512BW-NEXT: kshiftrw $14, %k0, %k5 9928; AVX512BW-NEXT: korw %k5, %k2, %k2 9929; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 9930; AVX512BW-NEXT: kandw %k5, %k2, %k2 9931; AVX512BW-NEXT: kshiftrw $13, %k0, %k5 9932; AVX512BW-NEXT: korw %k5, %k2, %k2 9933; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 9934; AVX512BW-NEXT: kandw %k5, %k2, %k2 9935; AVX512BW-NEXT: kshiftrw $12, %k0, %k5 9936; AVX512BW-NEXT: korw %k5, %k2, %k2 9937; AVX512BW-NEXT: kandw %k3, %k2, %k2 9938; AVX512BW-NEXT: kshiftrw $11, %k0, %k5 9939; AVX512BW-NEXT: korw %k5, %k2, %k2 9940; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 9941; AVX512BW-NEXT: kandw %k7, %k2, %k2 9942; AVX512BW-NEXT: kshiftrw $10, %k0, %k5 9943; AVX512BW-NEXT: korw %k5, %k2, %k2 9944; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 9945; AVX512BW-NEXT: kandw %k3, %k2, %k5 9946; AVX512BW-NEXT: kshiftrd $26, %k6, %k2 9947; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 9948; AVX512BW-NEXT: kshiftrw $9, %k2, %k6 9949; AVX512BW-NEXT: korw %k6, %k5, %k5 9950; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 9951; AVX512BW-NEXT: kandw %k3, %k5, %k5 9952; AVX512BW-NEXT: kshiftrw $8, %k2, %k6 9953; AVX512BW-NEXT: korw %k6, %k5, %k5 9954; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 9955; AVX512BW-NEXT: kandw %k3, %k5, %k5 9956; AVX512BW-NEXT: kshiftrw $7, %k2, %k6 9957; AVX512BW-NEXT: korw %k6, %k5, %k5 9958; AVX512BW-NEXT: kandw %k1, %k5, %k5 9959; AVX512BW-NEXT: kshiftrw $6, %k2, %k6 9960; AVX512BW-NEXT: korw %k6, %k5, %k5 9961; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 9962; AVX512BW-NEXT: kandw %k1, %k5, %k5 9963; AVX512BW-NEXT: kshiftrw $5, %k2, %k6 9964; AVX512BW-NEXT: korw %k6, %k5, %k5 9965; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 9966; AVX512BW-NEXT: kandw %k1, %k5, %k5 9967; AVX512BW-NEXT: kshiftrw $4, %k2, %k6 9968; AVX512BW-NEXT: korw %k6, %k5, %k5 9969; AVX512BW-NEXT: kandw %k4, %k5, %k5 9970; AVX512BW-NEXT: kshiftrw $3, %k2, %k2 9971; AVX512BW-NEXT: korw %k2, %k5, %k2 9972; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 9973; AVX512BW-NEXT: kandw %k4, %k2, %k2 9974; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 9975; AVX512BW-NEXT: kshiftrw $2, %k6, %k5 9976; AVX512BW-NEXT: korw %k5, %k2, %k2 9977; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 9978; AVX512BW-NEXT: kandw %k1, %k2, %k2 9979; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload 9980; AVX512BW-NEXT: kshiftlw $14, %k1, %k3 9981; AVX512BW-NEXT: korw %k3, %k2, %k2 9982; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 9983; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 9984; AVX512BW-NEXT: korw %k6, %k2, %k1 9985; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm3 {%k1} {z} 9986; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 4-byte Reload 9987; AVX512BW-NEXT: kshiftrd $23, %k6, %k1 9988; AVX512BW-NEXT: kshiftlw $15, %k1, %k3 9989; AVX512BW-NEXT: kshiftrd $22, %k6, %k5 9990; AVX512BW-NEXT: kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 9991; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 9992; AVX512BW-NEXT: kandw %k1, %k5, %k2 9993; AVX512BW-NEXT: kshiftrw $14, %k3, %k5 9994; AVX512BW-NEXT: korw %k5, %k2, %k2 9995; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 
2-byte Reload 9996; AVX512BW-NEXT: kandw %k1, %k2, %k2 9997; AVX512BW-NEXT: kshiftrw $13, %k3, %k5 9998; AVX512BW-NEXT: korw %k5, %k2, %k2 9999; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 10000; AVX512BW-NEXT: kandw %k1, %k2, %k2 10001; AVX512BW-NEXT: kshiftrw $12, %k3, %k5 10002; AVX512BW-NEXT: korw %k5, %k2, %k2 10003; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 10004; AVX512BW-NEXT: kandw %k5, %k2, %k2 10005; AVX512BW-NEXT: kshiftrw $11, %k3, %k5 10006; AVX512BW-NEXT: korw %k5, %k2, %k2 10007; AVX512BW-NEXT: kandw %k7, %k2, %k2 10008; AVX512BW-NEXT: kshiftrw $10, %k3, %k5 10009; AVX512BW-NEXT: korw %k5, %k2, %k2 10010; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 10011; AVX512BW-NEXT: kandw %k7, %k2, %k2 10012; AVX512BW-NEXT: kshiftrw $9, %k3, %k5 10013; AVX512BW-NEXT: korw %k5, %k2, %k2 10014; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 10015; AVX512BW-NEXT: kandw %k5, %k2, %k2 10016; AVX512BW-NEXT: kshiftrw $8, %k3, %k3 10017; AVX512BW-NEXT: korw %k3, %k2, %k2 10018; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 10019; AVX512BW-NEXT: kandw %k3, %k2, %k2 10020; AVX512BW-NEXT: kshiftrd $24, %k6, %k3 10021; AVX512BW-NEXT: kshiftlw $15, %k3, %k5 10022; AVX512BW-NEXT: kshiftrw $7, %k5, %k6 10023; AVX512BW-NEXT: korw %k6, %k2, %k2 10024; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 10025; AVX512BW-NEXT: kandw %k6, %k2, %k2 10026; AVX512BW-NEXT: kshiftrw $6, %k5, %k6 10027; AVX512BW-NEXT: korw %k6, %k2, %k2 10028; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 10029; AVX512BW-NEXT: kandw %k6, %k2, %k2 10030; AVX512BW-NEXT: kshiftrw $5, %k5, %k6 10031; AVX512BW-NEXT: korw %k6, %k2, %k2 10032; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 10033; AVX512BW-NEXT: kandw %k6, %k2, %k2 10034; AVX512BW-NEXT: kshiftrw $4, %k5, %k6 10035; AVX512BW-NEXT: korw %k6, %k2, %k2 10036; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 10037; AVX512BW-NEXT: kandw %k6, %k2, %k2 10038; AVX512BW-NEXT: kshiftrw $3, %k5, %k6 10039; AVX512BW-NEXT: korw %k6, %k2, %k2 10040; AVX512BW-NEXT: kandw %k4, %k2, %k2 10041; AVX512BW-NEXT: kshiftrw $2, %k5, %k5 10042; AVX512BW-NEXT: korw %k5, %k2, %k2 10043; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 10044; AVX512BW-NEXT: kandw %k4, %k2, %k2 10045; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 10046; AVX512BW-NEXT: korw %k3, %k2, %k2 10047; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 10048; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 10049; AVX512BW-NEXT: korw %k0, %k2, %k2 10050; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm4 {%k2} {z} 10051; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload 10052; AVX512BW-NEXT: kshiftrd $20, %k3, %k5 10053; AVX512BW-NEXT: kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 10054; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 10055; AVX512BW-NEXT: kandw %k0, %k5, %k2 10056; AVX512BW-NEXT: kshiftlw $15, %k5, %k6 10057; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 10058; AVX512BW-NEXT: kshiftrw $14, %k6, %k5 10059; AVX512BW-NEXT: korw %k5, %k2, %k2 10060; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 10061; AVX512BW-NEXT: kandw %k0, %k2, %k2 10062; AVX512BW-NEXT: kshiftrw $13, %k6, %k5 10063; AVX512BW-NEXT: korw %k5, %k2, %k2 10064; AVX512BW-NEXT: kandw %k1, %k2, %k5 10065; AVX512BW-NEXT: kshiftrd $21, %k3, %k2 10066; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 10067; AVX512BW-NEXT: 
kshiftrw $12, %k2, %k6 10068; AVX512BW-NEXT: korw %k6, %k5, %k5 10069; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 10070; AVX512BW-NEXT: kandw %k0, %k5, %k5 10071; AVX512BW-NEXT: kshiftrw $11, %k2, %k6 10072; AVX512BW-NEXT: korw %k6, %k5, %k5 10073; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 10074; AVX512BW-NEXT: kandw %k0, %k5, %k5 10075; AVX512BW-NEXT: kshiftrw $10, %k2, %k6 10076; AVX512BW-NEXT: korw %k6, %k5, %k5 10077; AVX512BW-NEXT: kandw %k7, %k5, %k5 10078; AVX512BW-NEXT: kshiftrw $9, %k2, %k6 10079; AVX512BW-NEXT: korw %k6, %k5, %k5 10080; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 10081; AVX512BW-NEXT: kandw %k0, %k5, %k5 10082; AVX512BW-NEXT: kshiftrw $8, %k2, %k6 10083; AVX512BW-NEXT: korw %k6, %k5, %k5 10084; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 10085; AVX512BW-NEXT: kandw %k0, %k5, %k5 10086; AVX512BW-NEXT: kshiftrw $7, %k2, %k6 10087; AVX512BW-NEXT: korw %k6, %k5, %k5 10088; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 10089; AVX512BW-NEXT: kandw %k0, %k5, %k5 10090; AVX512BW-NEXT: kshiftrw $6, %k2, %k2 10091; AVX512BW-NEXT: korw %k2, %k5, %k2 10092; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 10093; AVX512BW-NEXT: kandw %k0, %k2, %k5 10094; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 4-byte Reload 10095; AVX512BW-NEXT: kshiftlw $15, %k7, %k2 10096; AVX512BW-NEXT: kshiftrw $5, %k2, %k6 10097; AVX512BW-NEXT: korw %k6, %k5, %k5 10098; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 10099; AVX512BW-NEXT: kandw %k3, %k5, %k5 10100; AVX512BW-NEXT: kshiftrw $4, %k2, %k6 10101; AVX512BW-NEXT: korw %k6, %k5, %k5 10102; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 10103; AVX512BW-NEXT: kandw %k0, %k5, %k5 10104; AVX512BW-NEXT: kshiftrw $3, %k2, %k6 10105; AVX512BW-NEXT: korw %k6, %k5, %k5 10106; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 10107; AVX512BW-NEXT: kandw %k1, %k5, %k5 10108; AVX512BW-NEXT: kshiftrw $2, %k2, %k6 10109; AVX512BW-NEXT: korw %k6, %k5, %k5 10110; AVX512BW-NEXT: kandw %k4, %k5, %k5 10111; AVX512BW-NEXT: kshiftlw $14, %k7, %k1 10112; AVX512BW-NEXT: korw %k1, %k5, %k1 10113; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 10114; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 10115; AVX512BW-NEXT: korw %k2, %k1, %k1 10116; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm5 {%k1} {z} 10117; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload 10118; AVX512BW-NEXT: kshiftrd $18, %k2, %k4 10119; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 10120; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 10121; AVX512BW-NEXT: kandw %k1, %k4, %k5 10122; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 10123; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 10124; AVX512BW-NEXT: kshiftrw $14, %k4, %k6 10125; AVX512BW-NEXT: korw %k6, %k5, %k5 10126; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 10127; AVX512BW-NEXT: kandw %k1, %k5, %k5 10128; AVX512BW-NEXT: kshiftrw $13, %k4, %k6 10129; AVX512BW-NEXT: korw %k6, %k5, %k5 10130; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 10131; AVX512BW-NEXT: kandw %k1, %k5, %k5 10132; AVX512BW-NEXT: kshiftrw $12, %k4, %k6 10133; AVX512BW-NEXT: korw %k6, %k5, %k5 10134; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 10135; AVX512BW-NEXT: kandw %k1, %k5, %k5 10136; AVX512BW-NEXT: kshiftrw $11, %k4, %k6 10137; AVX512BW-NEXT: korw %k6, %k5, %k5 
10138; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 10139; AVX512BW-NEXT: kandw %k1, %k5, %k6 10140; AVX512BW-NEXT: kshiftrd $19, %k2, %k5 10141; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 10142; AVX512BW-NEXT: kshiftrw $10, %k5, %k7 10143; AVX512BW-NEXT: korw %k7, %k6, %k6 10144; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 10145; AVX512BW-NEXT: kandw %k1, %k6, %k6 10146; AVX512BW-NEXT: kshiftrw $9, %k5, %k7 10147; AVX512BW-NEXT: korw %k7, %k6, %k6 10148; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 10149; AVX512BW-NEXT: kandw %k4, %k6, %k6 10150; AVX512BW-NEXT: kshiftrw $8, %k5, %k7 10151; AVX512BW-NEXT: korw %k7, %k6, %k6 10152; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 10153; AVX512BW-NEXT: kandw %k2, %k6, %k6 10154; AVX512BW-NEXT: kshiftrw $7, %k5, %k7 10155; AVX512BW-NEXT: korw %k7, %k6, %k6 10156; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 10157; AVX512BW-NEXT: kandw %k2, %k6, %k6 10158; AVX512BW-NEXT: kshiftrw $6, %k5, %k7 10159; AVX512BW-NEXT: korw %k7, %k6, %k6 10160; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 10161; AVX512BW-NEXT: kandw %k2, %k6, %k6 10162; AVX512BW-NEXT: kshiftrw $5, %k5, %k7 10163; AVX512BW-NEXT: korw %k7, %k6, %k6 10164; AVX512BW-NEXT: kandw %k3, %k6, %k6 10165; AVX512BW-NEXT: kshiftrw $4, %k5, %k5 10166; AVX512BW-NEXT: korw %k5, %k6, %k5 10167; AVX512BW-NEXT: kandw %k0, %k5, %k5 10168; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 10169; AVX512BW-NEXT: kshiftrw $3, %k7, %k6 10170; AVX512BW-NEXT: korw %k6, %k5, %k5 10171; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 10172; AVX512BW-NEXT: kandw %k0, %k5, %k5 10173; AVX512BW-NEXT: kshiftrw $2, %k7, %k6 10174; AVX512BW-NEXT: korw %k6, %k5, %k5 10175; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 10176; AVX512BW-NEXT: kandw %k0, %k5, %k5 10177; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload 10178; AVX512BW-NEXT: kshiftlw $14, %k0, %k3 10179; AVX512BW-NEXT: korw %k3, %k5, %k3 10180; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 10181; AVX512BW-NEXT: kshiftrw $1, %k3, %k3 10182; AVX512BW-NEXT: korw %k7, %k3, %k3 10183; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm6 {%k3} {z} 10184; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload 10185; AVX512BW-NEXT: kshiftrd $16, %k1, %k0 10186; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 10187; AVX512BW-NEXT: kandw %k6, %k0, %k3 10188; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 10189; AVX512BW-NEXT: kshiftrw $14, %k0, %k5 10190; AVX512BW-NEXT: korw %k5, %k3, %k3 10191; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 10192; AVX512BW-NEXT: kandw %k7, %k3, %k3 10193; AVX512BW-NEXT: kshiftrw $13, %k0, %k5 10194; AVX512BW-NEXT: korw %k5, %k3, %k3 10195; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 10196; AVX512BW-NEXT: kandw %k5, %k3, %k3 10197; AVX512BW-NEXT: kshiftrw $12, %k0, %k5 10198; AVX512BW-NEXT: korw %k5, %k3, %k3 10199; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 10200; AVX512BW-NEXT: kandw %k5, %k3, %k3 10201; AVX512BW-NEXT: kshiftrw $11, %k0, %k5 10202; AVX512BW-NEXT: korw %k5, %k3, %k3 10203; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 10204; AVX512BW-NEXT: kandw %k5, %k3, %k3 10205; AVX512BW-NEXT: kshiftrw $10, %k0, %k5 10206; AVX512BW-NEXT: korw %k5, %k3, %k3 10207; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 10208; 
AVX512BW-NEXT: kandw %k5, %k3, %k3 10209; AVX512BW-NEXT: kshiftrw $9, %k0, %k0 10210; AVX512BW-NEXT: korw %k0, %k3, %k0 10211; AVX512BW-NEXT: kandw %k4, %k0, %k3 10212; AVX512BW-NEXT: kshiftrd $17, %k1, %k0 10213; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 10214; AVX512BW-NEXT: kshiftrw $8, %k0, %k5 10215; AVX512BW-NEXT: korw %k5, %k3, %k3 10216; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 10217; AVX512BW-NEXT: kandw %k4, %k3, %k3 10218; AVX512BW-NEXT: kshiftrw $7, %k0, %k5 10219; AVX512BW-NEXT: korw %k5, %k3, %k3 10220; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 10221; AVX512BW-NEXT: kandw %k1, %k3, %k3 10222; AVX512BW-NEXT: kshiftrw $6, %k0, %k5 10223; AVX512BW-NEXT: korw %k5, %k3, %k3 10224; AVX512BW-NEXT: kandw %k2, %k3, %k3 10225; AVX512BW-NEXT: kshiftrw $5, %k0, %k5 10226; AVX512BW-NEXT: korw %k5, %k3, %k3 10227; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 10228; AVX512BW-NEXT: kandw %k1, %k3, %k3 10229; AVX512BW-NEXT: kshiftrw $4, %k0, %k5 10230; AVX512BW-NEXT: korw %k5, %k3, %k3 10231; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 10232; AVX512BW-NEXT: kandw %k1, %k3, %k3 10233; AVX512BW-NEXT: kshiftrw $3, %k0, %k5 10234; AVX512BW-NEXT: korw %k5, %k3, %k3 10235; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 10236; AVX512BW-NEXT: kandw %k1, %k3, %k3 10237; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 10238; AVX512BW-NEXT: korw %k0, %k3, %k0 10239; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 10240; AVX512BW-NEXT: kandw %k1, %k0, %k0 10241; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload 10242; AVX512BW-NEXT: kshiftlw $14, %k1, %k2 10243; AVX512BW-NEXT: korw %k2, %k0, %k0 10244; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 10245; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 10246; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 10247; AVX512BW-NEXT: korw %k1, %k0, %k1 10248; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k1} {z} 10249; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload 10250; AVX512BW-NEXT: kshiftrd $13, %k0, %k2 10251; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 10252; AVX512BW-NEXT: kandw %k6, %k2, %k1 10253; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 10254; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 10255; AVX512BW-NEXT: kshiftrw $14, %k2, %k3 10256; AVX512BW-NEXT: korw %k3, %k1, %k1 10257; AVX512BW-NEXT: kandw %k7, %k1, %k3 10258; AVX512BW-NEXT: kshiftrd $14, %k0, %k1 10259; AVX512BW-NEXT: kmovq %k0, %k6 10260; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 10261; AVX512BW-NEXT: kshiftrw $13, %k1, %k5 10262; AVX512BW-NEXT: korw %k5, %k3, %k3 10263; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 10264; AVX512BW-NEXT: kandw %k7, %k3, %k3 10265; AVX512BW-NEXT: kshiftrw $12, %k1, %k5 10266; AVX512BW-NEXT: korw %k5, %k3, %k3 10267; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 10268; AVX512BW-NEXT: kandw %k0, %k3, %k3 10269; AVX512BW-NEXT: kshiftrw $11, %k1, %k5 10270; AVX512BW-NEXT: korw %k5, %k3, %k3 10271; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 10272; AVX512BW-NEXT: kandw %k2, %k3, %k3 10273; AVX512BW-NEXT: kshiftrw $10, %k1, %k5 10274; AVX512BW-NEXT: korw %k5, %k3, %k3 10275; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 10276; AVX512BW-NEXT: kandw %k2, %k3, %k3 10277; AVX512BW-NEXT: kshiftrw $9, %k1, %k5 10278; AVX512BW-NEXT: korw %k5, %k3, %k3 10279; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 10280; AVX512BW-NEXT: kandw %k2, %k3, %k3 10281; AVX512BW-NEXT: kshiftrw $8, %k1, %k5 10282; AVX512BW-NEXT: korw %k5, %k3, %k3 10283; AVX512BW-NEXT: kandw %k4, %k3, %k3 10284; AVX512BW-NEXT: kshiftrw $7, %k1, %k1 10285; AVX512BW-NEXT: korw %k1, %k3, %k1 10286; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 10287; AVX512BW-NEXT: kandw %k4, %k1, %k5 10288; AVX512BW-NEXT: kshiftrd $15, %k6, %k3 10289; AVX512BW-NEXT: kmovq %k6, %k0 10290; AVX512BW-NEXT: kshiftlw $15, %k3, %k1 10291; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 10292; AVX512BW-NEXT: korw %k6, %k5, %k5 10293; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 10294; AVX512BW-NEXT: kandw %k2, %k5, %k5 10295; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 10296; AVX512BW-NEXT: korw %k6, %k5, %k5 10297; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 10298; AVX512BW-NEXT: kandw %k2, %k5, %k5 10299; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 10300; AVX512BW-NEXT: korw %k6, %k5, %k5 10301; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 10302; AVX512BW-NEXT: kandw %k2, %k5, %k5 10303; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 10304; AVX512BW-NEXT: korw %k6, %k5, %k5 10305; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 10306; AVX512BW-NEXT: kandw %k2, %k5, %k5 10307; AVX512BW-NEXT: kshiftrw $2, %k1, %k6 10308; AVX512BW-NEXT: korw %k6, %k5, %k5 10309; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 10310; AVX512BW-NEXT: kandw %k6, %k5, %k5 10311; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 10312; AVX512BW-NEXT: korw %k3, %k5, %k3 10313; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 10314; AVX512BW-NEXT: kshiftrw $1, %k3, %k3 10315; AVX512BW-NEXT: korw %k1, %k3, %k1 10316; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm8 {%k1} {z} 10317; AVX512BW-NEXT: kmovq %k0, %k3 10318; AVX512BW-NEXT: kshiftrd $11, %k0, %k0 10319; AVX512BW-NEXT: kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 10320; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 10321; AVX512BW-NEXT: kandw %k1, %k0, %k5 10322; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 10323; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 10324; AVX512BW-NEXT: korw %k6, %k5, %k5 10325; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 10326; AVX512BW-NEXT: kandw %k1, %k5, %k5 10327; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 10328; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 10329; AVX512BW-NEXT: korw %k6, %k5, %k5 10330; AVX512BW-NEXT: kandw %k7, %k5, %k5 10331; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 10332; AVX512BW-NEXT: korw %k6, %k5, %k5 10333; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 10334; AVX512BW-NEXT: kandw %k0, %k5, %k6 10335; AVX512BW-NEXT: kshiftrd $12, %k3, %k5 10336; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 10337; AVX512BW-NEXT: kshiftrw $11, %k5, %k7 10338; AVX512BW-NEXT: korw %k7, %k6, %k6 10339; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 10340; AVX512BW-NEXT: kandw %k1, %k6, %k6 10341; AVX512BW-NEXT: kshiftrw $10, %k5, %k7 10342; AVX512BW-NEXT: korw %k7, %k6, %k6 10343; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 10344; AVX512BW-NEXT: kandw %k0, %k6, %k6 10345; AVX512BW-NEXT: kshiftrw $9, %k5, %k7 10346; AVX512BW-NEXT: korw %k7, %k6, %k6 10347; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 10348; AVX512BW-NEXT: kandw %k0, %k6, %k6 10349; AVX512BW-NEXT: kshiftrw $8, %k5, %k7 10350; AVX512BW-NEXT: korw %k7, %k6, %k6 10351; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 10352; AVX512BW-NEXT: kandw %k0, %k6, %k6 10353; AVX512BW-NEXT: kshiftrw $7, %k5, %k7 10354; AVX512BW-NEXT: korw %k7, %k6, %k6 10355; AVX512BW-NEXT: kandw %k4, %k6, %k6 10356; AVX512BW-NEXT: kshiftrw $6, %k5, %k7 10357; AVX512BW-NEXT: korw %k7, %k6, %k6 10358; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 10359; AVX512BW-NEXT: kandw %k4, %k6, %k6 10360; AVX512BW-NEXT: kshiftrw $5, %k5, %k5 10361; AVX512BW-NEXT: korw %k5, %k6, %k5 10362; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 10363; AVX512BW-NEXT: kandw %k0, %k5, %k5 10364; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 10365; AVX512BW-NEXT: kshiftrw $4, %k3, %k6 10366; AVX512BW-NEXT: korw %k6, %k5, %k5 10367; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 10368; AVX512BW-NEXT: kandw %k0, %k5, %k5 10369; AVX512BW-NEXT: kshiftrw $3, %k3, %k6 10370; AVX512BW-NEXT: korw %k6, %k5, %k5 10371; AVX512BW-NEXT: kandw %k2, %k5, %k5 10372; AVX512BW-NEXT: kshiftrw $2, %k3, %k6 10373; AVX512BW-NEXT: kmovq %k3, %k0 10374; AVX512BW-NEXT: korw %k6, %k5, %k5 10375; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 10376; AVX512BW-NEXT: kandw %k3, %k5, %k5 10377; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload 10378; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 10379; AVX512BW-NEXT: korw %k2, %k5, %k2 10380; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 10381; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 10382; AVX512BW-NEXT: korw %k0, %k2, %k2 10383; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm9 {%k2} {z} 10384; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 4-byte Reload 10385; AVX512BW-NEXT: kshiftrd $9, %k6, %k0 10386; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 10387; AVX512BW-NEXT: kandw %k2, %k0, %k2 10388; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 10389; AVX512BW-NEXT: kshiftrw $14, %k0, %k5 10390; AVX512BW-NEXT: korw %k5, %k2, %k2 10391; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 10392; AVX512BW-NEXT: kandw %k5, %k2, %k2 10393; AVX512BW-NEXT: kshiftrw $13, %k0, %k5 10394; AVX512BW-NEXT: korw %k5, %k2, %k2 10395; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 10396; AVX512BW-NEXT: kandw %k5, %k2, %k2 10397; AVX512BW-NEXT: kshiftrw $12, %k0, %k5 10398; AVX512BW-NEXT: korw %k5, %k2, %k2 10399; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 10400; AVX512BW-NEXT: kandw %k7, %k2, %k2 10401; AVX512BW-NEXT: kshiftrw $11, %k0, %k5 10402; AVX512BW-NEXT: korw %k5, %k2, %k2 10403; AVX512BW-NEXT: kandw %k1, %k2, %k2 10404; AVX512BW-NEXT: kshiftrw $10, %k0, %k5 10405; AVX512BW-NEXT: korw %k5, %k2, %k2 10406; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 10407; AVX512BW-NEXT: kandw %k1, %k2, %k5 10408; AVX512BW-NEXT: kshiftrd $10, %k6, %k2 10409; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 10410; AVX512BW-NEXT: kshiftrw $9, %k2, %k6 10411; AVX512BW-NEXT: korw %k6, %k5, %k5 10412; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 10413; AVX512BW-NEXT: kandw %k1, %k5, %k5 10414; AVX512BW-NEXT: kshiftrw $8, %k2, %k6 10415; AVX512BW-NEXT: korw %k6, %k5, %k5 10416; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 10417; AVX512BW-NEXT: kandw %k1, %k5, %k5 10418; AVX512BW-NEXT: kshiftrw $7, %k2, %k6 10419; AVX512BW-NEXT: korw %k6, %k5, %k5 10420; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 10421; AVX512BW-NEXT: kandw %k1, %k5, %k5 10422; AVX512BW-NEXT: kshiftrw $6, %k2, 
%k6 10423; AVX512BW-NEXT: korw %k6, %k5, %k5 10424; AVX512BW-NEXT: kandw %k4, %k5, %k5 10425; AVX512BW-NEXT: kshiftrw $5, %k2, %k6 10426; AVX512BW-NEXT: korw %k6, %k5, %k5 10427; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 10428; AVX512BW-NEXT: kandw %k1, %k5, %k5 10429; AVX512BW-NEXT: kshiftrw $4, %k2, %k6 10430; AVX512BW-NEXT: korw %k6, %k5, %k5 10431; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 10432; AVX512BW-NEXT: kandw %k1, %k5, %k5 10433; AVX512BW-NEXT: kshiftrw $3, %k2, %k2 10434; AVX512BW-NEXT: korw %k2, %k5, %k2 10435; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 10436; AVX512BW-NEXT: kandw %k1, %k2, %k2 10437; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 10438; AVX512BW-NEXT: kshiftrw $2, %k4, %k5 10439; AVX512BW-NEXT: korw %k5, %k2, %k2 10440; AVX512BW-NEXT: kandw %k3, %k2, %k2 10441; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload 10442; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 10443; AVX512BW-NEXT: korw %k1, %k2, %k1 10444; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 10445; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 10446; AVX512BW-NEXT: korw %k4, %k1, %k1 10447; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm10 {%k1} {z} 10448; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 4-byte Reload 10449; AVX512BW-NEXT: kshiftrd $7, %k4, %k1 10450; AVX512BW-NEXT: kshiftlw $15, %k1, %k3 10451; AVX512BW-NEXT: kshiftrd $6, %k4, %k2 10452; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 10453; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 10454; AVX512BW-NEXT: kandw %k1, %k2, %k2 10455; AVX512BW-NEXT: kshiftrw $14, %k3, %k5 10456; AVX512BW-NEXT: korw %k5, %k2, %k2 10457; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 10458; AVX512BW-NEXT: kandw %k1, %k2, %k2 10459; AVX512BW-NEXT: kshiftrw $13, %k3, %k5 10460; AVX512BW-NEXT: korw %k5, %k2, %k2 10461; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 10462; AVX512BW-NEXT: kandw %k1, %k2, %k2 10463; AVX512BW-NEXT: kshiftrw $12, %k3, %k5 10464; AVX512BW-NEXT: korw %k5, %k2, %k2 10465; AVX512BW-NEXT: kandw %k7, %k2, %k2 10466; AVX512BW-NEXT: kshiftrw $11, %k3, %k5 10467; AVX512BW-NEXT: korw %k5, %k2, %k2 10468; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 10469; AVX512BW-NEXT: kandw %k1, %k2, %k2 10470; AVX512BW-NEXT: kshiftrw $10, %k3, %k5 10471; AVX512BW-NEXT: korw %k5, %k2, %k2 10472; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 10473; AVX512BW-NEXT: kandw %k1, %k2, %k2 10474; AVX512BW-NEXT: kshiftrw $9, %k3, %k5 10475; AVX512BW-NEXT: korw %k5, %k2, %k2 10476; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 10477; AVX512BW-NEXT: kandw %k1, %k2, %k2 10478; AVX512BW-NEXT: kshiftrw $8, %k3, %k3 10479; AVX512BW-NEXT: korw %k3, %k2, %k2 10480; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 10481; AVX512BW-NEXT: kandw %k1, %k2, %k2 10482; AVX512BW-NEXT: kshiftrd $8, %k4, %k3 10483; AVX512BW-NEXT: kshiftlw $15, %k3, %k5 10484; AVX512BW-NEXT: kshiftrw $7, %k5, %k6 10485; AVX512BW-NEXT: korw %k6, %k2, %k2 10486; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 10487; AVX512BW-NEXT: kandw %k1, %k2, %k2 10488; AVX512BW-NEXT: kshiftrw $6, %k5, %k6 10489; AVX512BW-NEXT: korw %k6, %k2, %k2 10490; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 10491; AVX512BW-NEXT: kandw %k4, %k2, %k2 10492; AVX512BW-NEXT: kshiftrw $5, %k5, %k6 10493; AVX512BW-NEXT: korw %k6, %k2, %k2 
10494; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 10495; AVX512BW-NEXT: kandw %k7, %k2, %k2 10496; AVX512BW-NEXT: kshiftrw $4, %k5, %k6 10497; AVX512BW-NEXT: korw %k6, %k2, %k2 10498; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 10499; AVX512BW-NEXT: kandw %k6, %k2, %k2 10500; AVX512BW-NEXT: kshiftrw $3, %k5, %k6 10501; AVX512BW-NEXT: korw %k6, %k2, %k2 10502; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 10503; AVX512BW-NEXT: kandw %k6, %k2, %k2 10504; AVX512BW-NEXT: kshiftrw $2, %k5, %k5 10505; AVX512BW-NEXT: korw %k5, %k2, %k2 10506; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 10507; AVX512BW-NEXT: kandw %k5, %k2, %k2 10508; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 10509; AVX512BW-NEXT: korw %k3, %k2, %k2 10510; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 10511; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 10512; AVX512BW-NEXT: korw %k0, %k2, %k2 10513; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm11 {%k2} {z} 10514; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 4-byte Reload 10515; AVX512BW-NEXT: kshiftrd $4, %k6, %k3 10516; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 10517; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 10518; AVX512BW-NEXT: kandw %k0, %k3, %k2 10519; AVX512BW-NEXT: kshiftlw $15, %k3, %k0 10520; AVX512BW-NEXT: kshiftrw $14, %k0, %k5 10521; AVX512BW-NEXT: korw %k5, %k2, %k2 10522; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 10523; AVX512BW-NEXT: kandw %k3, %k2, %k2 10524; AVX512BW-NEXT: kshiftrw $13, %k0, %k5 10525; AVX512BW-NEXT: korw %k5, %k2, %k2 10526; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 10527; AVX512BW-NEXT: kandw %k3, %k2, %k5 10528; AVX512BW-NEXT: kshiftrd $5, %k6, %k2 10529; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 10530; AVX512BW-NEXT: kshiftrw $12, %k2, %k6 10531; AVX512BW-NEXT: korw %k6, %k5, %k5 10532; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 10533; AVX512BW-NEXT: kandw %k3, %k5, %k5 10534; AVX512BW-NEXT: kshiftrw $11, %k2, %k6 10535; AVX512BW-NEXT: korw %k6, %k5, %k5 10536; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 10537; AVX512BW-NEXT: kandw %k3, %k5, %k5 10538; AVX512BW-NEXT: kshiftrw $10, %k2, %k6 10539; AVX512BW-NEXT: korw %k6, %k5, %k5 10540; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 10541; AVX512BW-NEXT: kandw %k3, %k5, %k5 10542; AVX512BW-NEXT: kshiftrw $9, %k2, %k6 10543; AVX512BW-NEXT: korw %k6, %k5, %k5 10544; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 10545; AVX512BW-NEXT: kandw %k3, %k5, %k5 10546; AVX512BW-NEXT: kshiftrw $8, %k2, %k6 10547; AVX512BW-NEXT: korw %k6, %k5, %k5 10548; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 10549; AVX512BW-NEXT: kandw %k3, %k5, %k5 10550; AVX512BW-NEXT: kshiftrw $7, %k2, %k6 10551; AVX512BW-NEXT: korw %k6, %k5, %k5 10552; AVX512BW-NEXT: kandw %k1, %k5, %k5 10553; AVX512BW-NEXT: kshiftrw $6, %k2, %k2 10554; AVX512BW-NEXT: korw %k2, %k5, %k2 10555; AVX512BW-NEXT: kandw %k4, %k2, %k5 10556; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload 10557; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 10558; AVX512BW-NEXT: kshiftrw $5, %k2, %k6 10559; AVX512BW-NEXT: korw %k6, %k5, %k5 10560; AVX512BW-NEXT: kandw %k7, %k5, %k5 10561; AVX512BW-NEXT: kshiftrw $4, %k2, %k6 10562; AVX512BW-NEXT: korw %k6, %k5, %k5 10563; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 10564; AVX512BW-NEXT: kandw %k3, %k5, %k5 10565; 
AVX512BW-NEXT: kshiftrw $3, %k2, %k6 10566; AVX512BW-NEXT: korw %k6, %k5, %k5 10567; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 10568; AVX512BW-NEXT: kandw %k7, %k5, %k5 10569; AVX512BW-NEXT: kshiftrw $2, %k2, %k6 10570; AVX512BW-NEXT: korw %k6, %k5, %k5 10571; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 10572; AVX512BW-NEXT: kandw %k6, %k5, %k5 10573; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 10574; AVX512BW-NEXT: korw %k1, %k5, %k1 10575; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 10576; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 10577; AVX512BW-NEXT: korw %k2, %k1, %k1 10578; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm12 {%k1} {z} 10579; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload 10580; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 10581; AVX512BW-NEXT: kandw %k2, %k1, %k1 10582; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 10583; AVX512BW-NEXT: kshiftrw $14, %k4, %k2 10584; AVX512BW-NEXT: korw %k2, %k1, %k1 10585; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 10586; AVX512BW-NEXT: kandw %k2, %k1, %k1 10587; AVX512BW-NEXT: kshiftrw $13, %k4, %k2 10588; AVX512BW-NEXT: korw %k2, %k1, %k1 10589; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 10590; AVX512BW-NEXT: kandw %k2, %k1, %k1 10591; AVX512BW-NEXT: kshiftrw $12, %k4, %k2 10592; AVX512BW-NEXT: korw %k2, %k1, %k1 10593; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 10594; AVX512BW-NEXT: kandw %k2, %k1, %k1 10595; AVX512BW-NEXT: kshiftrw $11, %k4, %k2 10596; AVX512BW-NEXT: korw %k2, %k1, %k1 10597; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 10598; AVX512BW-NEXT: kandw %k2, %k1, %k2 10599; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload 10600; AVX512BW-NEXT: kshiftrd $3, %k1, %k1 10601; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 10602; AVX512BW-NEXT: kshiftrw $10, %k1, %k4 10603; AVX512BW-NEXT: korw %k4, %k2, %k2 10604; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 10605; AVX512BW-NEXT: kandw %k4, %k2, %k2 10606; AVX512BW-NEXT: kshiftrw $9, %k1, %k4 10607; AVX512BW-NEXT: korw %k4, %k2, %k2 10608; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 10609; AVX512BW-NEXT: kandw %k4, %k2, %k2 10610; AVX512BW-NEXT: kshiftrw $8, %k1, %k4 10611; AVX512BW-NEXT: korw %k4, %k2, %k2 10612; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 10613; AVX512BW-NEXT: kandw %k4, %k2, %k2 10614; AVX512BW-NEXT: kshiftrw $7, %k1, %k4 10615; AVX512BW-NEXT: korw %k4, %k2, %k2 10616; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 10617; AVX512BW-NEXT: kandw %k4, %k2, %k2 10618; AVX512BW-NEXT: kshiftrw $6, %k1, %k4 10619; AVX512BW-NEXT: korw %k4, %k2, %k2 10620; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 10621; AVX512BW-NEXT: kandw %k4, %k2, %k2 10622; AVX512BW-NEXT: kshiftrw $5, %k1, %k4 10623; AVX512BW-NEXT: korw %k4, %k2, %k2 10624; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 10625; AVX512BW-NEXT: kandw %k4, %k2, %k2 10626; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 10627; AVX512BW-NEXT: korw %k1, %k2, %k1 10628; AVX512BW-NEXT: kandw %k3, %k1, %k1 10629; AVX512BW-NEXT: kshiftrw $3, %k0, %k2 10630; AVX512BW-NEXT: korw %k2, %k1, %k1 10631; AVX512BW-NEXT: kandw %k7, %k1, %k1 10632; AVX512BW-NEXT: kshiftrw $2, %k0, %k2 10633; AVX512BW-NEXT: korw %k2, %k1, %k1 10634; AVX512BW-NEXT: kandw %k6, %k1, %k1 10635; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 
4-byte Reload 10636; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 10637; AVX512BW-NEXT: korw %k2, %k1, %k1 10638; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 10639; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 10640; AVX512BW-NEXT: korw %k0, %k1, %k1 10641; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm13 {%k1} {z} 10642; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rdx) 10643; AVX512BW-NEXT: vmovdqa64 %zmm12, 128(%rdx) 10644; AVX512BW-NEXT: vmovdqa64 %zmm11, 192(%rdx) 10645; AVX512BW-NEXT: vmovdqa64 %zmm10, 256(%rdx) 10646; AVX512BW-NEXT: vmovdqa64 %zmm9, 320(%rdx) 10647; AVX512BW-NEXT: vmovdqa64 %zmm8, 384(%rdx) 10648; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%rdx) 10649; AVX512BW-NEXT: vmovdqa64 %zmm6, 512(%rdx) 10650; AVX512BW-NEXT: vmovdqa64 %zmm5, 576(%rdx) 10651; AVX512BW-NEXT: vmovdqa64 %zmm4, 640(%rdx) 10652; AVX512BW-NEXT: vmovdqa64 %zmm3, 704(%rdx) 10653; AVX512BW-NEXT: vmovdqa64 %zmm2, 768(%rdx) 10654; AVX512BW-NEXT: vmovdqa64 %zmm1, 832(%rdx) 10655; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) 10656; AVX512BW-NEXT: vzeroupper 10657; AVX512BW-NEXT: retq 10658 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 10659 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 10660 %tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <224 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31> 10661 %data = call <224 x i32> @llvm.masked.load.v224i32.p0(ptr %in.vec, i32 64, <224 x i1> %tgt.mask, <224 x i32> poison) 10662 store <224 x i32> %data, ptr %out.vec, align 64 10663 ret void 10664} 10665 10666define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { 
10667; AVX512F-ONLY-LABEL: mask_replication_factor7_vf64: 10668; AVX512F-ONLY: # %bb.0: 10669; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 10670; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 10671; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] 10672; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm5, %zmm0 10673; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 10674; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 10675; AVX512F-ONLY-NEXT: movw $1, %ax 10676; AVX512F-ONLY-NEXT: kmovw %eax, %k1 10677; AVX512F-ONLY-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} 10678; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1 10679; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm8 {%k1} {z} = -1 10680; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k1 10681; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm9 {%k1} {z} = -1 10682; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 10683; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm11 {%k1} {z} = -1 10684; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 10685; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm13 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] 10686; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm13, %zmm0 10687; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm15 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] 10688; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm15, %zmm2 10689; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm16 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] 10690; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm16, %zmm3 10691; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm17 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] 10692; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm17, %zmm4 10693; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm18 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] 10694; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm18, %zmm6 10695; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm19 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] 10696; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm19, %zmm7 10697; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm5, %zmm8 10698; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm13, %zmm10 10699; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm15, %zmm12 10700; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm16, %zmm14 10701; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm17, %zmm20 10702; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm18, %zmm21 10703; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm19, %zmm22 10704; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm5, %zmm23 10705; AVX512F-ONLY-NEXT: vpermd %zmm11, %zmm13, %zmm24 10706; AVX512F-ONLY-NEXT: vpermd %zmm11, %zmm15, %zmm25 10707; AVX512F-ONLY-NEXT: vpermd %zmm11, %zmm16, %zmm26 10708; AVX512F-ONLY-NEXT: vpermd %zmm11, %zmm17, %zmm27 10709; AVX512F-ONLY-NEXT: vpermd %zmm11, %zmm18, %zmm28 10710; AVX512F-ONLY-NEXT: vpermd %zmm11, %zmm5, %zmm29 10711; AVX512F-ONLY-NEXT: vpermd %zmm11, %zmm19, %zmm30 10712; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm13, %zmm31 10713; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm15, %zmm15 10714; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm16, %zmm13 10715; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm17, %zmm11 10716; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm18, %zmm9 10717; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm19, %zmm5 10718; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} 10719; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 10720; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm5 {%k1} {z} 10721; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1 10722; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm9 {%k1} {z} 10723; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1 10724; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm11 {%k1} {z} 10725; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k1 10726; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm13 {%k1} {z} 
10727; AVX512F-ONLY-NEXT: vptestmd %zmm15, %zmm15, %k1 10728; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm15 {%k1} {z} 10729; AVX512F-ONLY-NEXT: vptestmd %zmm31, %zmm31, %k1 10730; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm16 {%k1} {z} 10731; AVX512F-ONLY-NEXT: vptestmd %zmm29, %zmm29, %k1 10732; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm17 {%k1} {z} 10733; AVX512F-ONLY-NEXT: vptestmd %zmm30, %zmm30, %k1 10734; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm18 {%k1} {z} 10735; AVX512F-ONLY-NEXT: vptestmd %zmm28, %zmm28, %k1 10736; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm19 {%k1} {z} 10737; AVX512F-ONLY-NEXT: vptestmd %zmm27, %zmm27, %k1 10738; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm27 {%k1} {z} 10739; AVX512F-ONLY-NEXT: vptestmd %zmm26, %zmm26, %k1 10740; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm26 {%k1} {z} 10741; AVX512F-ONLY-NEXT: vptestmd %zmm25, %zmm25, %k1 10742; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm25 {%k1} {z} 10743; AVX512F-ONLY-NEXT: vptestmd %zmm24, %zmm24, %k1 10744; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm24 {%k1} {z} 10745; AVX512F-ONLY-NEXT: vptestmd %zmm23, %zmm23, %k1 10746; AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm23 {%k1} {z} 10747; AVX512F-ONLY-NEXT: vptestmd %zmm22, %zmm22, %k1 10748; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm22 {%k1} {z} 10749; AVX512F-ONLY-NEXT: vptestmd %zmm21, %zmm21, %k1 10750; AVX512F-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm21 {%k1} {z} 10751; AVX512F-ONLY-NEXT: vptestmd %zmm20, %zmm20, %k1 10752; AVX512F-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm20 {%k1} {z} 10753; AVX512F-ONLY-NEXT: vptestmd %zmm14, %zmm14, %k1 10754; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm14 {%k1} {z} 10755; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1 10756; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm12 {%k1} {z} 10757; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1 10758; AVX512F-ONLY-NEXT: vmovdqa32 1280(%rsi), %zmm10 {%k1} {z} 10759; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 10760; AVX512F-ONLY-NEXT: vmovdqa32 1344(%rsi), %zmm8 {%k1} {z} 10761; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 10762; AVX512F-ONLY-NEXT: vmovdqa32 1408(%rsi), %zmm7 {%k1} {z} 10763; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 10764; AVX512F-ONLY-NEXT: vmovdqa32 1472(%rsi), %zmm6 {%k1} {z} 10765; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 10766; AVX512F-ONLY-NEXT: vmovdqa32 1536(%rsi), %zmm4 {%k1} {z} 10767; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 10768; AVX512F-ONLY-NEXT: vmovdqa32 1600(%rsi), %zmm3 {%k1} {z} 10769; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 10770; AVX512F-ONLY-NEXT: vmovdqa32 1664(%rsi), %zmm2 {%k1} {z} 10771; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 10772; AVX512F-ONLY-NEXT: vmovdqa32 1728(%rsi), %zmm0 {%k1} {z} 10773; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 1728(%rdx) 10774; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 1664(%rdx) 10775; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 1600(%rdx) 10776; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 1536(%rdx) 10777; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 1472(%rdx) 10778; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 1408(%rdx) 10779; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 1344(%rdx) 10780; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 1280(%rdx) 10781; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 1216(%rdx) 10782; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 1152(%rdx) 10783; AVX512F-ONLY-NEXT: vmovdqa64 %zmm20, 1088(%rdx) 10784; AVX512F-ONLY-NEXT: vmovdqa64 %zmm21, 1024(%rdx) 10785; AVX512F-ONLY-NEXT: vmovdqa64 %zmm22, 960(%rdx) 10786; AVX512F-ONLY-NEXT: vmovdqa64 %zmm23, 896(%rdx) 10787; AVX512F-ONLY-NEXT: vmovdqa64 %zmm24, 832(%rdx) 10788; 
AVX512F-ONLY-NEXT: vmovdqa64 %zmm25, 768(%rdx) 10789; AVX512F-ONLY-NEXT: vmovdqa64 %zmm26, 704(%rdx) 10790; AVX512F-ONLY-NEXT: vmovdqa64 %zmm27, 640(%rdx) 10791; AVX512F-ONLY-NEXT: vmovdqa64 %zmm19, 576(%rdx) 10792; AVX512F-ONLY-NEXT: vmovdqa64 %zmm18, 512(%rdx) 10793; AVX512F-ONLY-NEXT: vmovdqa64 %zmm17, 448(%rdx) 10794; AVX512F-ONLY-NEXT: vmovdqa64 %zmm16, 384(%rdx) 10795; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, 320(%rdx) 10796; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 256(%rdx) 10797; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 192(%rdx) 10798; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 128(%rdx) 10799; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 64(%rdx) 10800; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) 10801; AVX512F-ONLY-NEXT: vzeroupper 10802; AVX512F-ONLY-NEXT: retq 10803; 10804; AVX512DQ-LABEL: mask_replication_factor7_vf64: 10805; AVX512DQ: # %bb.0: 10806; AVX512DQ-NEXT: kmovw (%rdi), %k0 10807; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 10808; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] 10809; AVX512DQ-NEXT: vpermd %zmm1, %zmm5, %zmm0 10810; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 10811; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 10812; AVX512DQ-NEXT: movw $1, %ax 10813; AVX512DQ-NEXT: kmovw %eax, %k1 10814; AVX512DQ-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} 10815; AVX512DQ-NEXT: kmovw 6(%rdi), %k0 10816; AVX512DQ-NEXT: vpmovm2d %k0, %zmm8 10817; AVX512DQ-NEXT: kmovw 4(%rdi), %k0 10818; AVX512DQ-NEXT: vpmovm2d %k0, %zmm9 10819; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 10820; AVX512DQ-NEXT: vpmovm2d %k0, %zmm11 10821; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 10822; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm13 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] 10823; AVX512DQ-NEXT: vpermd %zmm8, %zmm13, %zmm0 10824; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm15 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] 10825; AVX512DQ-NEXT: vpermd %zmm8, %zmm15, %zmm2 10826; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm16 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] 10827; AVX512DQ-NEXT: vpermd %zmm8, %zmm16, %zmm3 10828; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm17 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] 10829; AVX512DQ-NEXT: vpermd %zmm8, %zmm17, %zmm4 10830; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] 10831; AVX512DQ-NEXT: vpermd %zmm8, %zmm18, %zmm6 10832; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm19 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] 10833; AVX512DQ-NEXT: vpermd %zmm8, %zmm19, %zmm7 10834; AVX512DQ-NEXT: vpermd %zmm8, %zmm5, %zmm8 10835; AVX512DQ-NEXT: vpermd %zmm9, %zmm13, %zmm10 10836; AVX512DQ-NEXT: vpermd %zmm9, %zmm15, %zmm12 10837; AVX512DQ-NEXT: vpermd %zmm9, %zmm16, %zmm14 10838; AVX512DQ-NEXT: vpermd %zmm9, %zmm17, %zmm20 10839; AVX512DQ-NEXT: vpermd %zmm9, %zmm18, %zmm21 10840; AVX512DQ-NEXT: vpermd %zmm9, %zmm19, %zmm22 10841; AVX512DQ-NEXT: vpermd %zmm9, %zmm5, %zmm23 10842; AVX512DQ-NEXT: vpermd %zmm11, %zmm13, %zmm24 10843; AVX512DQ-NEXT: vpermd %zmm11, %zmm15, %zmm25 10844; AVX512DQ-NEXT: vpermd %zmm11, %zmm16, %zmm26 10845; AVX512DQ-NEXT: vpermd %zmm11, %zmm17, %zmm27 10846; AVX512DQ-NEXT: vpermd %zmm11, %zmm18, %zmm28 10847; AVX512DQ-NEXT: vpermd %zmm11, %zmm5, %zmm29 10848; AVX512DQ-NEXT: vpermd %zmm11, %zmm19, %zmm30 10849; AVX512DQ-NEXT: vpermd %zmm1, %zmm13, %zmm31 10850; AVX512DQ-NEXT: vpermd %zmm1, %zmm15, %zmm15 10851; AVX512DQ-NEXT: vpermd %zmm1, %zmm16, %zmm13 10852; AVX512DQ-NEXT: vpermd %zmm1, %zmm17, %zmm11 10853; AVX512DQ-NEXT: vpermd %zmm1, %zmm18, %zmm9 10854; AVX512DQ-NEXT: vpermd %zmm1, %zmm19, %zmm5 10855; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} 10856; AVX512DQ-NEXT: vpmovd2m 
%zmm5, %k1 10857; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm5 {%k1} {z} 10858; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1 10859; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm9 {%k1} {z} 10860; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1 10861; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm11 {%k1} {z} 10862; AVX512DQ-NEXT: vpmovd2m %zmm13, %k1 10863; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm13 {%k1} {z} 10864; AVX512DQ-NEXT: vpmovd2m %zmm15, %k1 10865; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm15 {%k1} {z} 10866; AVX512DQ-NEXT: vpmovd2m %zmm31, %k1 10867; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm16 {%k1} {z} 10868; AVX512DQ-NEXT: vpmovd2m %zmm29, %k1 10869; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm17 {%k1} {z} 10870; AVX512DQ-NEXT: vpmovd2m %zmm30, %k1 10871; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm18 {%k1} {z} 10872; AVX512DQ-NEXT: vpmovd2m %zmm28, %k1 10873; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm19 {%k1} {z} 10874; AVX512DQ-NEXT: vpmovd2m %zmm27, %k1 10875; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm27 {%k1} {z} 10876; AVX512DQ-NEXT: vpmovd2m %zmm26, %k1 10877; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm26 {%k1} {z} 10878; AVX512DQ-NEXT: vpmovd2m %zmm25, %k1 10879; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm25 {%k1} {z} 10880; AVX512DQ-NEXT: vpmovd2m %zmm24, %k1 10881; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm24 {%k1} {z} 10882; AVX512DQ-NEXT: vpmovd2m %zmm23, %k1 10883; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm23 {%k1} {z} 10884; AVX512DQ-NEXT: vpmovd2m %zmm22, %k1 10885; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm22 {%k1} {z} 10886; AVX512DQ-NEXT: vpmovd2m %zmm21, %k1 10887; AVX512DQ-NEXT: vmovdqa32 1024(%rsi), %zmm21 {%k1} {z} 10888; AVX512DQ-NEXT: vpmovd2m %zmm20, %k1 10889; AVX512DQ-NEXT: vmovdqa32 1088(%rsi), %zmm20 {%k1} {z} 10890; AVX512DQ-NEXT: vpmovd2m %zmm14, %k1 10891; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm14 {%k1} {z} 10892; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1 10893; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm12 {%k1} {z} 10894; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1 10895; AVX512DQ-NEXT: vmovdqa32 1280(%rsi), %zmm10 {%k1} {z} 10896; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 10897; AVX512DQ-NEXT: vmovdqa32 1344(%rsi), %zmm8 {%k1} {z} 10898; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 10899; AVX512DQ-NEXT: vmovdqa32 1408(%rsi), %zmm7 {%k1} {z} 10900; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 10901; AVX512DQ-NEXT: vmovdqa32 1472(%rsi), %zmm6 {%k1} {z} 10902; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 10903; AVX512DQ-NEXT: vmovdqa32 1536(%rsi), %zmm4 {%k1} {z} 10904; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 10905; AVX512DQ-NEXT: vmovdqa32 1600(%rsi), %zmm3 {%k1} {z} 10906; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 10907; AVX512DQ-NEXT: vmovdqa32 1664(%rsi), %zmm2 {%k1} {z} 10908; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 10909; AVX512DQ-NEXT: vmovdqa32 1728(%rsi), %zmm0 {%k1} {z} 10910; AVX512DQ-NEXT: vmovdqa64 %zmm0, 1728(%rdx) 10911; AVX512DQ-NEXT: vmovdqa64 %zmm2, 1664(%rdx) 10912; AVX512DQ-NEXT: vmovdqa64 %zmm3, 1600(%rdx) 10913; AVX512DQ-NEXT: vmovdqa64 %zmm4, 1536(%rdx) 10914; AVX512DQ-NEXT: vmovdqa64 %zmm6, 1472(%rdx) 10915; AVX512DQ-NEXT: vmovdqa64 %zmm7, 1408(%rdx) 10916; AVX512DQ-NEXT: vmovdqa64 %zmm8, 1344(%rdx) 10917; AVX512DQ-NEXT: vmovdqa64 %zmm10, 1280(%rdx) 10918; AVX512DQ-NEXT: vmovdqa64 %zmm12, 1216(%rdx) 10919; AVX512DQ-NEXT: vmovdqa64 %zmm14, 1152(%rdx) 10920; AVX512DQ-NEXT: vmovdqa64 %zmm20, 1088(%rdx) 10921; AVX512DQ-NEXT: vmovdqa64 %zmm21, 1024(%rdx) 10922; AVX512DQ-NEXT: vmovdqa64 %zmm22, 960(%rdx) 10923; AVX512DQ-NEXT: vmovdqa64 %zmm23, 896(%rdx) 10924; AVX512DQ-NEXT: vmovdqa64 %zmm24, 832(%rdx) 10925; AVX512DQ-NEXT: vmovdqa64 %zmm25, 768(%rdx) 10926; 
AVX512DQ-NEXT: vmovdqa64 %zmm26, 704(%rdx) 10927; AVX512DQ-NEXT: vmovdqa64 %zmm27, 640(%rdx) 10928; AVX512DQ-NEXT: vmovdqa64 %zmm19, 576(%rdx) 10929; AVX512DQ-NEXT: vmovdqa64 %zmm18, 512(%rdx) 10930; AVX512DQ-NEXT: vmovdqa64 %zmm17, 448(%rdx) 10931; AVX512DQ-NEXT: vmovdqa64 %zmm16, 384(%rdx) 10932; AVX512DQ-NEXT: vmovdqa64 %zmm15, 320(%rdx) 10933; AVX512DQ-NEXT: vmovdqa64 %zmm13, 256(%rdx) 10934; AVX512DQ-NEXT: vmovdqa64 %zmm11, 192(%rdx) 10935; AVX512DQ-NEXT: vmovdqa64 %zmm9, 128(%rdx) 10936; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rdx) 10937; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx) 10938; AVX512DQ-NEXT: vzeroupper 10939; AVX512DQ-NEXT: retq 10940; 10941; AVX512BW-LABEL: mask_replication_factor7_vf64: 10942; AVX512BW: # %bb.0: 10943; AVX512BW-NEXT: movw $-3, %ax 10944; AVX512BW-NEXT: kmovd %eax, %k1 10945; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 10946; AVX512BW-NEXT: kmovw (%rdi), %k0 10947; AVX512BW-NEXT: kandw %k1, %k0, %k1 10948; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 10949; AVX512BW-NEXT: kshiftrw $14, %k0, %k2 10950; AVX512BW-NEXT: korw %k2, %k1, %k1 10951; AVX512BW-NEXT: movw $-5, %ax 10952; AVX512BW-NEXT: kmovd %eax, %k2 10953; AVX512BW-NEXT: kandw %k2, %k1, %k1 10954; AVX512BW-NEXT: kmovq %k2, %k3 10955; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 10956; AVX512BW-NEXT: kshiftrw $13, %k0, %k2 10957; AVX512BW-NEXT: korw %k2, %k1, %k1 10958; AVX512BW-NEXT: movw $-9, %ax 10959; AVX512BW-NEXT: kmovd %eax, %k2 10960; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 10961; AVX512BW-NEXT: kandw %k2, %k1, %k1 10962; AVX512BW-NEXT: kshiftrw $12, %k0, %k2 10963; AVX512BW-NEXT: korw %k2, %k1, %k1 10964; AVX512BW-NEXT: movw $-17, %ax 10965; AVX512BW-NEXT: kmovd %eax, %k2 10966; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 10967; AVX512BW-NEXT: kandw %k2, %k1, %k1 10968; AVX512BW-NEXT: kshiftrw $11, %k0, %k2 10969; AVX512BW-NEXT: korw %k2, %k1, %k1 10970; AVX512BW-NEXT: movw $-33, %ax 10971; AVX512BW-NEXT: kmovd %eax, %k2 10972; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 10973; AVX512BW-NEXT: kandw %k2, %k1, %k1 10974; AVX512BW-NEXT: kshiftrw $10, %k0, %k2 10975; AVX512BW-NEXT: korw %k2, %k1, %k1 10976; AVX512BW-NEXT: movw $-65, %ax 10977; AVX512BW-NEXT: kmovd %eax, %k2 10978; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 10979; AVX512BW-NEXT: kandw %k2, %k1, %k1 10980; AVX512BW-NEXT: kshiftrw $9, %k0, %k0 10981; AVX512BW-NEXT: korw %k0, %k1, %k0 10982; AVX512BW-NEXT: movw $-129, %ax 10983; AVX512BW-NEXT: kmovd %eax, %k1 10984; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 10985; AVX512BW-NEXT: kandw %k1, %k0, %k1 10986; AVX512BW-NEXT: kmovq (%rdi), %k4 10987; AVX512BW-NEXT: kshiftrq $1, %k4, %k0 10988; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 10989; AVX512BW-NEXT: kshiftrw $8, %k0, %k2 10990; AVX512BW-NEXT: korw %k2, %k1, %k1 10991; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF 10992; AVX512BW-NEXT: kmovd %eax, %k2 10993; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 10994; AVX512BW-NEXT: kandw %k2, %k1, %k1 10995; AVX512BW-NEXT: kshiftrw $7, %k0, %k2 10996; AVX512BW-NEXT: korw %k2, %k1, %k1 10997; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF 10998; AVX512BW-NEXT: kmovd %eax, %k2 10999; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 11000; AVX512BW-NEXT: kandw %k2, %k1, %k1 11001; AVX512BW-NEXT: kshiftrw $6, %k0, %k2 11002; AVX512BW-NEXT: korw %k2, %k1, %k1 11003; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF 11004; 
AVX512BW-NEXT: kmovd %eax, %k5 11005; AVX512BW-NEXT: kandw %k5, %k1, %k1 11006; AVX512BW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 11007; AVX512BW-NEXT: kshiftrw $5, %k0, %k2 11008; AVX512BW-NEXT: korw %k2, %k1, %k1 11009; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF 11010; AVX512BW-NEXT: kmovd %eax, %k2 11011; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 11012; AVX512BW-NEXT: kandw %k2, %k1, %k1 11013; AVX512BW-NEXT: kshiftrw $4, %k0, %k2 11014; AVX512BW-NEXT: korw %k2, %k1, %k1 11015; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF 11016; AVX512BW-NEXT: kmovd %eax, %k2 11017; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 11018; AVX512BW-NEXT: kandw %k2, %k1, %k1 11019; AVX512BW-NEXT: kshiftrw $3, %k0, %k2 11020; AVX512BW-NEXT: korw %k2, %k1, %k1 11021; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF 11022; AVX512BW-NEXT: kmovd %eax, %k2 11023; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 11024; AVX512BW-NEXT: kandw %k2, %k1, %k1 11025; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 11026; AVX512BW-NEXT: korw %k0, %k1, %k0 11027; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF 11028; AVX512BW-NEXT: kmovd %eax, %k1 11029; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 11030; AVX512BW-NEXT: kandw %k1, %k0, %k0 11031; AVX512BW-NEXT: kshiftrq $2, %k4, %k1 11032; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 11033; AVX512BW-NEXT: korw %k7, %k0, %k0 11034; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 11035; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 11036; AVX512BW-NEXT: kshiftlw $15, %k1, %k7 11037; AVX512BW-NEXT: korw %k7, %k0, %k6 11038; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k6} {z} 11039; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 11040; AVX512BW-NEXT: kandw %k2, %k1, %k0 11041; AVX512BW-NEXT: kshiftrw $14, %k7, %k1 11042; AVX512BW-NEXT: korw %k1, %k0, %k0 11043; AVX512BW-NEXT: kandw %k3, %k0, %k0 11044; AVX512BW-NEXT: kshiftrw $13, %k7, %k1 11045; AVX512BW-NEXT: korw %k1, %k0, %k0 11046; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 11047; AVX512BW-NEXT: kandw %k3, %k0, %k0 11048; AVX512BW-NEXT: kshiftrw $12, %k7, %k1 11049; AVX512BW-NEXT: korw %k1, %k0, %k0 11050; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 11051; AVX512BW-NEXT: kandw %k1, %k0, %k0 11052; AVX512BW-NEXT: kshiftrw $11, %k7, %k1 11053; AVX512BW-NEXT: korw %k1, %k0, %k0 11054; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 11055; AVX512BW-NEXT: kandw %k1, %k0, %k1 11056; AVX512BW-NEXT: kmovq %k4, %k7 11057; AVX512BW-NEXT: kmovq %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 11058; AVX512BW-NEXT: kshiftrq $3, %k4, %k0 11059; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 11060; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 11061; AVX512BW-NEXT: korw %k6, %k1, %k1 11062; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 11063; AVX512BW-NEXT: kandw %k4, %k1, %k1 11064; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 11065; AVX512BW-NEXT: korw %k6, %k1, %k1 11066; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 11067; AVX512BW-NEXT: kandw %k6, %k1, %k1 11068; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 11069; AVX512BW-NEXT: korw %k6, %k1, %k1 11070; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 11071; AVX512BW-NEXT: kandw %k4, %k1, %k1 11072; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 11073; AVX512BW-NEXT: korw %k6, %k1, %k1 11074; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 11075; AVX512BW-NEXT: kandw %k4, %k1, %k1 11076; AVX512BW-NEXT: 
kshiftrw $6, %k0, %k6 11077; AVX512BW-NEXT: korw %k6, %k1, %k1 11078; AVX512BW-NEXT: kandw %k5, %k1, %k1 11079; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 11080; AVX512BW-NEXT: korw %k6, %k1, %k1 11081; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 11082; AVX512BW-NEXT: kandw %k5, %k1, %k1 11083; AVX512BW-NEXT: kshiftrw $4, %k0, %k0 11084; AVX512BW-NEXT: korw %k0, %k1, %k0 11085; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 11086; AVX512BW-NEXT: kandw %k1, %k0, %k1 11087; AVX512BW-NEXT: kshiftrq $4, %k7, %k6 11088; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 11089; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 11090; AVX512BW-NEXT: korw %k7, %k1, %k1 11091; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 11092; AVX512BW-NEXT: kandw %k7, %k1, %k1 11093; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 11094; AVX512BW-NEXT: korw %k7, %k1, %k1 11095; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 11096; AVX512BW-NEXT: kandw %k7, %k1, %k1 11097; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 11098; AVX512BW-NEXT: korw %k7, %k1, %k1 11099; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 11100; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 11101; AVX512BW-NEXT: korw %k0, %k1, %k1 11102; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z} 11103; AVX512BW-NEXT: kandw %k2, %k6, %k1 11104; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 11105; AVX512BW-NEXT: korw %k6, %k1, %k1 11106; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 11107; AVX512BW-NEXT: kandw %k2, %k1, %k1 11108; AVX512BW-NEXT: kshiftrw $13, %k0, %k0 11109; AVX512BW-NEXT: korw %k0, %k1, %k0 11110; AVX512BW-NEXT: kandw %k3, %k0, %k1 11111; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload 11112; AVX512BW-NEXT: kshiftrq $5, %k7, %k0 11113; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 11114; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 11115; AVX512BW-NEXT: korw %k6, %k1, %k1 11116; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 11117; AVX512BW-NEXT: kandw %k3, %k1, %k1 11118; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 11119; AVX512BW-NEXT: korw %k6, %k1, %k1 11120; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 11121; AVX512BW-NEXT: kandw %k3, %k1, %k1 11122; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 11123; AVX512BW-NEXT: korw %k6, %k1, %k1 11124; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 11125; AVX512BW-NEXT: kandw %k3, %k1, %k1 11126; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 11127; AVX512BW-NEXT: korw %k6, %k1, %k1 11128; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 11129; AVX512BW-NEXT: kandw %k3, %k1, %k1 11130; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 11131; AVX512BW-NEXT: korw %k6, %k1, %k1 11132; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 11133; AVX512BW-NEXT: kandw %k6, %k1, %k1 11134; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 11135; AVX512BW-NEXT: korw %k6, %k1, %k1 11136; AVX512BW-NEXT: kandw %k4, %k1, %k1 11137; AVX512BW-NEXT: kshiftrw $6, %k0, %k0 11138; AVX512BW-NEXT: korw %k0, %k1, %k0 11139; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 11140; AVX512BW-NEXT: kandw %k1, %k0, %k6 11141; AVX512BW-NEXT: kshiftrq $6, %k7, %k0 11142; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 11143; AVX512BW-NEXT: kshiftrw $5, %k1, %k7 11144; AVX512BW-NEXT: korw %k7, %k6, %k6 11145; AVX512BW-NEXT: kandw %k5, %k6, %k6 11146; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 11147; AVX512BW-NEXT: korw %k7, %k6, %k6 11148; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 11149; AVX512BW-NEXT: kandw %k4, %k6, 
%k6 11150; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 11151; AVX512BW-NEXT: korw %k7, %k6, %k6 11152; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 11153; AVX512BW-NEXT: kandw %k4, %k6, %k6 11154; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 11155; AVX512BW-NEXT: korw %k7, %k6, %k6 11156; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 11157; AVX512BW-NEXT: kandw %k4, %k6, %k6 11158; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 11159; AVX512BW-NEXT: korw %k7, %k6, %k6 11160; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 11161; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 11162; AVX512BW-NEXT: korw %k1, %k6, %k1 11163; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} 11164; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 11165; AVX512BW-NEXT: kandw %k1, %k0, %k1 11166; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 8-byte Reload 11167; AVX512BW-NEXT: kshiftrq $7, %k4, %k0 11168; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 11169; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 11170; AVX512BW-NEXT: korw %k6, %k1, %k1 11171; AVX512BW-NEXT: kandw %k2, %k1, %k1 11172; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 11173; AVX512BW-NEXT: korw %k6, %k1, %k1 11174; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 11175; AVX512BW-NEXT: kandw %k2, %k1, %k1 11176; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 11177; AVX512BW-NEXT: korw %k6, %k1, %k1 11178; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 11179; AVX512BW-NEXT: kandw %k5, %k1, %k1 11180; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 11181; AVX512BW-NEXT: korw %k6, %k1, %k1 11182; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 11183; AVX512BW-NEXT: kandw %k5, %k1, %k1 11184; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 11185; AVX512BW-NEXT: korw %k6, %k1, %k1 11186; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 11187; AVX512BW-NEXT: kandw %k6, %k1, %k1 11188; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 11189; AVX512BW-NEXT: korw %k6, %k1, %k1 11190; AVX512BW-NEXT: kandw %k3, %k1, %k1 11191; AVX512BW-NEXT: kshiftrw $8, %k0, %k0 11192; AVX512BW-NEXT: korw %k0, %k1, %k0 11193; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 11194; AVX512BW-NEXT: kandw %k1, %k0, %k1 11195; AVX512BW-NEXT: kshiftrq $8, %k4, %k0 11196; AVX512BW-NEXT: kmovq %k4, %k5 11197; AVX512BW-NEXT: kshiftlw $15, %k0, %k6 11198; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 11199; AVX512BW-NEXT: korw %k7, %k1, %k1 11200; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 11201; AVX512BW-NEXT: kandw %k3, %k1, %k1 11202; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 11203; AVX512BW-NEXT: korw %k7, %k1, %k1 11204; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 11205; AVX512BW-NEXT: kandw %k4, %k1, %k1 11206; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 11207; AVX512BW-NEXT: korw %k7, %k1, %k1 11208; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 11209; AVX512BW-NEXT: kandw %k3, %k1, %k1 11210; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 11211; AVX512BW-NEXT: korw %k7, %k1, %k1 11212; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 11213; AVX512BW-NEXT: kandw %k3, %k1, %k1 11214; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 11215; AVX512BW-NEXT: korw %k7, %k1, %k1 11216; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 11217; AVX512BW-NEXT: kandw %k3, %k1, %k1 11218; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 11219; AVX512BW-NEXT: korw %k6, %k1, %k1 11220; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 11221; AVX512BW-NEXT: kandw %k6, %k1, %k1 
11222; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 11223; AVX512BW-NEXT: korw %k0, %k1, %k0 11224; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 11225; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 11226; AVX512BW-NEXT: kshiftrq $9, %k5, %k1 11227; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 11228; AVX512BW-NEXT: korw %k6, %k0, %k7 11229; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k7} {z} 11230; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 11231; AVX512BW-NEXT: kandw %k0, %k1, %k0 11232; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 11233; AVX512BW-NEXT: korw %k1, %k0, %k0 11234; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 11235; AVX512BW-NEXT: kandw %k1, %k0, %k0 11236; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 11237; AVX512BW-NEXT: korw %k1, %k0, %k0 11238; AVX512BW-NEXT: kandw %k2, %k0, %k0 11239; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 11240; AVX512BW-NEXT: korw %k1, %k0, %k0 11241; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 11242; AVX512BW-NEXT: kandw %k2, %k0, %k0 11243; AVX512BW-NEXT: kshiftrw $11, %k6, %k1 11244; AVX512BW-NEXT: korw %k1, %k0, %k0 11245; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 11246; AVX512BW-NEXT: kandw %k1, %k0, %k0 11247; AVX512BW-NEXT: kshiftrw $10, %k6, %k1 11248; AVX512BW-NEXT: korw %k1, %k0, %k0 11249; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 11250; AVX512BW-NEXT: kandw %k1, %k0, %k1 11251; AVX512BW-NEXT: kshiftrq $10, %k5, %k0 11252; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 11253; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 11254; AVX512BW-NEXT: korw %k6, %k1, %k1 11255; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 11256; AVX512BW-NEXT: kandw %k6, %k1, %k1 11257; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 11258; AVX512BW-NEXT: korw %k6, %k1, %k1 11259; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 11260; AVX512BW-NEXT: kandw %k6, %k1, %k1 11261; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 11262; AVX512BW-NEXT: korw %k6, %k1, %k1 11263; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 11264; AVX512BW-NEXT: kandw %k6, %k1, %k1 11265; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 11266; AVX512BW-NEXT: korw %k6, %k1, %k1 11267; AVX512BW-NEXT: kandw %k4, %k1, %k1 11268; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 11269; AVX512BW-NEXT: korw %k6, %k1, %k1 11270; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 11271; AVX512BW-NEXT: kandw %k4, %k1, %k1 11272; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 11273; AVX512BW-NEXT: korw %k6, %k1, %k1 11274; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 11275; AVX512BW-NEXT: kandw %k4, %k1, %k1 11276; AVX512BW-NEXT: kshiftrw $3, %k0, %k0 11277; AVX512BW-NEXT: korw %k0, %k1, %k0 11278; AVX512BW-NEXT: kandw %k3, %k0, %k1 11279; AVX512BW-NEXT: kshiftrq $11, %k5, %k6 11280; AVX512BW-NEXT: kmovq %k5, %k4 11281; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 11282; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 11283; AVX512BW-NEXT: korw %k7, %k1, %k1 11284; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 11285; AVX512BW-NEXT: kandw %k5, %k1, %k1 11286; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 11287; AVX512BW-NEXT: korw %k7, %k1, %k1 11288; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 11289; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 11290; AVX512BW-NEXT: korw %k0, %k1, %k1 11291; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z} 11292; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 11293; AVX512BW-NEXT: kandw %k1, %k6, %k1 11294; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 11295; AVX512BW-NEXT: korw 
%k6, %k1, %k1 11296; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 11297; AVX512BW-NEXT: kandw %k3, %k1, %k1 11298; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 11299; AVX512BW-NEXT: korw %k6, %k1, %k1 11300; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 11301; AVX512BW-NEXT: kandw %k3, %k1, %k1 11302; AVX512BW-NEXT: kshiftrw $12, %k0, %k0 11303; AVX512BW-NEXT: korw %k0, %k1, %k0 11304; AVX512BW-NEXT: kandw %k2, %k0, %k1 11305; AVX512BW-NEXT: kmovq %k4, %k7 11306; AVX512BW-NEXT: kshiftrq $12, %k4, %k0 11307; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 11308; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 11309; AVX512BW-NEXT: korw %k6, %k1, %k1 11310; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 11311; AVX512BW-NEXT: kandw %k2, %k1, %k1 11312; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 11313; AVX512BW-NEXT: korw %k6, %k1, %k1 11314; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 11315; AVX512BW-NEXT: kandw %k3, %k1, %k1 11316; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 11317; AVX512BW-NEXT: korw %k6, %k1, %k1 11318; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 11319; AVX512BW-NEXT: kandw %k3, %k1, %k1 11320; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 11321; AVX512BW-NEXT: korw %k6, %k1, %k1 11322; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 11323; AVX512BW-NEXT: kandw %k4, %k1, %k1 11324; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 11325; AVX512BW-NEXT: korw %k6, %k1, %k1 11326; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 11327; AVX512BW-NEXT: kandw %k3, %k1, %k1 11328; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 11329; AVX512BW-NEXT: korw %k6, %k1, %k1 11330; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 11331; AVX512BW-NEXT: kandw %k6, %k1, %k1 11332; AVX512BW-NEXT: kshiftrw $5, %k0, %k0 11333; AVX512BW-NEXT: korw %k0, %k1, %k0 11334; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 11335; AVX512BW-NEXT: kandw %k1, %k0, %k6 11336; AVX512BW-NEXT: kshiftrq $13, %k7, %k0 11337; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 11338; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 11339; AVX512BW-NEXT: korw %k7, %k6, %k6 11340; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 11341; AVX512BW-NEXT: kandw %k7, %k6, %k6 11342; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 11343; AVX512BW-NEXT: korw %k7, %k6, %k6 11344; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 11345; AVX512BW-NEXT: kandw %k7, %k6, %k6 11346; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 11347; AVX512BW-NEXT: korw %k7, %k6, %k6 11348; AVX512BW-NEXT: kandw %k5, %k6, %k6 11349; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 11350; AVX512BW-NEXT: korw %k7, %k6, %k6 11351; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 11352; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 11353; AVX512BW-NEXT: korw %k1, %k6, %k6 11354; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k6} {z} 11355; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 11356; AVX512BW-NEXT: kandw %k5, %k0, %k0 11357; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 11358; AVX512BW-NEXT: korw %k1, %k0, %k0 11359; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 11360; AVX512BW-NEXT: kandw %k1, %k0, %k1 11361; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload 11362; AVX512BW-NEXT: kshiftrq $14, %k5, %k0 11363; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 11364; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 11365; AVX512BW-NEXT: korw %k6, %k1, %k1 11366; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 11367; AVX512BW-NEXT: 
kandw %k6, %k1, %k1 11368; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 11369; AVX512BW-NEXT: korw %k6, %k1, %k1 11370; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 11371; AVX512BW-NEXT: kandw %k6, %k1, %k1 11372; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 11373; AVX512BW-NEXT: korw %k6, %k1, %k1 11374; AVX512BW-NEXT: kandw %k2, %k1, %k1 11375; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 11376; AVX512BW-NEXT: korw %k6, %k1, %k1 11377; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 11378; AVX512BW-NEXT: kandw %k2, %k1, %k1 11379; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 11380; AVX512BW-NEXT: korw %k6, %k1, %k1 11381; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 11382; AVX512BW-NEXT: kandw %k6, %k1, %k1 11383; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 11384; AVX512BW-NEXT: korw %k6, %k1, %k1 11385; AVX512BW-NEXT: kandw %k4, %k1, %k1 11386; AVX512BW-NEXT: kshiftrw $7, %k0, %k0 11387; AVX512BW-NEXT: korw %k0, %k1, %k0 11388; AVX512BW-NEXT: kandw %k3, %k0, %k6 11389; AVX512BW-NEXT: kshiftrq $15, %k5, %k1 11390; AVX512BW-NEXT: kshiftlw $15, %k1, %k0 11391; AVX512BW-NEXT: kshiftrw $6, %k0, %k7 11392; AVX512BW-NEXT: korw %k7, %k6, %k6 11393; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 11394; AVX512BW-NEXT: kandw %k3, %k6, %k6 11395; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 11396; AVX512BW-NEXT: korw %k7, %k6, %k6 11397; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 11398; AVX512BW-NEXT: kandw %k3, %k6, %k6 11399; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 11400; AVX512BW-NEXT: korw %k7, %k6, %k6 11401; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 11402; AVX512BW-NEXT: kandw %k4, %k6, %k6 11403; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 11404; AVX512BW-NEXT: korw %k7, %k6, %k6 11405; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 11406; AVX512BW-NEXT: kandw %k4, %k6, %k6 11407; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 11408; AVX512BW-NEXT: korw %k7, %k6, %k6 11409; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 11410; AVX512BW-NEXT: kandw %k4, %k6, %k6 11411; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 11412; AVX512BW-NEXT: korw %k1, %k6, %k1 11413; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 11414; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 11415; AVX512BW-NEXT: korw %k0, %k1, %k1 11416; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} 11417; AVX512BW-NEXT: kshiftrq $16, %k5, %k0 11418; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 11419; AVX512BW-NEXT: kandw %k1, %k0, %k1 11420; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 11421; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 11422; AVX512BW-NEXT: korw %k6, %k1, %k1 11423; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 11424; AVX512BW-NEXT: kandw %k6, %k1, %k1 11425; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 11426; AVX512BW-NEXT: korw %k6, %k1, %k1 11427; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 11428; AVX512BW-NEXT: kandw %k6, %k1, %k1 11429; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 11430; AVX512BW-NEXT: korw %k6, %k1, %k1 11431; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 11432; AVX512BW-NEXT: kandw %k6, %k1, %k1 11433; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 11434; AVX512BW-NEXT: korw %k6, %k1, %k1 11435; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 11436; AVX512BW-NEXT: kandw %k6, %k1, %k1 11437; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 11438; AVX512BW-NEXT: korw %k6, %k1, %k1 11439; AVX512BW-NEXT: kandw %k2, %k1, %k1 11440; AVX512BW-NEXT: kshiftrw $9, 
%k0, %k0 11441; AVX512BW-NEXT: korw %k0, %k1, %k0 11442; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 11443; AVX512BW-NEXT: kandw %k2, %k0, %k1 11444; AVX512BW-NEXT: kshiftrq $17, %k5, %k0 11445; AVX512BW-NEXT: kmovq %k5, %k7 11446; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 11447; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 11448; AVX512BW-NEXT: korw %k6, %k1, %k1 11449; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 11450; AVX512BW-NEXT: kandw %k5, %k1, %k1 11451; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 11452; AVX512BW-NEXT: korw %k6, %k1, %k1 11453; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 11454; AVX512BW-NEXT: kandw %k5, %k1, %k1 11455; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 11456; AVX512BW-NEXT: korw %k6, %k1, %k1 11457; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 11458; AVX512BW-NEXT: kandw %k5, %k1, %k1 11459; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 11460; AVX512BW-NEXT: korw %k6, %k1, %k1 11461; AVX512BW-NEXT: kandw %k3, %k1, %k1 11462; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 11463; AVX512BW-NEXT: korw %k6, %k1, %k1 11464; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 11465; AVX512BW-NEXT: kandw %k3, %k1, %k1 11466; AVX512BW-NEXT: kshiftrw $3, %k0, %k6 11467; AVX512BW-NEXT: korw %k6, %k1, %k1 11468; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 11469; AVX512BW-NEXT: kandw %k3, %k1, %k1 11470; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 11471; AVX512BW-NEXT: korw %k0, %k1, %k0 11472; AVX512BW-NEXT: kandw %k4, %k0, %k0 11473; AVX512BW-NEXT: kshiftrq $18, %k7, %k1 11474; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 11475; AVX512BW-NEXT: korw %k6, %k0, %k0 11476; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 11477; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 11478; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 11479; AVX512BW-NEXT: korw %k6, %k0, %k7 11480; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k7} {z} 11481; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 11482; AVX512BW-NEXT: kandw %k5, %k1, %k0 11483; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 11484; AVX512BW-NEXT: korw %k1, %k0, %k0 11485; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 11486; AVX512BW-NEXT: kandw %k4, %k0, %k0 11487; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 11488; AVX512BW-NEXT: korw %k1, %k0, %k0 11489; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 11490; AVX512BW-NEXT: kandw %k1, %k0, %k0 11491; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 11492; AVX512BW-NEXT: korw %k1, %k0, %k0 11493; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 11494; AVX512BW-NEXT: kandw %k3, %k0, %k0 11495; AVX512BW-NEXT: kshiftrw $11, %k6, %k1 11496; AVX512BW-NEXT: korw %k1, %k0, %k0 11497; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 11498; AVX512BW-NEXT: kandw %k1, %k0, %k1 11499; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload 11500; AVX512BW-NEXT: kshiftrq $19, %k7, %k0 11501; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 11502; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 11503; AVX512BW-NEXT: korw %k6, %k1, %k1 11504; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 11505; AVX512BW-NEXT: kandw %k6, %k1, %k1 11506; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 11507; AVX512BW-NEXT: korw %k6, %k1, %k1 11508; AVX512BW-NEXT: kandw %k2, %k1, %k1 11509; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 11510; AVX512BW-NEXT: korw %k6, %k1, %k1 11511; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 11512; AVX512BW-NEXT: kandw %k2, %k1, %k1 11513; 
AVX512BW-NEXT: kshiftrw $7, %k0, %k6 11514; AVX512BW-NEXT: korw %k6, %k1, %k1 11515; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 11516; AVX512BW-NEXT: kandw %k6, %k1, %k1 11517; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 11518; AVX512BW-NEXT: korw %k6, %k1, %k1 11519; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 11520; AVX512BW-NEXT: kandw %k6, %k1, %k1 11521; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 11522; AVX512BW-NEXT: korw %k6, %k1, %k1 11523; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 11524; AVX512BW-NEXT: kandw %k6, %k1, %k1 11525; AVX512BW-NEXT: kshiftrw $4, %k0, %k0 11526; AVX512BW-NEXT: korw %k0, %k1, %k0 11527; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 11528; AVX512BW-NEXT: kandw %k1, %k0, %k1 11529; AVX512BW-NEXT: kshiftrq $20, %k7, %k6 11530; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 11531; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 11532; AVX512BW-NEXT: korw %k7, %k1, %k1 11533; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 11534; AVX512BW-NEXT: kandw %k7, %k1, %k1 11535; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 11536; AVX512BW-NEXT: korw %k7, %k1, %k1 11537; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 11538; AVX512BW-NEXT: kandw %k7, %k1, %k1 11539; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 11540; AVX512BW-NEXT: korw %k7, %k1, %k1 11541; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 11542; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 11543; AVX512BW-NEXT: korw %k0, %k1, %k1 11544; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k1} {z} 11545; AVX512BW-NEXT: kandw %k5, %k6, %k1 11546; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 11547; AVX512BW-NEXT: korw %k6, %k1, %k1 11548; AVX512BW-NEXT: kandw %k4, %k1, %k1 11549; AVX512BW-NEXT: kshiftrw $13, %k0, %k0 11550; AVX512BW-NEXT: korw %k0, %k1, %k0 11551; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 11552; AVX512BW-NEXT: kandw %k1, %k0, %k1 11553; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload 11554; AVX512BW-NEXT: kshiftrq $21, %k7, %k0 11555; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 11556; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 11557; AVX512BW-NEXT: korw %k6, %k1, %k1 11558; AVX512BW-NEXT: kandw %k3, %k1, %k1 11559; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 11560; AVX512BW-NEXT: korw %k6, %k1, %k1 11561; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 11562; AVX512BW-NEXT: kandw %k3, %k1, %k1 11563; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 11564; AVX512BW-NEXT: korw %k6, %k1, %k1 11565; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 11566; AVX512BW-NEXT: kandw %k3, %k1, %k1 11567; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 11568; AVX512BW-NEXT: korw %k6, %k1, %k1 11569; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 11570; AVX512BW-NEXT: kandw %k4, %k1, %k1 11571; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 11572; AVX512BW-NEXT: korw %k6, %k1, %k1 11573; AVX512BW-NEXT: kandw %k2, %k1, %k1 11574; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 11575; AVX512BW-NEXT: korw %k6, %k1, %k1 11576; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 11577; AVX512BW-NEXT: kandw %k2, %k1, %k1 11578; AVX512BW-NEXT: kshiftrw $6, %k0, %k0 11579; AVX512BW-NEXT: korw %k0, %k1, %k0 11580; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 11581; AVX512BW-NEXT: kandw %k1, %k0, %k6 11582; AVX512BW-NEXT: kshiftrq $22, %k7, %k0 11583; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 11584; AVX512BW-NEXT: kshiftrw $5, %k1, %k7 11585; AVX512BW-NEXT: korw %k7, %k6, %k6 11586; 
AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 11587; AVX512BW-NEXT: kandw %k3, %k6, %k6 11588; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 11589; AVX512BW-NEXT: korw %k7, %k6, %k6 11590; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 11591; AVX512BW-NEXT: kandw %k3, %k6, %k6 11592; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 11593; AVX512BW-NEXT: korw %k7, %k6, %k6 11594; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 11595; AVX512BW-NEXT: kandw %k3, %k6, %k6 11596; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 11597; AVX512BW-NEXT: korw %k7, %k6, %k6 11598; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 11599; AVX512BW-NEXT: kandw %k3, %k6, %k6 11600; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 11601; AVX512BW-NEXT: korw %k7, %k6, %k6 11602; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 11603; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 11604; AVX512BW-NEXT: korw %k1, %k6, %k1 11605; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k1} {z} 11606; AVX512BW-NEXT: kandw %k5, %k0, %k1 11607; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload 11608; AVX512BW-NEXT: kshiftrq $23, %k7, %k0 11609; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 11610; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 11611; AVX512BW-NEXT: korw %k6, %k1, %k1 11612; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 11613; AVX512BW-NEXT: kandw %k3, %k1, %k1 11614; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 11615; AVX512BW-NEXT: korw %k6, %k1, %k1 11616; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 11617; AVX512BW-NEXT: kandw %k3, %k1, %k1 11618; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 11619; AVX512BW-NEXT: korw %k6, %k1, %k1 11620; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 11621; AVX512BW-NEXT: kandw %k6, %k1, %k1 11622; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 11623; AVX512BW-NEXT: korw %k6, %k1, %k1 11624; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 11625; AVX512BW-NEXT: kandw %k6, %k1, %k1 11626; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 11627; AVX512BW-NEXT: korw %k6, %k1, %k1 11628; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 11629; AVX512BW-NEXT: kandw %k6, %k1, %k1 11630; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 11631; AVX512BW-NEXT: korw %k6, %k1, %k1 11632; AVX512BW-NEXT: kandw %k4, %k1, %k1 11633; AVX512BW-NEXT: kshiftrw $8, %k0, %k0 11634; AVX512BW-NEXT: korw %k0, %k1, %k0 11635; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 11636; AVX512BW-NEXT: kandw %k1, %k0, %k1 11637; AVX512BW-NEXT: kshiftrq $24, %k7, %k0 11638; AVX512BW-NEXT: kshiftlw $15, %k0, %k6 11639; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 11640; AVX512BW-NEXT: korw %k7, %k1, %k1 11641; AVX512BW-NEXT: kandw %k2, %k1, %k1 11642; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 11643; AVX512BW-NEXT: korw %k7, %k1, %k1 11644; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 11645; AVX512BW-NEXT: kandw %k2, %k1, %k1 11646; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 11647; AVX512BW-NEXT: korw %k7, %k1, %k1 11648; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 11649; AVX512BW-NEXT: kandw %k4, %k1, %k1 11650; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 11651; AVX512BW-NEXT: korw %k7, %k1, %k1 11652; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 11653; AVX512BW-NEXT: kandw %k4, %k1, %k1 11654; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 11655; AVX512BW-NEXT: korw %k7, %k1, %k1 11656; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 11657; AVX512BW-NEXT: kandw %k4, %k1, %k1 
11658; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 11659; AVX512BW-NEXT: korw %k6, %k1, %k1 11660; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 11661; AVX512BW-NEXT: kandw %k6, %k1, %k1 11662; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 11663; AVX512BW-NEXT: korw %k0, %k1, %k0 11664; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 11665; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 11666; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 8-byte Reload 11667; AVX512BW-NEXT: kshiftrq $25, %k2, %k1 11668; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 11669; AVX512BW-NEXT: korw %k6, %k0, %k7 11670; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k7} {z} 11671; AVX512BW-NEXT: kandw %k5, %k1, %k0 11672; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 11673; AVX512BW-NEXT: korw %k1, %k0, %k0 11674; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 11675; AVX512BW-NEXT: kandw %k1, %k0, %k0 11676; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 11677; AVX512BW-NEXT: korw %k1, %k0, %k0 11678; AVX512BW-NEXT: kandw %k3, %k0, %k0 11679; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 11680; AVX512BW-NEXT: korw %k1, %k0, %k0 11681; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 11682; AVX512BW-NEXT: kandw %k1, %k0, %k0 11683; AVX512BW-NEXT: kshiftrw $11, %k6, %k1 11684; AVX512BW-NEXT: korw %k1, %k0, %k0 11685; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 11686; AVX512BW-NEXT: kandw %k3, %k0, %k0 11687; AVX512BW-NEXT: kshiftrw $10, %k6, %k1 11688; AVX512BW-NEXT: korw %k1, %k0, %k0 11689; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 11690; AVX512BW-NEXT: kandw %k1, %k0, %k1 11691; AVX512BW-NEXT: kmovq %k2, %k7 11692; AVX512BW-NEXT: kshiftrq $26, %k2, %k0 11693; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 11694; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 11695; AVX512BW-NEXT: korw %k6, %k1, %k1 11696; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 11697; AVX512BW-NEXT: kandw %k5, %k1, %k1 11698; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 11699; AVX512BW-NEXT: korw %k6, %k1, %k1 11700; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 11701; AVX512BW-NEXT: kandw %k5, %k1, %k1 11702; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 11703; AVX512BW-NEXT: korw %k6, %k1, %k1 11704; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 11705; AVX512BW-NEXT: kandw %k5, %k1, %k1 11706; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 11707; AVX512BW-NEXT: korw %k6, %k1, %k1 11708; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 11709; AVX512BW-NEXT: kandw %k2, %k1, %k1 11710; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 11711; AVX512BW-NEXT: korw %k6, %k1, %k1 11712; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 11713; AVX512BW-NEXT: kandw %k2, %k1, %k1 11714; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 11715; AVX512BW-NEXT: korw %k6, %k1, %k1 11716; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 11717; AVX512BW-NEXT: kandw %k5, %k1, %k1 11718; AVX512BW-NEXT: kshiftrw $3, %k0, %k0 11719; AVX512BW-NEXT: korw %k0, %k1, %k0 11720; AVX512BW-NEXT: kandw %k4, %k0, %k1 11721; AVX512BW-NEXT: kshiftrq $27, %k7, %k6 11722; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 11723; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 11724; AVX512BW-NEXT: korw %k7, %k1, %k1 11725; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 11726; AVX512BW-NEXT: kandw %k4, %k1, %k1 11727; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 11728; AVX512BW-NEXT: korw %k7, %k1, %k1 11729; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 11730; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 11731; 
AVX512BW-NEXT: korw %k0, %k1, %k1 11732; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k1} {z} 11733; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 11734; AVX512BW-NEXT: kandw %k1, %k6, %k1 11735; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 11736; AVX512BW-NEXT: korw %k6, %k1, %k1 11737; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 11738; AVX512BW-NEXT: kandw %k4, %k1, %k1 11739; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 11740; AVX512BW-NEXT: korw %k6, %k1, %k1 11741; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 11742; AVX512BW-NEXT: kandw %k4, %k1, %k1 11743; AVX512BW-NEXT: kshiftrw $12, %k0, %k0 11744; AVX512BW-NEXT: korw %k0, %k1, %k0 11745; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 11746; AVX512BW-NEXT: kandw %k4, %k0, %k1 11747; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload 11748; AVX512BW-NEXT: kshiftrq $28, %k7, %k0 11749; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 11750; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 11751; AVX512BW-NEXT: korw %k6, %k1, %k1 11752; AVX512BW-NEXT: kandw %k3, %k1, %k1 11753; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 11754; AVX512BW-NEXT: korw %k6, %k1, %k1 11755; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 11756; AVX512BW-NEXT: kandw %k3, %k1, %k1 11757; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 11758; AVX512BW-NEXT: korw %k6, %k1, %k1 11759; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 11760; AVX512BW-NEXT: kandw %k3, %k1, %k1 11761; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 11762; AVX512BW-NEXT: korw %k6, %k1, %k1 11763; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 11764; AVX512BW-NEXT: kandw %k3, %k1, %k1 11765; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 11766; AVX512BW-NEXT: korw %k6, %k1, %k1 11767; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 11768; AVX512BW-NEXT: kandw %k3, %k1, %k1 11769; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 11770; AVX512BW-NEXT: korw %k6, %k1, %k1 11771; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 11772; AVX512BW-NEXT: kandw %k3, %k1, %k1 11773; AVX512BW-NEXT: kshiftrw $5, %k0, %k0 11774; AVX512BW-NEXT: korw %k0, %k1, %k0 11775; AVX512BW-NEXT: kandw %k2, %k0, %k6 11776; AVX512BW-NEXT: kshiftrq $29, %k7, %k0 11777; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 11778; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 11779; AVX512BW-NEXT: korw %k7, %k6, %k6 11780; AVX512BW-NEXT: kandw %k5, %k6, %k6 11781; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 11782; AVX512BW-NEXT: korw %k7, %k6, %k6 11783; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 11784; AVX512BW-NEXT: kandw %k3, %k6, %k6 11785; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 11786; AVX512BW-NEXT: korw %k7, %k6, %k6 11787; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 11788; AVX512BW-NEXT: kandw %k2, %k6, %k6 11789; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 11790; AVX512BW-NEXT: korw %k7, %k6, %k6 11791; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 11792; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 11793; AVX512BW-NEXT: korw %k1, %k6, %k6 11794; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k6} {z} 11795; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 11796; AVX512BW-NEXT: kandw %k2, %k0, %k0 11797; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 11798; AVX512BW-NEXT: korw %k1, %k0, %k0 11799; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 11800; AVX512BW-NEXT: kandw %k1, %k0, %k1 11801; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload 11802; AVX512BW-NEXT: 
kshiftrq $30, %k5, %k0 11803; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 11804; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 11805; AVX512BW-NEXT: korw %k6, %k1, %k1 11806; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 11807; AVX512BW-NEXT: kandw %k6, %k1, %k1 11808; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 11809; AVX512BW-NEXT: korw %k6, %k1, %k1 11810; AVX512BW-NEXT: kandw %k4, %k1, %k1 11811; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 11812; AVX512BW-NEXT: korw %k6, %k1, %k1 11813; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 11814; AVX512BW-NEXT: kandw %k4, %k1, %k1 11815; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 11816; AVX512BW-NEXT: korw %k6, %k1, %k1 11817; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 11818; AVX512BW-NEXT: kandw %k4, %k1, %k1 11819; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 11820; AVX512BW-NEXT: korw %k6, %k1, %k1 11821; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 11822; AVX512BW-NEXT: kandw %k6, %k1, %k1 11823; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 11824; AVX512BW-NEXT: korw %k6, %k1, %k1 11825; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 11826; AVX512BW-NEXT: kandw %k6, %k1, %k1 11827; AVX512BW-NEXT: kshiftrw $7, %k0, %k0 11828; AVX512BW-NEXT: korw %k0, %k1, %k0 11829; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 11830; AVX512BW-NEXT: kandw %k1, %k0, %k6 11831; AVX512BW-NEXT: kshiftrq $31, %k5, %k1 11832; AVX512BW-NEXT: kshiftlw $15, %k1, %k0 11833; AVX512BW-NEXT: kshiftrw $6, %k0, %k7 11834; AVX512BW-NEXT: korw %k7, %k6, %k6 11835; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 11836; AVX512BW-NEXT: kandw %k7, %k6, %k6 11837; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 11838; AVX512BW-NEXT: korw %k7, %k6, %k6 11839; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 11840; AVX512BW-NEXT: kandw %k7, %k6, %k6 11841; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 11842; AVX512BW-NEXT: korw %k7, %k6, %k6 11843; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 11844; AVX512BW-NEXT: kandw %k7, %k6, %k6 11845; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 11846; AVX512BW-NEXT: korw %k7, %k6, %k6 11847; AVX512BW-NEXT: kandw %k3, %k6, %k6 11848; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 11849; AVX512BW-NEXT: korw %k7, %k6, %k6 11850; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 11851; AVX512BW-NEXT: kandw %k3, %k6, %k6 11852; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 11853; AVX512BW-NEXT: korw %k1, %k6, %k1 11854; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 11855; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 11856; AVX512BW-NEXT: korw %k0, %k1, %k1 11857; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k1} {z} 11858; AVX512BW-NEXT: kshiftrq $32, %k5, %k0 11859; AVX512BW-NEXT: kandw %k2, %k0, %k1 11860; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 11861; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 11862; AVX512BW-NEXT: korw %k6, %k1, %k1 11863; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 11864; AVX512BW-NEXT: kandw %k2, %k1, %k1 11865; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 11866; AVX512BW-NEXT: korw %k6, %k1, %k1 11867; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 11868; AVX512BW-NEXT: kandw %k6, %k1, %k1 11869; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 11870; AVX512BW-NEXT: korw %k6, %k1, %k1 11871; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 11872; AVX512BW-NEXT: kandw %k6, %k1, %k1 11873; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 11874; AVX512BW-NEXT: korw %k6, %k1, %k1 11875; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 11876; AVX512BW-NEXT: kandw %k6, %k1, %k1 11877; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 11878; AVX512BW-NEXT: korw %k6, %k1, %k1 11879; AVX512BW-NEXT: kandw %k4, %k1, %k1 11880; AVX512BW-NEXT: kshiftrw $9, %k0, %k0 11881; AVX512BW-NEXT: korw %k0, %k1, %k0 11882; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 11883; AVX512BW-NEXT: kandw %k1, %k0, %k1 11884; AVX512BW-NEXT: kshiftrq $33, %k5, %k0 11885; AVX512BW-NEXT: kmovq %k5, %k7 11886; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 11887; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 11888; AVX512BW-NEXT: korw %k6, %k1, %k1 11889; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 11890; AVX512BW-NEXT: kandw %k4, %k1, %k1 11891; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 11892; AVX512BW-NEXT: korw %k6, %k1, %k1 11893; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 11894; AVX512BW-NEXT: kandw %k5, %k1, %k1 11895; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 11896; AVX512BW-NEXT: korw %k6, %k1, %k1 11897; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 11898; AVX512BW-NEXT: kandw %k4, %k1, %k1 11899; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 11900; AVX512BW-NEXT: korw %k6, %k1, %k1 11901; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 11902; AVX512BW-NEXT: kandw %k4, %k1, %k1 11903; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 11904; AVX512BW-NEXT: korw %k6, %k1, %k1 11905; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 11906; AVX512BW-NEXT: kandw %k4, %k1, %k1 11907; AVX512BW-NEXT: kshiftrw $3, %k0, %k6 11908; AVX512BW-NEXT: korw %k6, %k1, %k1 11909; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 11910; AVX512BW-NEXT: kandw %k4, %k1, %k1 11911; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 11912; AVX512BW-NEXT: korw %k0, %k1, %k0 11913; AVX512BW-NEXT: kandw %k3, %k0, %k0 11914; AVX512BW-NEXT: kmovq %k7, %k3 11915; AVX512BW-NEXT: kshiftrq $34, %k7, %k1 11916; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 11917; AVX512BW-NEXT: korw %k6, %k0, %k0 11918; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 11919; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 11920; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 11921; AVX512BW-NEXT: korw %k6, %k0, %k7 11922; AVX512BW-NEXT: vmovdqa32 896(%rsi), %zmm14 {%k7} {z} 11923; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 11924; AVX512BW-NEXT: kandw %k0, %k1, %k0 11925; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 11926; AVX512BW-NEXT: korw %k1, %k0, %k0 11927; AVX512BW-NEXT: kandw %k2, %k0, %k0 11928; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 11929; AVX512BW-NEXT: korw %k1, %k0, %k0 11930; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 11931; AVX512BW-NEXT: kandw %k2, %k0, %k0 11932; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 11933; AVX512BW-NEXT: korw %k1, %k0, %k0 11934; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 11935; AVX512BW-NEXT: kandw %k1, %k0, %k0 11936; AVX512BW-NEXT: kshiftrw $11, %k6, %k1 11937; AVX512BW-NEXT: korw %k1, %k0, %k0 11938; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 11939; AVX512BW-NEXT: kandw %k1, %k0, %k1 11940; AVX512BW-NEXT: kshiftrq $35, %k3, %k0 11941; AVX512BW-NEXT: kmovq %k3, %k7 11942; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 11943; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 11944; AVX512BW-NEXT: korw %k6, %k1, %k1 11945; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 11946; AVX512BW-NEXT: kandw %k3, %k1, %k1 11947; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 11948; AVX512BW-NEXT: korw %k6, %k1, %k1 11949; 
AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 11950; AVX512BW-NEXT: kandw %k4, %k1, %k1 11951; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 11952; AVX512BW-NEXT: korw %k6, %k1, %k1 11953; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 11954; AVX512BW-NEXT: kandw %k3, %k1, %k1 11955; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 11956; AVX512BW-NEXT: korw %k6, %k1, %k1 11957; AVX512BW-NEXT: kandw %k5, %k1, %k1 11958; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 11959; AVX512BW-NEXT: korw %k6, %k1, %k1 11960; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 11961; AVX512BW-NEXT: kandw %k5, %k1, %k1 11962; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 11963; AVX512BW-NEXT: korw %k6, %k1, %k1 11964; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 11965; AVX512BW-NEXT: kandw %k6, %k1, %k1 11966; AVX512BW-NEXT: kshiftrw $4, %k0, %k0 11967; AVX512BW-NEXT: korw %k0, %k1, %k0 11968; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 11969; AVX512BW-NEXT: kandw %k1, %k0, %k1 11970; AVX512BW-NEXT: kshiftrq $36, %k7, %k6 11971; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 11972; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 11973; AVX512BW-NEXT: korw %k7, %k1, %k1 11974; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 11975; AVX512BW-NEXT: kandw %k7, %k1, %k1 11976; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 11977; AVX512BW-NEXT: korw %k7, %k1, %k1 11978; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 11979; AVX512BW-NEXT: kandw %k7, %k1, %k1 11980; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 11981; AVX512BW-NEXT: korw %k7, %k1, %k1 11982; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 11983; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 11984; AVX512BW-NEXT: korw %k0, %k1, %k1 11985; AVX512BW-NEXT: vmovdqa32 960(%rsi), %zmm15 {%k1} {z} 11986; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 11987; AVX512BW-NEXT: kandw %k1, %k6, %k1 11988; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 11989; AVX512BW-NEXT: korw %k6, %k1, %k1 11990; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 11991; AVX512BW-NEXT: kandw %k6, %k1, %k1 11992; AVX512BW-NEXT: kshiftrw $13, %k0, %k0 11993; AVX512BW-NEXT: korw %k0, %k1, %k0 11994; AVX512BW-NEXT: kandw %k2, %k0, %k1 11995; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload 11996; AVX512BW-NEXT: kshiftrq $37, %k7, %k0 11997; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 11998; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 11999; AVX512BW-NEXT: korw %k6, %k1, %k1 12000; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 12001; AVX512BW-NEXT: kandw %k2, %k1, %k1 12002; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 12003; AVX512BW-NEXT: korw %k6, %k1, %k1 12004; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 12005; AVX512BW-NEXT: kandw %k6, %k1, %k1 12006; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 12007; AVX512BW-NEXT: korw %k6, %k1, %k1 12008; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 12009; AVX512BW-NEXT: kandw %k6, %k1, %k1 12010; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 12011; AVX512BW-NEXT: korw %k6, %k1, %k1 12012; AVX512BW-NEXT: kandw %k4, %k1, %k1 12013; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 12014; AVX512BW-NEXT: korw %k6, %k1, %k1 12015; AVX512BW-NEXT: kandw %k3, %k1, %k1 12016; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 12017; AVX512BW-NEXT: korw %k6, %k1, %k1 12018; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 12019; AVX512BW-NEXT: kandw %k3, %k1, %k1 12020; AVX512BW-NEXT: kshiftrw $6, %k0, %k0 12021; AVX512BW-NEXT: korw %k0, 
%k1, %k0 12022; AVX512BW-NEXT: kandw %k5, %k0, %k6 12023; AVX512BW-NEXT: kshiftrq $38, %k7, %k0 12024; AVX512BW-NEXT: kmovq %k7, %k5 12025; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 12026; AVX512BW-NEXT: kshiftrw $5, %k1, %k7 12027; AVX512BW-NEXT: korw %k7, %k6, %k6 12028; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 12029; AVX512BW-NEXT: kandw %k3, %k6, %k6 12030; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 12031; AVX512BW-NEXT: korw %k7, %k6, %k6 12032; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 12033; AVX512BW-NEXT: kandw %k4, %k6, %k6 12034; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 12035; AVX512BW-NEXT: korw %k7, %k6, %k6 12036; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 12037; AVX512BW-NEXT: kandw %k3, %k6, %k6 12038; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 12039; AVX512BW-NEXT: korw %k7, %k6, %k6 12040; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 12041; AVX512BW-NEXT: kandw %k3, %k6, %k6 12042; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 12043; AVX512BW-NEXT: korw %k7, %k6, %k6 12044; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 12045; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 12046; AVX512BW-NEXT: korw %k1, %k6, %k1 12047; AVX512BW-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k1} {z} 12048; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 12049; AVX512BW-NEXT: kandw %k1, %k0, %k1 12050; AVX512BW-NEXT: kmovq %k5, %k7 12051; AVX512BW-NEXT: kshiftrq $39, %k5, %k0 12052; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 12053; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 12054; AVX512BW-NEXT: korw %k6, %k1, %k1 12055; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 12056; AVX512BW-NEXT: kandw %k3, %k1, %k1 12057; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 12058; AVX512BW-NEXT: korw %k6, %k1, %k1 12059; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 12060; AVX512BW-NEXT: kandw %k3, %k1, %k1 12061; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 12062; AVX512BW-NEXT: korw %k6, %k1, %k1 12063; AVX512BW-NEXT: kandw %k2, %k1, %k1 12064; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 12065; AVX512BW-NEXT: korw %k6, %k1, %k1 12066; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 12067; AVX512BW-NEXT: kandw %k3, %k1, %k1 12068; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 12069; AVX512BW-NEXT: korw %k6, %k1, %k1 12070; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 12071; AVX512BW-NEXT: kandw %k2, %k1, %k1 12072; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 12073; AVX512BW-NEXT: korw %k6, %k1, %k1 12074; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 12075; AVX512BW-NEXT: kandw %k5, %k1, %k1 12076; AVX512BW-NEXT: kshiftrw $8, %k0, %k0 12077; AVX512BW-NEXT: korw %k0, %k1, %k0 12078; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 12079; AVX512BW-NEXT: kandw %k1, %k0, %k1 12080; AVX512BW-NEXT: kshiftrq $40, %k7, %k0 12081; AVX512BW-NEXT: kshiftlw $15, %k0, %k6 12082; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 12083; AVX512BW-NEXT: korw %k7, %k1, %k1 12084; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 12085; AVX512BW-NEXT: kandw %k2, %k1, %k1 12086; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 12087; AVX512BW-NEXT: korw %k7, %k1, %k1 12088; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 12089; AVX512BW-NEXT: kandw %k2, %k1, %k1 12090; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 12091; AVX512BW-NEXT: korw %k7, %k1, %k1 12092; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 12093; AVX512BW-NEXT: kandw %k2, %k1, %k1 12094; AVX512BW-NEXT: 
kshiftrw $4, %k6, %k7 12095; AVX512BW-NEXT: korw %k7, %k1, %k1 12096; AVX512BW-NEXT: kandw %k4, %k1, %k1 12097; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 12098; AVX512BW-NEXT: korw %k7, %k1, %k1 12099; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 12100; AVX512BW-NEXT: kandw %k2, %k1, %k1 12101; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 12102; AVX512BW-NEXT: korw %k6, %k1, %k1 12103; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 12104; AVX512BW-NEXT: kandw %k4, %k1, %k1 12105; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 12106; AVX512BW-NEXT: korw %k0, %k1, %k0 12107; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 12108; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 12109; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 8-byte Reload 12110; AVX512BW-NEXT: kshiftrq $41, %k4, %k1 12111; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 12112; AVX512BW-NEXT: korw %k6, %k0, %k7 12113; AVX512BW-NEXT: vmovdqa32 1088(%rsi), %zmm17 {%k7} {z} 12114; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 12115; AVX512BW-NEXT: kandw %k0, %k1, %k0 12116; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 12117; AVX512BW-NEXT: korw %k1, %k0, %k0 12118; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 12119; AVX512BW-NEXT: kandw %k1, %k0, %k0 12120; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 12121; AVX512BW-NEXT: korw %k1, %k0, %k0 12122; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 12123; AVX512BW-NEXT: kandw %k1, %k0, %k0 12124; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 12125; AVX512BW-NEXT: korw %k1, %k0, %k0 12126; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 12127; AVX512BW-NEXT: kandw %k1, %k0, %k0 12128; AVX512BW-NEXT: kshiftrw $11, %k6, %k1 12129; AVX512BW-NEXT: korw %k1, %k0, %k0 12130; AVX512BW-NEXT: kandw %k3, %k0, %k0 12131; AVX512BW-NEXT: kshiftrw $10, %k6, %k1 12132; AVX512BW-NEXT: korw %k1, %k0, %k0 12133; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 12134; AVX512BW-NEXT: kandw %k1, %k0, %k1 12135; AVX512BW-NEXT: kshiftrq $42, %k4, %k0 12136; AVX512BW-NEXT: kmovq %k4, %k3 12137; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 12138; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 12139; AVX512BW-NEXT: korw %k6, %k1, %k1 12140; AVX512BW-NEXT: kandw %k5, %k1, %k1 12141; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 12142; AVX512BW-NEXT: korw %k6, %k1, %k1 12143; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 12144; AVX512BW-NEXT: kandw %k4, %k1, %k1 12145; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 12146; AVX512BW-NEXT: korw %k6, %k1, %k1 12147; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 12148; AVX512BW-NEXT: kandw %k4, %k1, %k1 12149; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 12150; AVX512BW-NEXT: korw %k6, %k1, %k1 12151; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 12152; AVX512BW-NEXT: kandw %k4, %k1, %k1 12153; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 12154; AVX512BW-NEXT: korw %k6, %k1, %k1 12155; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 12156; AVX512BW-NEXT: kandw %k4, %k1, %k1 12157; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 12158; AVX512BW-NEXT: korw %k6, %k1, %k1 12159; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 12160; AVX512BW-NEXT: kandw %k5, %k1, %k1 12161; AVX512BW-NEXT: kshiftrw $3, %k0, %k0 12162; AVX512BW-NEXT: korw %k0, %k1, %k0 12163; AVX512BW-NEXT: kandw %k2, %k0, %k1 12164; AVX512BW-NEXT: kshiftrq $43, %k3, %k6 12165; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 12166; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 12167; AVX512BW-NEXT: korw %k7, 
%k1, %k1 12168; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 12169; AVX512BW-NEXT: kandw %k2, %k1, %k1 12170; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 12171; AVX512BW-NEXT: korw %k7, %k1, %k1 12172; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 12173; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 12174; AVX512BW-NEXT: korw %k0, %k1, %k1 12175; AVX512BW-NEXT: vmovdqa32 1152(%rsi), %zmm18 {%k1} {z} 12176; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 12177; AVX512BW-NEXT: kandw %k5, %k6, %k1 12178; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 12179; AVX512BW-NEXT: korw %k6, %k1, %k1 12180; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 12181; AVX512BW-NEXT: kandw %k2, %k1, %k1 12182; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 12183; AVX512BW-NEXT: korw %k6, %k1, %k1 12184; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 12185; AVX512BW-NEXT: kandw %k2, %k1, %k1 12186; AVX512BW-NEXT: kshiftrw $12, %k0, %k0 12187; AVX512BW-NEXT: korw %k0, %k1, %k0 12188; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 12189; AVX512BW-NEXT: kandw %k1, %k0, %k1 12190; AVX512BW-NEXT: kmovq %k3, %k7 12191; AVX512BW-NEXT: kshiftrq $44, %k3, %k0 12192; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 12193; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 12194; AVX512BW-NEXT: korw %k6, %k1, %k1 12195; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 12196; AVX512BW-NEXT: kandw %k2, %k1, %k1 12197; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 12198; AVX512BW-NEXT: korw %k6, %k1, %k1 12199; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 12200; AVX512BW-NEXT: kandw %k3, %k1, %k1 12201; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 12202; AVX512BW-NEXT: korw %k6, %k1, %k1 12203; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 12204; AVX512BW-NEXT: kandw %k2, %k1, %k1 12205; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 12206; AVX512BW-NEXT: korw %k6, %k1, %k1 12207; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 12208; AVX512BW-NEXT: kandw %k2, %k1, %k1 12209; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 12210; AVX512BW-NEXT: korw %k6, %k1, %k1 12211; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 12212; AVX512BW-NEXT: kandw %k2, %k1, %k1 12213; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 12214; AVX512BW-NEXT: korw %k6, %k1, %k1 12215; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 12216; AVX512BW-NEXT: kandw %k6, %k1, %k1 12217; AVX512BW-NEXT: kshiftrw $5, %k0, %k0 12218; AVX512BW-NEXT: korw %k0, %k1, %k0 12219; AVX512BW-NEXT: kandw %k4, %k0, %k6 12220; AVX512BW-NEXT: kshiftrq $45, %k7, %k0 12221; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 12222; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 12223; AVX512BW-NEXT: korw %k7, %k6, %k6 12224; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 12225; AVX512BW-NEXT: kandw %k4, %k6, %k6 12226; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 12227; AVX512BW-NEXT: korw %k7, %k6, %k6 12228; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 12229; AVX512BW-NEXT: kandw %k7, %k6, %k6 12230; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 12231; AVX512BW-NEXT: korw %k7, %k6, %k6 12232; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 12233; AVX512BW-NEXT: kandw %k7, %k6, %k6 12234; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 12235; AVX512BW-NEXT: korw %k7, %k6, %k6 12236; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 12237; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 12238; AVX512BW-NEXT: korw %k1, %k6, %k6 12239; AVX512BW-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k6} {z} 
12240; AVX512BW-NEXT: kandw %k5, %k0, %k0 12241; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 12242; AVX512BW-NEXT: korw %k1, %k0, %k0 12243; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 12244; AVX512BW-NEXT: kandw %k1, %k0, %k1 12245; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload 12246; AVX512BW-NEXT: kshiftrq $46, %k5, %k0 12247; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 12248; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 12249; AVX512BW-NEXT: korw %k6, %k1, %k1 12250; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 12251; AVX512BW-NEXT: kandw %k6, %k1, %k1 12252; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 12253; AVX512BW-NEXT: korw %k6, %k1, %k1 12254; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 12255; AVX512BW-NEXT: kandw %k6, %k1, %k1 12256; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 12257; AVX512BW-NEXT: korw %k6, %k1, %k1 12258; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 12259; AVX512BW-NEXT: kandw %k6, %k1, %k1 12260; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 12261; AVX512BW-NEXT: korw %k6, %k1, %k1 12262; AVX512BW-NEXT: kandw %k3, %k1, %k1 12263; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 12264; AVX512BW-NEXT: korw %k6, %k1, %k1 12265; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 12266; AVX512BW-NEXT: kandw %k3, %k1, %k1 12267; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 12268; AVX512BW-NEXT: korw %k6, %k1, %k1 12269; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 12270; AVX512BW-NEXT: kandw %k3, %k1, %k1 12271; AVX512BW-NEXT: kshiftrw $7, %k0, %k0 12272; AVX512BW-NEXT: korw %k0, %k1, %k0 12273; AVX512BW-NEXT: kandw %k2, %k0, %k6 12274; AVX512BW-NEXT: kshiftrq $47, %k5, %k1 12275; AVX512BW-NEXT: kshiftlw $15, %k1, %k0 12276; AVX512BW-NEXT: kshiftrw $6, %k0, %k7 12277; AVX512BW-NEXT: korw %k7, %k6, %k6 12278; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 12279; AVX512BW-NEXT: kandw %k3, %k6, %k6 12280; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 12281; AVX512BW-NEXT: korw %k7, %k6, %k6 12282; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 12283; AVX512BW-NEXT: kandw %k2, %k6, %k6 12284; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 12285; AVX512BW-NEXT: korw %k7, %k6, %k6 12286; AVX512BW-NEXT: kandw %k4, %k6, %k6 12287; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 12288; AVX512BW-NEXT: korw %k7, %k6, %k6 12289; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 12290; AVX512BW-NEXT: kandw %k2, %k6, %k6 12291; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 12292; AVX512BW-NEXT: korw %k7, %k6, %k6 12293; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 12294; AVX512BW-NEXT: kandw %k2, %k6, %k6 12295; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 12296; AVX512BW-NEXT: korw %k1, %k6, %k1 12297; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 12298; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 12299; AVX512BW-NEXT: korw %k0, %k1, %k1 12300; AVX512BW-NEXT: vmovdqa32 1280(%rsi), %zmm20 {%k1} {z} 12301; AVX512BW-NEXT: kshiftrq $48, %k5, %k0 12302; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 12303; AVX512BW-NEXT: kandw %k1, %k0, %k1 12304; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 12305; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 12306; AVX512BW-NEXT: korw %k6, %k1, %k1 12307; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 12308; AVX512BW-NEXT: kandw %k4, %k1, %k1 12309; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 12310; AVX512BW-NEXT: korw %k6, %k1, %k1 12311; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 12312; 
AVX512BW-NEXT: kandw %k6, %k1, %k1 12313; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 12314; AVX512BW-NEXT: korw %k6, %k1, %k1 12315; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 12316; AVX512BW-NEXT: kandw %k6, %k1, %k1 12317; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 12318; AVX512BW-NEXT: korw %k6, %k1, %k1 12319; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 12320; AVX512BW-NEXT: kandw %k6, %k1, %k1 12321; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 12322; AVX512BW-NEXT: korw %k6, %k1, %k1 12323; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 12324; AVX512BW-NEXT: kandw %k6, %k1, %k1 12325; AVX512BW-NEXT: kshiftrw $9, %k0, %k0 12326; AVX512BW-NEXT: korw %k0, %k1, %k0 12327; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 12328; AVX512BW-NEXT: kandw %k1, %k0, %k1 12329; AVX512BW-NEXT: kshiftrq $49, %k5, %k0 12330; AVX512BW-NEXT: kmovq %k5, %k7 12331; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 12332; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 12333; AVX512BW-NEXT: korw %k6, %k1, %k1 12334; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 12335; AVX512BW-NEXT: kandw %k5, %k1, %k1 12336; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 12337; AVX512BW-NEXT: korw %k6, %k1, %k1 12338; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 12339; AVX512BW-NEXT: kandw %k5, %k1, %k1 12340; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 12341; AVX512BW-NEXT: korw %k6, %k1, %k1 12342; AVX512BW-NEXT: kandw %k3, %k1, %k1 12343; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 12344; AVX512BW-NEXT: korw %k6, %k1, %k1 12345; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 12346; AVX512BW-NEXT: kandw %k3, %k1, %k1 12347; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 12348; AVX512BW-NEXT: korw %k6, %k1, %k1 12349; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 12350; AVX512BW-NEXT: kandw %k5, %k1, %k1 12351; AVX512BW-NEXT: kshiftrw $3, %k0, %k6 12352; AVX512BW-NEXT: korw %k6, %k1, %k1 12353; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 12354; AVX512BW-NEXT: kandw %k5, %k1, %k1 12355; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 12356; AVX512BW-NEXT: korw %k0, %k1, %k0 12357; AVX512BW-NEXT: kandw %k2, %k0, %k0 12358; AVX512BW-NEXT: kshiftrq $50, %k7, %k1 12359; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 12360; AVX512BW-NEXT: korw %k6, %k0, %k0 12361; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 12362; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 12363; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 12364; AVX512BW-NEXT: korw %k6, %k0, %k7 12365; AVX512BW-NEXT: vmovdqa32 1344(%rsi), %zmm21 {%k7} {z} 12366; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 12367; AVX512BW-NEXT: kandw %k0, %k1, %k0 12368; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 12369; AVX512BW-NEXT: korw %k1, %k0, %k0 12370; AVX512BW-NEXT: kandw %k4, %k0, %k0 12371; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 12372; AVX512BW-NEXT: korw %k1, %k0, %k0 12373; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 12374; AVX512BW-NEXT: kandw %k2, %k0, %k0 12375; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 12376; AVX512BW-NEXT: korw %k1, %k0, %k0 12377; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 12378; AVX512BW-NEXT: kandw %k5, %k0, %k0 12379; AVX512BW-NEXT: kshiftrw $11, %k6, %k1 12380; AVX512BW-NEXT: korw %k1, %k0, %k0 12381; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 12382; AVX512BW-NEXT: kandw %k1, %k0, %k1 12383; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload 12384; AVX512BW-NEXT: kshiftrq $51, 
%k7, %k0 12385; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 12386; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 12387; AVX512BW-NEXT: korw %k6, %k1, %k1 12388; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 12389; AVX512BW-NEXT: kandw %k4, %k1, %k1 12390; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 12391; AVX512BW-NEXT: korw %k6, %k1, %k1 12392; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 12393; AVX512BW-NEXT: kandw %k4, %k1, %k1 12394; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 12395; AVX512BW-NEXT: korw %k6, %k1, %k1 12396; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 12397; AVX512BW-NEXT: kandw %k4, %k1, %k1 12398; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 12399; AVX512BW-NEXT: korw %k6, %k1, %k1 12400; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 12401; AVX512BW-NEXT: kandw %k6, %k1, %k1 12402; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 12403; AVX512BW-NEXT: korw %k6, %k1, %k1 12404; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 12405; AVX512BW-NEXT: kandw %k6, %k1, %k1 12406; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 12407; AVX512BW-NEXT: korw %k6, %k1, %k1 12408; AVX512BW-NEXT: kandw %k3, %k1, %k1 12409; AVX512BW-NEXT: kshiftrw $4, %k0, %k0 12410; AVX512BW-NEXT: korw %k0, %k1, %k0 12411; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 12412; AVX512BW-NEXT: kandw %k1, %k0, %k1 12413; AVX512BW-NEXT: kshiftrq $52, %k7, %k6 12414; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 12415; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 12416; AVX512BW-NEXT: korw %k7, %k1, %k1 12417; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 12418; AVX512BW-NEXT: kandw %k3, %k1, %k1 12419; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 12420; AVX512BW-NEXT: korw %k7, %k1, %k1 12421; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 12422; AVX512BW-NEXT: kandw %k3, %k1, %k1 12423; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 12424; AVX512BW-NEXT: korw %k7, %k1, %k1 12425; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 12426; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 12427; AVX512BW-NEXT: korw %k0, %k1, %k1 12428; AVX512BW-NEXT: vmovdqa32 1408(%rsi), %zmm22 {%k1} {z} 12429; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 12430; AVX512BW-NEXT: kandw %k3, %k6, %k1 12431; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 12432; AVX512BW-NEXT: korw %k6, %k1, %k1 12433; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 12434; AVX512BW-NEXT: kandw %k6, %k1, %k1 12435; AVX512BW-NEXT: kshiftrw $13, %k0, %k0 12436; AVX512BW-NEXT: korw %k0, %k1, %k0 12437; AVX512BW-NEXT: kandw %k2, %k0, %k1 12438; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload 12439; AVX512BW-NEXT: kshiftrq $53, %k7, %k0 12440; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 12441; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 12442; AVX512BW-NEXT: korw %k6, %k1, %k1 12443; AVX512BW-NEXT: kandw %k5, %k1, %k1 12444; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 12445; AVX512BW-NEXT: korw %k6, %k1, %k1 12446; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 12447; AVX512BW-NEXT: kandw %k5, %k1, %k1 12448; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 12449; AVX512BW-NEXT: korw %k6, %k1, %k1 12450; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 12451; AVX512BW-NEXT: kandw %k2, %k1, %k1 12452; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 12453; AVX512BW-NEXT: korw %k6, %k1, %k1 12454; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 12455; AVX512BW-NEXT: kandw %k2, %k1, %k1 12456; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 12457; 
AVX512BW-NEXT: korw %k6, %k1, %k1 12458; AVX512BW-NEXT: kandw %k4, %k1, %k1 12459; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 12460; AVX512BW-NEXT: korw %k6, %k1, %k1 12461; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 12462; AVX512BW-NEXT: kandw %k4, %k1, %k1 12463; AVX512BW-NEXT: kshiftrw $6, %k0, %k0 12464; AVX512BW-NEXT: korw %k0, %k1, %k0 12465; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 12466; AVX512BW-NEXT: kandw %k1, %k0, %k6 12467; AVX512BW-NEXT: kshiftrq $54, %k7, %k0 12468; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 12469; AVX512BW-NEXT: kshiftrw $5, %k1, %k7 12470; AVX512BW-NEXT: korw %k7, %k6, %k6 12471; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 12472; AVX512BW-NEXT: kandw %k4, %k6, %k6 12473; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 12474; AVX512BW-NEXT: korw %k7, %k6, %k6 12475; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 12476; AVX512BW-NEXT: kandw %k4, %k6, %k6 12477; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 12478; AVX512BW-NEXT: korw %k7, %k6, %k6 12479; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 12480; AVX512BW-NEXT: kandw %k4, %k6, %k6 12481; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 12482; AVX512BW-NEXT: korw %k7, %k6, %k6 12483; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 12484; AVX512BW-NEXT: kandw %k7, %k6, %k6 12485; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 12486; AVX512BW-NEXT: korw %k7, %k6, %k6 12487; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 12488; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 12489; AVX512BW-NEXT: korw %k1, %k6, %k1 12490; AVX512BW-NEXT: vmovdqa32 1472(%rsi), %zmm23 {%k1} {z} 12491; AVX512BW-NEXT: kandw %k3, %k0, %k1 12492; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload 12493; AVX512BW-NEXT: kshiftrq $55, %k7, %k0 12494; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 12495; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 12496; AVX512BW-NEXT: korw %k6, %k1, %k1 12497; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 12498; AVX512BW-NEXT: kandw %k3, %k1, %k1 12499; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 12500; AVX512BW-NEXT: korw %k6, %k1, %k1 12501; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 12502; AVX512BW-NEXT: kandw %k3, %k1, %k1 12503; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 12504; AVX512BW-NEXT: korw %k6, %k1, %k1 12505; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 12506; AVX512BW-NEXT: kandw %k3, %k1, %k1 12507; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 12508; AVX512BW-NEXT: korw %k6, %k1, %k1 12509; AVX512BW-NEXT: kandw %k5, %k1, %k1 12510; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 12511; AVX512BW-NEXT: korw %k6, %k1, %k1 12512; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 12513; AVX512BW-NEXT: kandw %k5, %k1, %k1 12514; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 12515; AVX512BW-NEXT: korw %k6, %k1, %k1 12516; AVX512BW-NEXT: kandw %k2, %k1, %k1 12517; AVX512BW-NEXT: kshiftrw $8, %k0, %k0 12518; AVX512BW-NEXT: korw %k0, %k1, %k0 12519; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 12520; AVX512BW-NEXT: kandw %k1, %k0, %k1 12521; AVX512BW-NEXT: kshiftrq $56, %k7, %k0 12522; AVX512BW-NEXT: kshiftlw $15, %k0, %k6 12523; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 12524; AVX512BW-NEXT: korw %k7, %k1, %k1 12525; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 12526; AVX512BW-NEXT: kandw %k2, %k1, %k1 12527; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 12528; AVX512BW-NEXT: korw %k7, %k1, %k1 12529; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 
2-byte Reload 12530; AVX512BW-NEXT: kandw %k2, %k1, %k1 12531; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 12532; AVX512BW-NEXT: korw %k7, %k1, %k1 12533; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 12534; AVX512BW-NEXT: kandw %k2, %k1, %k1 12535; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 12536; AVX512BW-NEXT: korw %k7, %k1, %k1 12537; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 12538; AVX512BW-NEXT: kandw %k2, %k1, %k1 12539; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 12540; AVX512BW-NEXT: korw %k7, %k1, %k1 12541; AVX512BW-NEXT: kandw %k4, %k1, %k1 12542; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 12543; AVX512BW-NEXT: korw %k6, %k1, %k1 12544; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 12545; AVX512BW-NEXT: kandw %k5, %k1, %k1 12546; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 12547; AVX512BW-NEXT: korw %k0, %k1, %k0 12548; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 12549; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 12550; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 8-byte Reload 12551; AVX512BW-NEXT: kshiftrq $57, %k4, %k1 12552; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 12553; AVX512BW-NEXT: korw %k6, %k0, %k7 12554; AVX512BW-NEXT: vmovdqa32 1536(%rsi), %zmm24 {%k7} {z} 12555; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload 12556; AVX512BW-NEXT: kandw %k0, %k1, %k0 12557; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 12558; AVX512BW-NEXT: korw %k1, %k0, %k0 12559; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 12560; AVX512BW-NEXT: kandw %k1, %k0, %k0 12561; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 12562; AVX512BW-NEXT: korw %k1, %k0, %k0 12563; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 12564; AVX512BW-NEXT: kandw %k1, %k0, %k0 12565; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 12566; AVX512BW-NEXT: korw %k1, %k0, %k0 12567; AVX512BW-NEXT: kandw %k3, %k0, %k0 12568; AVX512BW-NEXT: kshiftrw $11, %k6, %k1 12569; AVX512BW-NEXT: korw %k1, %k0, %k0 12570; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 12571; AVX512BW-NEXT: kandw %k3, %k0, %k0 12572; AVX512BW-NEXT: kshiftrw $10, %k6, %k1 12573; AVX512BW-NEXT: korw %k1, %k0, %k0 12574; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 12575; AVX512BW-NEXT: kandw %k2, %k0, %k1 12576; AVX512BW-NEXT: kmovq %k4, %k7 12577; AVX512BW-NEXT: kshiftrq $58, %k4, %k0 12578; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 12579; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 12580; AVX512BW-NEXT: korw %k6, %k1, %k1 12581; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 12582; AVX512BW-NEXT: kandw %k4, %k1, %k1 12583; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 12584; AVX512BW-NEXT: korw %k6, %k1, %k1 12585; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 12586; AVX512BW-NEXT: kandw %k4, %k1, %k1 12587; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 12588; AVX512BW-NEXT: korw %k6, %k1, %k1 12589; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 12590; AVX512BW-NEXT: kandw %k4, %k1, %k1 12591; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 12592; AVX512BW-NEXT: korw %k6, %k1, %k1 12593; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 12594; AVX512BW-NEXT: kandw %k6, %k1, %k1 12595; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 12596; AVX512BW-NEXT: korw %k6, %k1, %k1 12597; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload 12598; AVX512BW-NEXT: kandw %k6, %k1, %k1 12599; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 12600; AVX512BW-NEXT: korw %k6, %k1, %k1 12601; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 
2-byte Reload 12602; AVX512BW-NEXT: kandw %k6, %k1, %k1 12603; AVX512BW-NEXT: kshiftrw $3, %k0, %k0 12604; AVX512BW-NEXT: korw %k0, %k1, %k0 12605; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 12606; AVX512BW-NEXT: kandw %k1, %k0, %k1 12607; AVX512BW-NEXT: kshiftrq $59, %k7, %k6 12608; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 12609; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 12610; AVX512BW-NEXT: korw %k7, %k1, %k1 12611; AVX512BW-NEXT: kandw %k5, %k1, %k1 12612; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 12613; AVX512BW-NEXT: korw %k7, %k1, %k1 12614; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 12615; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 12616; AVX512BW-NEXT: korw %k0, %k1, %k1 12617; AVX512BW-NEXT: vmovdqa32 1600(%rsi), %zmm25 {%k1} {z} 12618; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 12619; AVX512BW-NEXT: kandw %k1, %k6, %k1 12620; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 12621; AVX512BW-NEXT: korw %k6, %k1, %k1 12622; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 12623; AVX512BW-NEXT: kandw %k5, %k1, %k1 12624; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 12625; AVX512BW-NEXT: korw %k6, %k1, %k1 12626; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 12627; AVX512BW-NEXT: kandw %k5, %k1, %k1 12628; AVX512BW-NEXT: kshiftrw $12, %k0, %k0 12629; AVX512BW-NEXT: korw %k0, %k1, %k0 12630; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 12631; AVX512BW-NEXT: kandw %k1, %k0, %k1 12632; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload 12633; AVX512BW-NEXT: kshiftrq $60, %k5, %k0 12634; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 12635; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 12636; AVX512BW-NEXT: korw %k6, %k1, %k1 12637; AVX512BW-NEXT: kandw %k3, %k1, %k1 12638; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 12639; AVX512BW-NEXT: korw %k6, %k1, %k1 12640; AVX512BW-NEXT: kandw %k2, %k1, %k1 12641; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 12642; AVX512BW-NEXT: korw %k6, %k1, %k1 12643; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 12644; AVX512BW-NEXT: kandw %k2, %k1, %k1 12645; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 12646; AVX512BW-NEXT: korw %k6, %k1, %k1 12647; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 12648; AVX512BW-NEXT: kandw %k2, %k1, %k1 12649; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 12650; AVX512BW-NEXT: korw %k6, %k1, %k1 12651; AVX512BW-NEXT: kandw %k4, %k1, %k1 12652; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 12653; AVX512BW-NEXT: korw %k6, %k1, %k1 12654; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 12655; AVX512BW-NEXT: kandw %k3, %k1, %k1 12656; AVX512BW-NEXT: kshiftrw $5, %k0, %k0 12657; AVX512BW-NEXT: korw %k0, %k1, %k0 12658; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload 12659; AVX512BW-NEXT: kandw %k4, %k0, %k6 12660; AVX512BW-NEXT: kshiftrq $61, %k5, %k0 12661; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 12662; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 12663; AVX512BW-NEXT: korw %k7, %k6, %k6 12664; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 12665; AVX512BW-NEXT: kandw %k2, %k6, %k6 12666; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 12667; AVX512BW-NEXT: korw %k7, %k6, %k6 12668; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 12669; AVX512BW-NEXT: kandw %k7, %k6, %k6 12670; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 12671; AVX512BW-NEXT: korw %k7, %k6, %k6 12672; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload 12673; AVX512BW-NEXT: kandw %k7, %k6, %k6 12674; AVX512BW-NEXT: kshiftlw $14, 
%k0, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kshiftlw $1, %k6, %k6
; AVX512BW-NEXT: kshiftrw $1, %k6, %k6
; AVX512BW-NEXT: korw %k1, %k6, %k6
; AVX512BW-NEXT: vmovdqa32 1664(%rsi), %zmm26 {%k6} {z}
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $14, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kshiftrq $62, %k5, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kshiftrq $63, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kshiftlw $15, %k5, %k0
; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k0, %k4
; AVX512BW-NEXT: korw %k4, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k0, %k3
; AVX512BW-NEXT: korw %k3, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k5, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k0, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 1728(%rsi), %zmm27 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 %zmm27, 1728(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm26, 1664(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm25, 1600(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm24, 1536(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm23, 1472(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm22, 1408(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm21, 1344(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm20, 1280(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm19, 1216(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm18, 1152(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm17, 1088(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm16, 1024(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm15, 960(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm14, 896(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm13, 832(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm12, 768(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm11, 704(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm10, 640(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm9, 576(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm8, 512(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %src.mask = load <64 x i1>, ptr %in.maskvec, align 64
  %tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <448 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
  %data = call <448 x i32> @llvm.masked.load.v448i32.p0(ptr %in.vec, i32 64, <448 x i1> %tgt.mask, <448 x i32> poison)
  store <448 x i32> %data, ptr %out.vec, align 64
  ret void
}

define void @mask_replication_factor8_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor8_vf2:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor8_vf2:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor8_vf2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovw (%rdi), %k1
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <2 x i32> <i32 0, i32 1>
  %tgt.mask = shufflevector <2 x i1> %src.mask, <2 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %data = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr %in.vec, i32 64, <16 x i1> %tgt.mask, <16 x i32> poison)
  store <16 x i32> %data, ptr %out.vec, align 64
  ret void
}

define void @mask_replication_factor8_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor8_vf4:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k2
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor8_vf4:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z}
; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor8_vf4:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovd (%rdi), %k0
; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpmovw2m %zmm0, %k1
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %tgt.mask = shufflevector <4 x i1> %src.mask, <4 x i1> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  %data = call <32 x i32> @llvm.masked.load.v32i32.p0(ptr %in.vec, i32 64, <32 x i1> %tgt.mask, <32 x i32> poison)
  store <32 x i32> %data, ptr %out.vec, align 64
  ret void
}

define void @mask_replication_factor8_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor8_vf8:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0
{%k1} {z} = -1
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k4
; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor8_vf8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k4
; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z}
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z}
; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-ONLY-LABEL: mask_replication_factor8_vf8:
; AVX512BW-ONLY: # %bb.0:
; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0
; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-ONLY-NEXT: vpbroadcastq %xmm0, %zmm0
; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,36,36,36,36,36,36,36,36,37,37,37,37,37,37,37,37,54,54,54,54,54,54,54,54,55,55,55,55,55,55,55,55]
; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1
; AVX512BW-ONLY-NEXT: kshiftrq $16, %k1, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512BW-ONLY-NEXT: kshiftrq $48, %k1, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512BW-ONLY-NEXT: vzeroupper
; AVX512BW-ONLY-NEXT: retq
;
; AVX512VBMI-ONLY-LABEL: mask_replication_factor8_vf8:
; AVX512VBMI-ONLY: # %bb.0:
; AVX512VBMI-ONLY-NEXT: kmovq (%rdi), %k0
; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0
; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1
; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512VBMI-ONLY-NEXT: vzeroupper
; AVX512VBMI-ONLY-NEXT: retq
  %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
  %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %tgt.mask = shufflevector <8 x i1> %src.mask, <8 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  %data = call <64 x i32> @llvm.masked.load.v64i32.p0(ptr %in.vec, i32 64, <64 x i1> %tgt.mask, <64 x i32> poison)
  store <64 x i32> %data, ptr %out.vec, align 64
  ret void
}

define void @mask_replication_factor8_vf16(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor8_vf16:
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k4
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k5
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k6
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k7
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k7} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k6} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k5} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
; AVX512F-ONLY-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor8_vf16:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k4
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k5
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k6
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k7
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
; AVX512DQ-NEXT: vpermd %zmm0,
%zmm1, %zmm0 13061; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 13062; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} 13063; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k7} {z} 13064; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k6} {z} 13065; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k5} {z} 13066; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z} 13067; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z} 13068; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z} 13069; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 13070; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z} 13071; AVX512DQ-NEXT: vmovdqa64 %zmm7, 384(%rdx) 13072; AVX512DQ-NEXT: vmovdqa64 %zmm6, 448(%rdx) 13073; AVX512DQ-NEXT: vmovdqa64 %zmm5, 256(%rdx) 13074; AVX512DQ-NEXT: vmovdqa64 %zmm4, 320(%rdx) 13075; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rdx) 13076; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rdx) 13077; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx) 13078; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx) 13079; AVX512DQ-NEXT: vzeroupper 13080; AVX512DQ-NEXT: retq 13081; 13082; AVX512BW-LABEL: mask_replication_factor8_vf16: 13083; AVX512BW: # %bb.0: 13084; AVX512BW-NEXT: kmovw (%rdi), %k0 13085; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 13086; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] 13087; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm0[8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,26,26,26,26,26,26,26,26,27,27,27,27,27,27,27,27,44,44,44,44,44,44,44,44,45,45,45,45,45,45,45,45,62,62,62,62,62,62,62,62,63,63,63,63,63,63,63,63] 13088; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 13089; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,36,36,36,36,36,36,36,36,37,37,37,37,37,37,37,37,54,54,54,54,54,54,54,54,55,55,55,55,55,55,55,55] 13090; AVX512BW-NEXT: vpmovb2m %zmm0, %k2 13091; AVX512BW-NEXT: kshiftrq $16, %k2, %k3 13092; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z} 13093; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z} 13094; AVX512BW-NEXT: kshiftrq $48, %k2, %k3 13095; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z} 13096; AVX512BW-NEXT: kshiftrq $32, %k2, %k2 13097; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z} 13098; AVX512BW-NEXT: kshiftrq $16, %k1, %k2 13099; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z} 13100; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z} 13101; AVX512BW-NEXT: kshiftrq $48, %k1, %k2 13102; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z} 13103; AVX512BW-NEXT: kshiftrq $32, %k1, %k1 13104; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z} 13105; AVX512BW-NEXT: vmovdqa64 %zmm7, 384(%rdx) 13106; AVX512BW-NEXT: vmovdqa64 %zmm6, 448(%rdx) 13107; AVX512BW-NEXT: vmovdqa64 %zmm5, 256(%rdx) 13108; AVX512BW-NEXT: vmovdqa64 %zmm4, 320(%rdx) 13109; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rdx) 13110; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rdx) 13111; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx) 13112; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx) 13113; AVX512BW-NEXT: vzeroupper 13114; AVX512BW-NEXT: retq 13115 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 13116 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 13117 %tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <128 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, 
i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15> 13118 %data = call <128 x i32> @llvm.masked.load.v128i32.p0(ptr %in.vec, i32 64, <128 x i1> %tgt.mask, <128 x i32> poison) 13119 store <128 x i32> %data, ptr %out.vec, align 64 13120 ret void 13121} 13122 13123define void @mask_replication_factor8_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { 13124; AVX512F-ONLY-LABEL: mask_replication_factor8_vf32: 13125; AVX512F-ONLY: # %bb.0: 13126; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 13127; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 13128; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 13129; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] 13130; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm2, %zmm0 13131; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm3 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] 13132; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm3, %zmm4 13133; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm5 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] 13134; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm5, %zmm6 13135; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm7 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] 13136; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm7, %zmm8 13137; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm9 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] 13138; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm9, %zmm10 13139; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm11 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] 13140; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm11, %zmm12 13141; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm13 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] 13142; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm13, %zmm14 13143; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] 13144; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm15, %zmm1 13145; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm16 {%k1} {z} = -1 13146; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm2, %zmm2 13147; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm3, %zmm3 13148; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm5, %zmm5 13149; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm7, %zmm7 13150; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm9, %zmm9 13151; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm11, %zmm11 13152; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm13, %zmm13 13153; AVX512F-ONLY-NEXT: vpermd %zmm16, %zmm15, %zmm15 13154; AVX512F-ONLY-NEXT: vptestmd %zmm15, %zmm15, %k1 13155; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm15 {%k1} {z} 13156; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k1 13157; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm13 {%k1} {z} 13158; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1 13159; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm11 {%k1} {z} 13160; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1 13161; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm9 {%k1} {z} 13162; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 13163; AVX512F-ONLY-NEXT: vmovdqa32 
256(%rsi), %zmm7 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm3 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm2 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm1 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm14, %zmm14, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm14 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm12 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm10 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm8 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm6 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm4 {%k1} {z}
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 960(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 896(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 832(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 768(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 704(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 640(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 576(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 512(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 448(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 384(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 256(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 192(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 128(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 64(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, (%rdx)
; AVX512F-ONLY-NEXT: vzeroupper
; AVX512F-ONLY-NEXT: retq
;
; AVX512DQ-LABEL: mask_replication_factor8_vf32:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw 2(%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
; AVX512DQ-NEXT: vpermd %zmm1, %zmm2, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13]
; AVX512DQ-NEXT: vpermd %zmm1, %zmm3, %zmm4
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11]
; AVX512DQ-NEXT: vpermd %zmm1, %zmm5, %zmm6
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9]
; AVX512DQ-NEXT: vpermd %zmm1, %zmm7, %zmm8
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
; AVX512DQ-NEXT: vpermd %zmm1, %zmm9, %zmm10
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5]
; AVX512DQ-NEXT: vpermd %zmm1, %zmm11, %zmm12
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm13 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
; AVX512DQ-NEXT: vpermd %zmm1, %zmm13, %zmm14
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; AVX512DQ-NEXT: vpermd %zmm1, %zmm15, %zmm1
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm16
; AVX512DQ-NEXT: vpermd %zmm16, %zmm2, %zmm2
; AVX512DQ-NEXT: vpermd %zmm16, %zmm3, %zmm3
; AVX512DQ-NEXT: vpermd %zmm16, %zmm5, %zmm5
; AVX512DQ-NEXT: vpermd %zmm16, %zmm7, %zmm7
; AVX512DQ-NEXT: vpermd %zmm16, %zmm9, %zmm9
; AVX512DQ-NEXT: vpermd %zmm16, %zmm11, %zmm11
; AVX512DQ-NEXT: vpermd %zmm16, %zmm13, %zmm13
; AVX512DQ-NEXT: vpermd %zmm16, %zmm15, %zmm15
; AVX512DQ-NEXT: vpmovd2m %zmm15, %k1
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm15 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm13, %k1
; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm13 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1
; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm11 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1
; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm9 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1
; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm7 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1
; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1
; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm3 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1
; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm2 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm1 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm14, %k1
; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm14 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1
; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm12 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1
; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm10 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1
; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm8 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1
; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm6 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1
; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm4 {%k1} {z}
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm0, 960(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm4, 896(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm6, 832(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm8, 768(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm10, 704(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm12, 640(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm14, 576(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm1, 512(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm2, 448(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm3, 384(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm5, 320(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm7, 256(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm9, 192(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm11, 128(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm13, 64(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm15, (%rdx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor8_vf32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovd (%rdi), %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3]
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm3
; AVX512BW-NEXT: vpmovb2m %zmm3, %k1
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 =
[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] 13295; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm1 13296; AVX512BW-NEXT: vpmovb2m %zmm1, %k2 13297; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] 13298; AVX512BW-NEXT: vpshufb %zmm2, %zmm0, %zmm1 13299; AVX512BW-NEXT: vpmovb2m %zmm1, %k3 13300; AVX512BW-NEXT: vpshufb %zmm3, %zmm0, %zmm0 13301; AVX512BW-NEXT: vpmovb2m %zmm0, %k4 13302; AVX512BW-NEXT: kshiftrq $16, %k4, %k5 13303; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k5} {z} 13304; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k4} {z} 13305; AVX512BW-NEXT: kshiftrq $48, %k4, %k5 13306; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k5} {z} 13307; AVX512BW-NEXT: kshiftrq $32, %k4, %k4 13308; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k4} {z} 13309; AVX512BW-NEXT: kshiftrq $16, %k3, %k4 13310; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z} 13311; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z} 13312; AVX512BW-NEXT: kshiftrq $48, %k3, %k4 13313; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k4} {z} 13314; AVX512BW-NEXT: kshiftrq $32, %k3, %k3 13315; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k3} {z} 13316; AVX512BW-NEXT: kshiftrq $16, %k2, %k3 13317; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm8 {%k3} {z} 13318; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm9 {%k2} {z} 13319; AVX512BW-NEXT: kshiftrq $48, %k2, %k3 13320; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm10 {%k3} {z} 13321; AVX512BW-NEXT: kshiftrq $32, %k2, %k2 13322; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k2} {z} 13323; AVX512BW-NEXT: kshiftrq $16, %k1, %k2 13324; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm12 {%k2} {z} 13325; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm13 {%k1} {z} 13326; AVX512BW-NEXT: kshiftrq $48, %k1, %k2 13327; AVX512BW-NEXT: vmovdqa32 960(%rsi), %zmm14 {%k2} {z} 13328; AVX512BW-NEXT: kshiftrq $32, %k1, %k1 13329; AVX512BW-NEXT: vmovdqa32 896(%rsi), %zmm15 {%k1} {z} 13330; AVX512BW-NEXT: vmovdqa64 %zmm15, 896(%rdx) 13331; AVX512BW-NEXT: vmovdqa64 %zmm14, 960(%rdx) 13332; AVX512BW-NEXT: vmovdqa64 %zmm13, 768(%rdx) 13333; AVX512BW-NEXT: vmovdqa64 %zmm12, 832(%rdx) 13334; AVX512BW-NEXT: vmovdqa64 %zmm11, 640(%rdx) 13335; AVX512BW-NEXT: vmovdqa64 %zmm10, 704(%rdx) 13336; AVX512BW-NEXT: vmovdqa64 %zmm9, 512(%rdx) 13337; AVX512BW-NEXT: vmovdqa64 %zmm8, 576(%rdx) 13338; AVX512BW-NEXT: vmovdqa64 %zmm7, 384(%rdx) 13339; AVX512BW-NEXT: vmovdqa64 %zmm6, 448(%rdx) 13340; AVX512BW-NEXT: vmovdqa64 %zmm5, 256(%rdx) 13341; AVX512BW-NEXT: vmovdqa64 %zmm4, 320(%rdx) 13342; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rdx) 13343; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rdx) 13344; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx) 13345; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx) 13346; AVX512BW-NEXT: vzeroupper 13347; AVX512BW-NEXT: retq 13348 %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 13349 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 13350 %tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <256 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, 
i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31> 13351 %data = call <256 x i32> @llvm.masked.load.v256i32.p0(ptr %in.vec, i32 64, <256 x i1> %tgt.mask, <256 x i32> poison) 13352 store <256 x i32> %data, ptr %out.vec, align 64 13353 ret void 13354} 13355 13356define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { 13357; AVX512F-ONLY-LABEL: mask_replication_factor8_vf64: 13358; AVX512F-ONLY: # %bb.0: 13359; AVX512F-ONLY-NEXT: subq $136, %rsp 13360; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1 13361; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm6 {%k1} {z} = -1 13362; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k1 13363; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm8 {%k1} {z} = -1 13364; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 13365; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm10 {%k1} {z} = -1 13366; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 13367; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm12 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] 13368; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm12, %zmm0 13369; AVX512F-ONLY-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13370; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm14 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] 13371; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm14, %zmm0 13372; AVX512F-ONLY-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill 13373; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm16 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] 13374; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm16, %zmm0 13375; AVX512F-ONLY-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13376; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm18 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] 13377; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm18, %zmm0 13378; AVX512F-ONLY-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13379; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm20 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] 13380; 
AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm20, %zmm4 13381; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm22 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] 13382; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm22, %zmm5 13383; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm24 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] 13384; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm24, %zmm7 13385; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] 13386; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm26, %zmm9 13387; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm12, %zmm11 13388; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm14, %zmm13 13389; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm16, %zmm15 13390; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm18, %zmm17 13391; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm20, %zmm19 13392; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm22, %zmm21 13393; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm24, %zmm23 13394; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm26, %zmm25 13395; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm12, %zmm27 13396; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm14, %zmm28 13397; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm16, %zmm29 13398; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm18, %zmm30 13399; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm20, %zmm31 13400; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm22, %zmm3 13401; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm24, %zmm6 13402; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm26, %zmm2 13403; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm8 {%k1} {z} = -1 13404; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm12, %zmm1 13405; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm14, %zmm0 13406; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm16, %zmm16 13407; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm18, %zmm14 13408; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm20, %zmm12 13409; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm22, %zmm10 13410; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm24, %zmm18 13411; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm26, %zmm8 13412; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 13413; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm26 {%k1} {z} 13414; AVX512F-ONLY-NEXT: vptestmd %zmm18, %zmm18, %k1 13415; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm8 {%k1} {z} 13416; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1 13417; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm10 {%k1} {z} 13418; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1 13419; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm12 {%k1} {z} 13420; AVX512F-ONLY-NEXT: vptestmd %zmm14, %zmm14, %k1 13421; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm14 {%k1} {z} 13422; AVX512F-ONLY-NEXT: vptestmd %zmm16, %zmm16, %k1 13423; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm16 {%k1} {z} 13424; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 13425; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm18 {%k1} {z} 13426; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 13427; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm20 {%k1} {z} 13428; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 13429; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm22 {%k1} {z} 13430; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 13431; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm24 {%k1} {z} 13432; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 13433; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm0 {%k1} {z} 13434; AVX512F-ONLY-NEXT: vptestmd %zmm31, %zmm31, %k1 13435; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm1 {%k1} {z} 13436; AVX512F-ONLY-NEXT: vptestmd %zmm30, %zmm30, %k1 13437; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm2 {%k1} {z} 13438; AVX512F-ONLY-NEXT: vptestmd %zmm29, %zmm29, %k1 13439; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm3 {%k1} {z} 13440; AVX512F-ONLY-NEXT: vptestmd %zmm28, %zmm28, 
%k1 13441; AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm6 {%k1} {z} 13442; AVX512F-ONLY-NEXT: vptestmd %zmm27, %zmm27, %k1 13443; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm27 {%k1} {z} 13444; AVX512F-ONLY-NEXT: vptestmd %zmm25, %zmm25, %k1 13445; AVX512F-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm25 {%k1} {z} 13446; AVX512F-ONLY-NEXT: vptestmd %zmm23, %zmm23, %k1 13447; AVX512F-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm23 {%k1} {z} 13448; AVX512F-ONLY-NEXT: vptestmd %zmm21, %zmm21, %k1 13449; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm21 {%k1} {z} 13450; AVX512F-ONLY-NEXT: vptestmd %zmm19, %zmm19, %k1 13451; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k1} {z} 13452; AVX512F-ONLY-NEXT: vptestmd %zmm17, %zmm17, %k1 13453; AVX512F-ONLY-NEXT: vmovdqa32 1280(%rsi), %zmm17 {%k1} {z} 13454; AVX512F-ONLY-NEXT: vptestmd %zmm15, %zmm15, %k1 13455; AVX512F-ONLY-NEXT: vmovdqa32 1344(%rsi), %zmm15 {%k1} {z} 13456; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k1 13457; AVX512F-ONLY-NEXT: vmovdqa32 1408(%rsi), %zmm13 {%k1} {z} 13458; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1 13459; AVX512F-ONLY-NEXT: vmovdqa32 1472(%rsi), %zmm11 {%k1} {z} 13460; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1 13461; AVX512F-ONLY-NEXT: vmovdqa32 1536(%rsi), %zmm9 {%k1} {z} 13462; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 13463; AVX512F-ONLY-NEXT: vmovdqa32 1600(%rsi), %zmm7 {%k1} {z} 13464; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 13465; AVX512F-ONLY-NEXT: vmovdqa32 1664(%rsi), %zmm5 {%k1} {z} 13466; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 13467; AVX512F-ONLY-NEXT: vmovdqa32 1728(%rsi), %zmm4 {%k1} {z} 13468; AVX512F-ONLY-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 13469; AVX512F-ONLY-NEXT: vptestmd %zmm28, %zmm28, %k1 13470; AVX512F-ONLY-NEXT: vmovdqa32 1792(%rsi), %zmm28 {%k1} {z} 13471; AVX512F-ONLY-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload 13472; AVX512F-ONLY-NEXT: vptestmd %zmm29, %zmm29, %k1 13473; AVX512F-ONLY-NEXT: vmovdqa32 1856(%rsi), %zmm29 {%k1} {z} 13474; AVX512F-ONLY-NEXT: vmovdqu64 (%rsp), %zmm30 # 64-byte Reload 13475; AVX512F-ONLY-NEXT: vptestmd %zmm30, %zmm30, %k1 13476; AVX512F-ONLY-NEXT: vmovdqa32 1920(%rsi), %zmm30 {%k1} {z} 13477; AVX512F-ONLY-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload 13478; AVX512F-ONLY-NEXT: vptestmd %zmm31, %zmm31, %k1 13479; AVX512F-ONLY-NEXT: vmovdqa32 1984(%rsi), %zmm31 {%k1} {z} 13480; AVX512F-ONLY-NEXT: vmovdqa64 %zmm31, 1984(%rdx) 13481; AVX512F-ONLY-NEXT: vmovdqa64 %zmm30, 1920(%rdx) 13482; AVX512F-ONLY-NEXT: vmovdqa64 %zmm29, 1856(%rdx) 13483; AVX512F-ONLY-NEXT: vmovdqa64 %zmm28, 1792(%rdx) 13484; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 1728(%rdx) 13485; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 1664(%rdx) 13486; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 1600(%rdx) 13487; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 1536(%rdx) 13488; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 1472(%rdx) 13489; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 1408(%rdx) 13490; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, 1344(%rdx) 13491; AVX512F-ONLY-NEXT: vmovdqa64 %zmm17, 1280(%rdx) 13492; AVX512F-ONLY-NEXT: vmovdqa64 %zmm19, 1216(%rdx) 13493; AVX512F-ONLY-NEXT: vmovdqa64 %zmm21, 1152(%rdx) 13494; AVX512F-ONLY-NEXT: vmovdqa64 %zmm23, 1088(%rdx) 13495; AVX512F-ONLY-NEXT: vmovdqa64 %zmm25, 1024(%rdx) 13496; AVX512F-ONLY-NEXT: vmovdqa64 %zmm27, 960(%rdx) 13497; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 896(%rdx) 13498; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 832(%rdx) 13499; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 768(%rdx) 13500; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 
704(%rdx) 13501; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 640(%rdx) 13502; AVX512F-ONLY-NEXT: vmovdqa64 %zmm24, 576(%rdx) 13503; AVX512F-ONLY-NEXT: vmovdqa64 %zmm22, 512(%rdx) 13504; AVX512F-ONLY-NEXT: vmovdqa64 %zmm20, 448(%rdx) 13505; AVX512F-ONLY-NEXT: vmovdqa64 %zmm18, 384(%rdx) 13506; AVX512F-ONLY-NEXT: vmovdqa64 %zmm16, 320(%rdx) 13507; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 256(%rdx) 13508; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 192(%rdx) 13509; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 128(%rdx) 13510; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 64(%rdx) 13511; AVX512F-ONLY-NEXT: vmovdqa64 %zmm26, (%rdx) 13512; AVX512F-ONLY-NEXT: addq $136, %rsp 13513; AVX512F-ONLY-NEXT: vzeroupper 13514; AVX512F-ONLY-NEXT: retq 13515; 13516; AVX512DQ-LABEL: mask_replication_factor8_vf64: 13517; AVX512DQ: # %bb.0: 13518; AVX512DQ-NEXT: subq $136, %rsp 13519; AVX512DQ-NEXT: kmovw 6(%rdi), %k0 13520; AVX512DQ-NEXT: vpmovm2d %k0, %zmm6 13521; AVX512DQ-NEXT: kmovw 4(%rdi), %k0 13522; AVX512DQ-NEXT: vpmovm2d %k0, %zmm8 13523; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 13524; AVX512DQ-NEXT: vpmovm2d %k0, %zmm10 13525; AVX512DQ-NEXT: kmovw (%rdi), %k0 13526; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] 13527; AVX512DQ-NEXT: vpermd %zmm6, %zmm12, %zmm0 13528; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13529; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm14 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] 13530; AVX512DQ-NEXT: vpermd %zmm6, %zmm14, %zmm0 13531; AVX512DQ-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill 13532; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm16 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] 13533; AVX512DQ-NEXT: vpermd %zmm6, %zmm16, %zmm0 13534; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13535; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] 13536; AVX512DQ-NEXT: vpermd %zmm6, %zmm18, %zmm0 13537; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13538; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm20 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] 13539; AVX512DQ-NEXT: vpermd %zmm6, %zmm20, %zmm4 13540; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm22 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] 13541; AVX512DQ-NEXT: vpermd %zmm6, %zmm22, %zmm5 13542; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm24 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] 13543; AVX512DQ-NEXT: vpermd %zmm6, %zmm24, %zmm7 13544; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] 13545; AVX512DQ-NEXT: vpermd %zmm6, %zmm26, %zmm9 13546; AVX512DQ-NEXT: vpermd %zmm8, %zmm12, %zmm11 13547; AVX512DQ-NEXT: vpermd %zmm8, %zmm14, %zmm13 13548; AVX512DQ-NEXT: vpermd %zmm8, %zmm16, %zmm15 13549; AVX512DQ-NEXT: vpermd %zmm8, %zmm18, %zmm17 13550; AVX512DQ-NEXT: vpermd %zmm8, %zmm20, %zmm19 13551; AVX512DQ-NEXT: vpermd %zmm8, %zmm22, %zmm21 13552; AVX512DQ-NEXT: vpermd %zmm8, %zmm24, %zmm23 13553; AVX512DQ-NEXT: vpermd %zmm8, %zmm26, %zmm25 13554; AVX512DQ-NEXT: vpermd %zmm10, %zmm12, %zmm27 13555; AVX512DQ-NEXT: vpermd %zmm10, %zmm14, %zmm28 13556; AVX512DQ-NEXT: vpermd %zmm10, %zmm16, %zmm29 13557; AVX512DQ-NEXT: vpermd %zmm10, %zmm18, %zmm30 13558; AVX512DQ-NEXT: vpermd %zmm10, %zmm20, %zmm31 13559; AVX512DQ-NEXT: vpermd %zmm10, %zmm22, %zmm3 13560; AVX512DQ-NEXT: vpermd %zmm10, %zmm24, %zmm6 13561; AVX512DQ-NEXT: vpermd %zmm10, %zmm26, %zmm2 13562; AVX512DQ-NEXT: vpmovm2d %k0, %zmm8 13563; AVX512DQ-NEXT: vpermd %zmm8, %zmm12, %zmm1 13564; AVX512DQ-NEXT: vpermd %zmm8, %zmm14, %zmm0 13565; AVX512DQ-NEXT: vpermd %zmm8, %zmm16, %zmm16 
13566; AVX512DQ-NEXT: vpermd %zmm8, %zmm18, %zmm14 13567; AVX512DQ-NEXT: vpermd %zmm8, %zmm20, %zmm12 13568; AVX512DQ-NEXT: vpermd %zmm8, %zmm22, %zmm10 13569; AVX512DQ-NEXT: vpermd %zmm8, %zmm24, %zmm18 13570; AVX512DQ-NEXT: vpermd %zmm8, %zmm26, %zmm8 13571; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 13572; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm26 {%k1} {z} 13573; AVX512DQ-NEXT: vpmovd2m %zmm18, %k1 13574; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm8 {%k1} {z} 13575; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1 13576; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm10 {%k1} {z} 13577; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1 13578; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm12 {%k1} {z} 13579; AVX512DQ-NEXT: vpmovd2m %zmm14, %k1 13580; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm14 {%k1} {z} 13581; AVX512DQ-NEXT: vpmovd2m %zmm16, %k1 13582; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm16 {%k1} {z} 13583; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 13584; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm18 {%k1} {z} 13585; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 13586; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm20 {%k1} {z} 13587; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 13588; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm22 {%k1} {z} 13589; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 13590; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm24 {%k1} {z} 13591; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 13592; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm0 {%k1} {z} 13593; AVX512DQ-NEXT: vpmovd2m %zmm31, %k1 13594; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm1 {%k1} {z} 13595; AVX512DQ-NEXT: vpmovd2m %zmm30, %k1 13596; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm2 {%k1} {z} 13597; AVX512DQ-NEXT: vpmovd2m %zmm29, %k1 13598; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm3 {%k1} {z} 13599; AVX512DQ-NEXT: vpmovd2m %zmm28, %k1 13600; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm6 {%k1} {z} 13601; AVX512DQ-NEXT: vpmovd2m %zmm27, %k1 13602; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm27 {%k1} {z} 13603; AVX512DQ-NEXT: vpmovd2m %zmm25, %k1 13604; AVX512DQ-NEXT: vmovdqa32 1024(%rsi), %zmm25 {%k1} {z} 13605; AVX512DQ-NEXT: vpmovd2m %zmm23, %k1 13606; AVX512DQ-NEXT: vmovdqa32 1088(%rsi), %zmm23 {%k1} {z} 13607; AVX512DQ-NEXT: vpmovd2m %zmm21, %k1 13608; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm21 {%k1} {z} 13609; AVX512DQ-NEXT: vpmovd2m %zmm19, %k1 13610; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k1} {z} 13611; AVX512DQ-NEXT: vpmovd2m %zmm17, %k1 13612; AVX512DQ-NEXT: vmovdqa32 1280(%rsi), %zmm17 {%k1} {z} 13613; AVX512DQ-NEXT: vpmovd2m %zmm15, %k1 13614; AVX512DQ-NEXT: vmovdqa32 1344(%rsi), %zmm15 {%k1} {z} 13615; AVX512DQ-NEXT: vpmovd2m %zmm13, %k1 13616; AVX512DQ-NEXT: vmovdqa32 1408(%rsi), %zmm13 {%k1} {z} 13617; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1 13618; AVX512DQ-NEXT: vmovdqa32 1472(%rsi), %zmm11 {%k1} {z} 13619; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1 13620; AVX512DQ-NEXT: vmovdqa32 1536(%rsi), %zmm9 {%k1} {z} 13621; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 13622; AVX512DQ-NEXT: vmovdqa32 1600(%rsi), %zmm7 {%k1} {z} 13623; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 13624; AVX512DQ-NEXT: vmovdqa32 1664(%rsi), %zmm5 {%k1} {z} 13625; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 13626; AVX512DQ-NEXT: vmovdqa32 1728(%rsi), %zmm4 {%k1} {z} 13627; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload 13628; AVX512DQ-NEXT: vpmovd2m %zmm28, %k1 13629; AVX512DQ-NEXT: vmovdqa32 1792(%rsi), %zmm28 {%k1} {z} 13630; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload 13631; AVX512DQ-NEXT: vpmovd2m %zmm29, %k1 13632; AVX512DQ-NEXT: vmovdqa32 1856(%rsi), %zmm29 {%k1} {z} 13633; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm30 # 64-byte 
Reload
; AVX512DQ-NEXT: vpmovd2m %zmm30, %k1
; AVX512DQ-NEXT: vmovdqa32 1920(%rsi), %zmm30 {%k1} {z}
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
; AVX512DQ-NEXT: vpmovd2m %zmm31, %k1
; AVX512DQ-NEXT: vmovdqa32 1984(%rsi), %zmm31 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm31, 1984(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm30, 1920(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm29, 1856(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm28, 1792(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm4, 1728(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm5, 1664(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm7, 1600(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm9, 1536(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm11, 1472(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm13, 1408(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm15, 1344(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm17, 1280(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm19, 1216(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm21, 1152(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm23, 1088(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm25, 1024(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm27, 960(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm6, 896(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm3, 832(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm2, 768(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm1, 704(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, 640(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm24, 576(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm22, 512(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm20, 448(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm18, 384(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm16, 320(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm14, 256(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm12, 192(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm10, 128(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm8, 64(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm26, (%rdx)
; AVX512DQ-NEXT: addq $136, %rsp
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: mask_replication_factor8_vf64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovq (%rdi), %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[6,7,6,7,6,7,6,7]
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm7
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm12
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,4,5,4,5,4,5]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm15
; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm16
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm10
; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm5
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm0, %zmm1
; AVX512BW-NEXT: vpshufb %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovb2m %zmm0, %k2
; AVX512BW-NEXT: kshiftrq $16, %k2, %k1
; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT: kshiftrq $32, %k2, %k1
; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z}
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
; AVX512BW-NEXT: kshiftrq $48, %k2, %k2
; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k2} {z}
; AVX512BW-NEXT: kshiftrq $16, %k1, %k2
; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
; AVX512BW-NEXT: kshiftrq $32, %k1, %k2
; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k2} {z}
; AVX512BW-NEXT: vpmovb2m %zmm5, %k2
; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z}
; AVX512BW-NEXT: kshiftrq $48, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm8 {%k1} {z}
; AVX512BW-NEXT: kshiftrq $16, %k2, %k1
; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k1} {z}
; AVX512BW-NEXT: kshiftrq $32, %k2, %k1
; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k1} {z}
; AVX512BW-NEXT: vpmovb2m %zmm10, %k1
; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm10 {%k2} {z}
; AVX512BW-NEXT: kshiftrq $48, %k2, %k2
; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm13 {%k2} {z}
; AVX512BW-NEXT: kshiftrq $16, %k1, %k2
; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm14 {%k2} {z}
; AVX512BW-NEXT: kshiftrq $32, %k1, %k2
; AVX512BW-NEXT: vmovdqa32 896(%rsi), %zmm17 {%k2} {z}
; AVX512BW-NEXT: vpmovb2m %zmm16, %k2
; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm16 {%k1} {z}
; AVX512BW-NEXT: kshiftrq $48, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 960(%rsi), %zmm18 {%k1} {z}
; AVX512BW-NEXT: kshiftrq $16, %k2, %k1
; AVX512BW-NEXT: vmovdqa32 1088(%rsi), %zmm19 {%k1} {z}
; AVX512BW-NEXT: kshiftrq $32, %k2, %k1
; AVX512BW-NEXT: vmovdqa32 1152(%rsi), %zmm20 {%k1} {z}
; AVX512BW-NEXT: vpmovb2m %zmm15, %k1
; AVX512BW-NEXT: vmovdqa32 1024(%rsi), %zmm15 {%k2} {z}
; AVX512BW-NEXT: kshiftrq $48, %k2, %k2
; AVX512BW-NEXT: vmovdqa32 1216(%rsi), %zmm21 {%k2} {z}
; AVX512BW-NEXT: kshiftrq $16, %k1, %k2
; AVX512BW-NEXT: vmovdqa32 1344(%rsi), %zmm22 {%k2} {z}
; AVX512BW-NEXT: kshiftrq $32, %k1, %k2
; AVX512BW-NEXT: vmovdqa32 1408(%rsi), %zmm23 {%k2} {z}
; AVX512BW-NEXT: vpmovb2m %zmm12, %k2
; AVX512BW-NEXT: vmovdqa32 1280(%rsi), %zmm12 {%k1} {z}
; AVX512BW-NEXT: kshiftrq $48, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 1472(%rsi), %zmm24 {%k1} {z}
; AVX512BW-NEXT: kshiftrq $16, %k2, %k1
; AVX512BW-NEXT: vmovdqa32 1600(%rsi), %zmm25 {%k1} {z}
; AVX512BW-NEXT: kshiftrq $32, %k2, %k1
; AVX512BW-NEXT: vmovdqa32 1664(%rsi), %zmm26 {%k1} {z}
; AVX512BW-NEXT: vpmovb2m %zmm7, %k1
; AVX512BW-NEXT: vmovdqa32 1536(%rsi), %zmm7 {%k2} {z}
; AVX512BW-NEXT: kshiftrq $48, %k2, %k2
; AVX512BW-NEXT: vmovdqa32 1728(%rsi), %zmm27 {%k2} {z}
; AVX512BW-NEXT: kshiftrq $16, %k1, %k2
; AVX512BW-NEXT: vmovdqa32 1856(%rsi), %zmm28 {%k2} {z}
; AVX512BW-NEXT: kshiftrq $32, %k1, %k2
; AVX512BW-NEXT: vmovdqa32 1920(%rsi), %zmm29 {%k2} {z}
; AVX512BW-NEXT: vmovdqa32 1792(%rsi), %zmm30 {%k1} {z}
; AVX512BW-NEXT: kshiftrq $48, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 1984(%rsi), %zmm31 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 %zmm31, 1984(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm29, 1920(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm28, 1856(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm30, 1792(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm27, 1728(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm26, 1664(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm25, 1600(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm7, 1536(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm24, 1472(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm23, 1408(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm22, 1344(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm12, 1280(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm21, 1216(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm20, 1152(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm19, 1088(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm15, 1024(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm18, 960(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm17, 896(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm14, 832(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm16, 768(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm13, 704(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm11, 640(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm9, 576(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm10, 512(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm8, 448(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm4, 320(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm5, 256(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %src.mask = load <64 x i1>, ptr %in.maskvec, align 64
  %tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <512 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
  %data = call <512 x i32> @llvm.masked.load.v512i32.p0(ptr %in.vec, i32 64, <512 x i1> %tgt.mask, <512 x i32> poison)
  store <512 x i32> %data, ptr %out.vec, align 64
  ret void
}

declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32, <4 x i1>, <4 x i32>)
declare <6 x i32> @llvm.masked.load.v6i32.p0(ptr, i32, <6 x i1>, <6 x i32>)
declare <8 x i32> @llvm.masked.load.v8i32.p0(ptr, i32, <8 x i1>, <8 x i32>)
declare <10 x i32> @llvm.masked.load.v10i32.p0(ptr, i32, <10 x i1>, <10 x i32>)
declare <12 x i32> @llvm.masked.load.v12i32.p0(ptr, i32, <12 x i1>, <12 x i32>)
declare <14 x i32> @llvm.masked.load.v14i32.p0(ptr, i32, <14 x i1>, <14 x i32>)
declare <16 x i32> @llvm.masked.load.v16i32.p0(ptr, i32, <16 x i1>, <16 x i32>)
declare <20 x i32> @llvm.masked.load.v20i32.p0(ptr, i32, <20 x i1>, <20 x i32>)
declare <24 x i32> @llvm.masked.load.v24i32.p0(ptr, i32, <24 x i1>, <24 x i32>)
declare <28 x i32> @llvm.masked.load.v28i32.p0(ptr, i32, <28 x i1>, <28 x i32>)
declare <32 x i32> @llvm.masked.load.v32i32.p0(ptr, i32, <32 x i1>, <32 x i32>)
declare <40 x i32> @llvm.masked.load.v40i32.p0(ptr, i32, <40 x i1>, <40 x i32>)
declare <48 x i32> @llvm.masked.load.v48i32.p0(ptr, i32, <48 x i1>, <48 x i32>)
declare <56 x i32> @llvm.masked.load.v56i32.p0(ptr, i32, <56 x i1>, <56 x i32>)
declare <64 x i32> @llvm.masked.load.v64i32.p0(ptr, i32, <64 x i1>, <64 x i32>)
declare <80 x i32> @llvm.masked.load.v80i32.p0(ptr, i32, <80 x i1>, <80 x i32>)
declare <96 x i32> @llvm.masked.load.v96i32.p0(ptr, i32, <96 x i1>, <96 x i32>)
declare <112 x i32> @llvm.masked.load.v112i32.p0(ptr, i32, <112 x i1>, <112 x i32>)
declare <128 x i32> @llvm.masked.load.v128i32.p0(ptr, i32, <128 x i1>, <128 x i32>)
declare <160 x i32> @llvm.masked.load.v160i32.p0(ptr, i32, <160 x i1>, <160 x i32>)
declare <192 x i32> @llvm.masked.load.v192i32.p0(ptr, i32, <192 x i1>, <192 x i32>)
declare <224 x i32> @llvm.masked.load.v224i32.p0(ptr, i32, <224 x i1>, <224 x i32>)
declare <256 x i32> @llvm.masked.load.v256i32.p0(ptr, i32, <256 x i1>, <256 x i32>)
declare <320 x i32> @llvm.masked.load.v320i32.p0(ptr, i32, <320 x i1>, <320 x i32>)
declare <384 x i32> @llvm.masked.load.v384i32.p0(ptr, i32, <384 x i1>, <384 x i32>)
declare <448 x i32> @llvm.masked.load.v448i32.p0(ptr, i32, <448 x i1>, <448 x i32>)
declare <512 x i32> @llvm.masked.load.v512i32.p0(ptr, i32, <512 x i1>, <512 x i32>)
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; AVX512: {{.*}}
; FALLBACK0: {{.*}}
; FALLBACK1: {{.*}}
; FALLBACK2: {{.*}}
; FALLBACK3: {{.*}}
; FALLBACK4: {{.*}}
; FALLBACK5: {{.*}}
; FALLBACK6: {{.*}}
; FALLBACK7: {{.*}}