1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE 3; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX 4; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 5; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP 6; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP 7; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512 8; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP 9; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ 10; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP 11; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW 12; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP 13; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW 14; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP 15 16; These patterns are produced by LoopVectorizer for interleaved loads. 
17 18define void @load_i8_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { 19; SSE-LABEL: load_i8_stride7_vf2: 20; SSE: # %bb.0: 21; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 22; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 23; SSE-NEXT: movdqa (%rdi), %xmm3 24; SSE-NEXT: pxor %xmm4, %xmm4 25; SSE-NEXT: movdqa %xmm3, %xmm2 26; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] 27; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,3,2,3] 28; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7] 29; SSE-NEXT: packuswb %xmm1, %xmm1 30; SSE-NEXT: movdqa %xmm2, %xmm0 31; SSE-NEXT: psrld $16, %xmm0 32; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] 33; SSE-NEXT: movdqa %xmm0, %xmm4 34; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] 35; SSE-NEXT: packuswb %xmm4, %xmm4 36; SSE-NEXT: movdqa %xmm2, %xmm6 37; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] 38; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,1,2,3] 39; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7] 40; SSE-NEXT: packuswb %xmm5, %xmm5 41; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,6,7] 42; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] 43; SSE-NEXT: packuswb %xmm6, %xmm6 44; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,2,3] 45; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] 46; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] 47; SSE-NEXT: psrlq $48, %xmm3 48; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] 49; SSE-NEXT: packuswb %xmm7, %xmm7 50; SSE-NEXT: packuswb %xmm0, %xmm0 51; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] 52; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] 53; SSE-NEXT: packuswb %xmm2, %xmm2 54; SSE-NEXT: movd %xmm1, %edi 55; SSE-NEXT: movw %di, (%rsi) 56; SSE-NEXT: movd %xmm4, %esi 57; SSE-NEXT: movw %si, (%rdx) 58; SSE-NEXT: movd %xmm5, %edx 59; SSE-NEXT: movw %dx, (%rcx) 60; SSE-NEXT: movd %xmm6, %ecx 61; SSE-NEXT: movw %cx, (%r8) 62; SSE-NEXT: movd %xmm7, %ecx 63; SSE-NEXT: movw %cx, (%r9) 64; SSE-NEXT: movd %xmm0, %ecx 65; SSE-NEXT: movw %cx, (%r10) 66; SSE-NEXT: movd %xmm2, %ecx 67; SSE-NEXT: movw %cx, (%rax) 68; SSE-NEXT: retq 69; 70; AVX-LABEL: load_i8_stride7_vf2: 71; AVX: # %bb.0: 72; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 73; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 74; AVX-NEXT: vmovdqa (%rdi), %xmm0 75; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 76; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 77; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 78; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 79; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 80; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 81; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 82; AVX-NEXT: vpextrw $0, %xmm1, (%rsi) 83; AVX-NEXT: vpextrw $0, %xmm2, (%rdx) 84; AVX-NEXT: vpextrw $0, %xmm3, (%rcx) 85; AVX-NEXT: vpextrw $0, %xmm4, (%r8) 86; AVX-NEXT: vpextrw $0, %xmm5, (%r9) 87; 
AVX-NEXT: vpextrw $0, %xmm6, (%r10) 88; AVX-NEXT: vpextrw $0, %xmm0, (%rax) 89; AVX-NEXT: retq 90; 91; AVX2-LABEL: load_i8_stride7_vf2: 92; AVX2: # %bb.0: 93; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 94; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 95; AVX2-NEXT: vmovdqa (%rdi), %xmm0 96; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 97; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 98; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 99; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 100; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 101; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 102; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 103; AVX2-NEXT: vpextrw $0, %xmm1, (%rsi) 104; AVX2-NEXT: vpextrw $0, %xmm2, (%rdx) 105; AVX2-NEXT: vpextrw $0, %xmm3, (%rcx) 106; AVX2-NEXT: vpextrw $0, %xmm4, (%r8) 107; AVX2-NEXT: vpextrw $0, %xmm5, (%r9) 108; AVX2-NEXT: vpextrw $0, %xmm6, (%r10) 109; AVX2-NEXT: vpextrw $0, %xmm0, (%rax) 110; AVX2-NEXT: retq 111; 112; AVX2-FP-LABEL: load_i8_stride7_vf2: 113; AVX2-FP: # %bb.0: 114; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 115; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10 116; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 117; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 118; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 119; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 120; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 121; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 122; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 123; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 124; AVX2-FP-NEXT: vpextrw $0, %xmm1, (%rsi) 125; AVX2-FP-NEXT: vpextrw $0, %xmm2, (%rdx) 126; AVX2-FP-NEXT: vpextrw $0, %xmm3, (%rcx) 127; AVX2-FP-NEXT: vpextrw $0, %xmm4, (%r8) 128; AVX2-FP-NEXT: vpextrw $0, %xmm5, (%r9) 129; AVX2-FP-NEXT: vpextrw $0, %xmm6, (%r10) 130; AVX2-FP-NEXT: vpextrw $0, %xmm0, (%rax) 131; AVX2-FP-NEXT: retq 132; 133; AVX2-FCP-LABEL: load_i8_stride7_vf2: 134; AVX2-FCP: # %bb.0: 135; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 136; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 137; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 138; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 139; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 140; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 141; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 142; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 143; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 144; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 145; AVX2-FCP-NEXT: vpextrw $0, %xmm1, (%rsi) 146; AVX2-FCP-NEXT: vpextrw $0, %xmm2, (%rdx) 147; AVX2-FCP-NEXT: vpextrw $0, %xmm3, (%rcx) 148; AVX2-FCP-NEXT: vpextrw $0, %xmm4, (%r8) 149; AVX2-FCP-NEXT: vpextrw $0, %xmm5, (%r9) 150; AVX2-FCP-NEXT: vpextrw $0, %xmm6, (%r10) 151; AVX2-FCP-NEXT: vpextrw $0, %xmm0, (%rax) 152; AVX2-FCP-NEXT: retq 153; 154; AVX512-LABEL: load_i8_stride7_vf2: 155; AVX512: # %bb.0: 156; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 157; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 158; 
AVX512-NEXT: vmovdqa (%rdi), %xmm0 159; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 160; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 161; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 162; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 163; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 164; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 165; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 166; AVX512-NEXT: vpextrw $0, %xmm1, (%rsi) 167; AVX512-NEXT: vpextrw $0, %xmm2, (%rdx) 168; AVX512-NEXT: vpextrw $0, %xmm3, (%rcx) 169; AVX512-NEXT: vpextrw $0, %xmm4, (%r8) 170; AVX512-NEXT: vpextrw $0, %xmm5, (%r9) 171; AVX512-NEXT: vpextrw $0, %xmm6, (%r10) 172; AVX512-NEXT: vpextrw $0, %xmm0, (%rax) 173; AVX512-NEXT: retq 174; 175; AVX512-FCP-LABEL: load_i8_stride7_vf2: 176; AVX512-FCP: # %bb.0: 177; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 178; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 179; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 180; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 181; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 182; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 183; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 184; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 185; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 186; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 187; AVX512-FCP-NEXT: vpextrw $0, %xmm1, (%rsi) 188; AVX512-FCP-NEXT: vpextrw $0, %xmm2, (%rdx) 189; AVX512-FCP-NEXT: vpextrw $0, %xmm3, (%rcx) 190; AVX512-FCP-NEXT: vpextrw $0, %xmm4, (%r8) 191; AVX512-FCP-NEXT: vpextrw $0, %xmm5, (%r9) 192; AVX512-FCP-NEXT: vpextrw $0, %xmm6, (%r10) 193; AVX512-FCP-NEXT: vpextrw $0, %xmm0, (%rax) 194; AVX512-FCP-NEXT: retq 195; 196; AVX512DQ-LABEL: load_i8_stride7_vf2: 197; AVX512DQ: # %bb.0: 198; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 199; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 200; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 201; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 202; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 203; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 204; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 205; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 206; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 207; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 208; AVX512DQ-NEXT: vpextrw $0, %xmm1, (%rsi) 209; AVX512DQ-NEXT: vpextrw $0, %xmm2, (%rdx) 210; AVX512DQ-NEXT: vpextrw $0, %xmm3, (%rcx) 211; AVX512DQ-NEXT: vpextrw $0, %xmm4, (%r8) 212; AVX512DQ-NEXT: vpextrw $0, %xmm5, (%r9) 213; AVX512DQ-NEXT: vpextrw $0, %xmm6, (%r10) 214; AVX512DQ-NEXT: vpextrw $0, %xmm0, (%rax) 215; AVX512DQ-NEXT: retq 216; 217; AVX512DQ-FCP-LABEL: load_i8_stride7_vf2: 218; AVX512DQ-FCP: # %bb.0: 219; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 220; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 221; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 222; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 223; 
AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 224; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 225; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 226; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 227; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 228; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 229; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm1, (%rsi) 230; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm2, (%rdx) 231; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm3, (%rcx) 232; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm4, (%r8) 233; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm5, (%r9) 234; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm6, (%r10) 235; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm0, (%rax) 236; AVX512DQ-FCP-NEXT: retq 237; 238; AVX512BW-LABEL: load_i8_stride7_vf2: 239; AVX512BW: # %bb.0: 240; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 241; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 242; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 243; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 244; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 245; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 246; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 247; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 248; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 249; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 250; AVX512BW-NEXT: vpextrw $0, %xmm1, (%rsi) 251; AVX512BW-NEXT: vpextrw $0, %xmm2, (%rdx) 252; AVX512BW-NEXT: vpextrw $0, %xmm3, (%rcx) 253; AVX512BW-NEXT: vpextrw $0, %xmm4, (%r8) 254; AVX512BW-NEXT: vpextrw $0, %xmm5, (%r9) 255; AVX512BW-NEXT: vpextrw $0, %xmm6, (%r10) 256; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rax) 257; AVX512BW-NEXT: retq 258; 259; AVX512BW-FCP-LABEL: load_i8_stride7_vf2: 260; AVX512BW-FCP: # %bb.0: 261; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 262; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 263; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 264; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 265; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 266; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 267; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 268; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 269; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 270; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 271; AVX512BW-FCP-NEXT: vpextrw $0, %xmm1, (%rsi) 272; AVX512BW-FCP-NEXT: vpextrw $0, %xmm2, (%rdx) 273; AVX512BW-FCP-NEXT: vpextrw $0, %xmm3, (%rcx) 274; AVX512BW-FCP-NEXT: vpextrw $0, %xmm4, (%r8) 275; AVX512BW-FCP-NEXT: vpextrw $0, %xmm5, (%r9) 276; AVX512BW-FCP-NEXT: vpextrw $0, %xmm6, (%r10) 277; AVX512BW-FCP-NEXT: vpextrw $0, %xmm0, (%rax) 278; AVX512BW-FCP-NEXT: retq 279; 280; AVX512DQ-BW-LABEL: load_i8_stride7_vf2: 281; AVX512DQ-BW: # %bb.0: 282; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 283; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 284; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 285; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 286; 
AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 287; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 288; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 289; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 290; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 291; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 292; AVX512DQ-BW-NEXT: vpextrw $0, %xmm1, (%rsi) 293; AVX512DQ-BW-NEXT: vpextrw $0, %xmm2, (%rdx) 294; AVX512DQ-BW-NEXT: vpextrw $0, %xmm3, (%rcx) 295; AVX512DQ-BW-NEXT: vpextrw $0, %xmm4, (%r8) 296; AVX512DQ-BW-NEXT: vpextrw $0, %xmm5, (%r9) 297; AVX512DQ-BW-NEXT: vpextrw $0, %xmm6, (%r10) 298; AVX512DQ-BW-NEXT: vpextrw $0, %xmm0, (%rax) 299; AVX512DQ-BW-NEXT: retq 300; 301; AVX512DQ-BW-FCP-LABEL: load_i8_stride7_vf2: 302; AVX512DQ-BW-FCP: # %bb.0: 303; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 304; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 305; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 306; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 307; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 308; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 309; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 310; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 311; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 312; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 313; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm1, (%rsi) 314; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm2, (%rdx) 315; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm3, (%rcx) 316; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm4, (%r8) 317; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm5, (%r9) 318; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm6, (%r10) 319; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm0, (%rax) 320; AVX512DQ-BW-FCP-NEXT: retq 321 %wide.vec = load <14 x i8>, ptr %in.vec, align 64 322 %strided.vec0 = shufflevector <14 x i8> %wide.vec, <14 x i8> poison, <2 x i32> <i32 0, i32 7> 323 %strided.vec1 = shufflevector <14 x i8> %wide.vec, <14 x i8> poison, <2 x i32> <i32 1, i32 8> 324 %strided.vec2 = shufflevector <14 x i8> %wide.vec, <14 x i8> poison, <2 x i32> <i32 2, i32 9> 325 %strided.vec3 = shufflevector <14 x i8> %wide.vec, <14 x i8> poison, <2 x i32> <i32 3, i32 10> 326 %strided.vec4 = shufflevector <14 x i8> %wide.vec, <14 x i8> poison, <2 x i32> <i32 4, i32 11> 327 %strided.vec5 = shufflevector <14 x i8> %wide.vec, <14 x i8> poison, <2 x i32> <i32 5, i32 12> 328 %strided.vec6 = shufflevector <14 x i8> %wide.vec, <14 x i8> poison, <2 x i32> <i32 6, i32 13> 329 store <2 x i8> %strided.vec0, ptr %out.vec0, align 64 330 store <2 x i8> %strided.vec1, ptr %out.vec1, align 64 331 store <2 x i8> %strided.vec2, ptr %out.vec2, align 64 332 store <2 x i8> %strided.vec3, ptr %out.vec3, align 64 333 store <2 x i8> %strided.vec4, ptr %out.vec4, align 64 334 store <2 x i8> %strided.vec5, ptr %out.vec5, align 64 335 store <2 x i8> %strided.vec6, ptr %out.vec6, align 64 336 ret void 337} 338 339define void @load_i8_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { 340; SSE-LABEL: load_i8_stride7_vf4: 341; SSE: # %bb.0: 342; 
SSE-NEXT: movdqa (%rdi), %xmm4 343; SSE-NEXT: movdqa 16(%rdi), %xmm0 344; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,65535,65535,65535] 345; SSE-NEXT: movdqa %xmm4, %xmm1 346; SSE-NEXT: pand %xmm3, %xmm1 347; SSE-NEXT: pandn %xmm0, %xmm3 348; SSE-NEXT: por %xmm1, %xmm3 349; SSE-NEXT: pxor %xmm1, %xmm1 350; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] 351; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,65535,0,65535] 352; SSE-NEXT: pand %xmm2, %xmm3 353; SSE-NEXT: movdqa %xmm0, %xmm5 354; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm4[0,0] 355; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[2,3] 356; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,0,65535,65535,65535] 357; SSE-NEXT: movdqa %xmm0, %xmm8 358; SSE-NEXT: pand %xmm7, %xmm8 359; SSE-NEXT: pandn %xmm4, %xmm7 360; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,65535,0,65535,65535] 361; SSE-NEXT: movdqa %xmm0, %xmm9 362; SSE-NEXT: pand %xmm6, %xmm9 363; SSE-NEXT: pandn %xmm4, %xmm6 364; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,0,65535,65535,0,65535,65535] 365; SSE-NEXT: movdqa %xmm0, %xmm14 366; SSE-NEXT: pand %xmm13, %xmm14 367; SSE-NEXT: pandn %xmm4, %xmm13 368; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm4[1,3,2,3] 369; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,2,2,3] 370; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,65535,65535,0,65535] 371; SSE-NEXT: pand %xmm11, %xmm0 372; SSE-NEXT: pandn %xmm4, %xmm11 373; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] 374; SSE-NEXT: pandn %xmm4, %xmm2 375; SSE-NEXT: por %xmm3, %xmm2 376; SSE-NEXT: movdqa {{.*#+}} xmm15 = [0,65535,65535,65535,65535,65535,65535,0] 377; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] 378; SSE-NEXT: pand %xmm15, %xmm5 379; SSE-NEXT: pandn %xmm4, %xmm15 380; SSE-NEXT: por %xmm5, %xmm15 381; SSE-NEXT: por %xmm8, %xmm7 382; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3],xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] 383; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[0,2,2,3,4,5,6,7] 384; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] 385; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[1,1,0,3,4,5,6,7] 386; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535] 387; SSE-NEXT: pand %xmm3, %xmm5 388; SSE-NEXT: pandn %xmm4, %xmm3 389; SSE-NEXT: por %xmm5, %xmm3 390; SSE-NEXT: por %xmm9, %xmm6 391; SSE-NEXT: movdqa %xmm6, %xmm4 392; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] 393; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3],xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] 394; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] 395; SSE-NEXT: por %xmm14, %xmm13 396; SSE-NEXT: movdqa %xmm13, %xmm4 397; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] 398; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[3,1,2,3,4,5,6,7] 
399; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3],xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] 400; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[2,1,2,3] 401; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] 402; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] 403; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 404; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdi 405; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] 406; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] 407; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] 408; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] 409; SSE-NEXT: packuswb %xmm2, %xmm2 410; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[0,3,2,3] 411; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,0,3,2,4,5,6,7] 412; SSE-NEXT: packuswb %xmm5, %xmm5 413; SSE-NEXT: packuswb %xmm3, %xmm3 414; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7] 415; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,6,7] 416; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,0,2,3] 417; SSE-NEXT: packuswb %xmm6, %xmm6 418; SSE-NEXT: packuswb %xmm4, %xmm4 419; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1] 420; SSE-NEXT: movdqa %xmm10, %xmm7 421; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15] 422; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] 423; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,2,2,3,4,5,6,7] 424; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] 425; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm10[2,1,2,3] 426; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] 427; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] 428; SSE-NEXT: packuswb %xmm8, %xmm8 429; SSE-NEXT: por %xmm0, %xmm11 430; SSE-NEXT: movdqa %xmm11, %xmm0 431; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 432; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] 433; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm1[8],xmm11[9],xmm1[9],xmm11[10],xmm1[10],xmm11[11],xmm1[11],xmm11[12],xmm1[12],xmm11[13],xmm1[13],xmm11[14],xmm1[14],xmm11[15],xmm1[15] 434; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,2,1] 435; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7] 436; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 437; SSE-NEXT: packuswb %xmm0, %xmm0 438; SSE-NEXT: movd %xmm2, (%rsi) 439; SSE-NEXT: movd %xmm5, (%rdx) 440; SSE-NEXT: movd %xmm3, (%rcx) 441; SSE-NEXT: movd %xmm6, (%r8) 442; SSE-NEXT: movd %xmm4, (%r9) 443; SSE-NEXT: movd %xmm8, (%rdi) 444; SSE-NEXT: movd %xmm0, (%rax) 445; SSE-NEXT: retq 446; 447; AVX-LABEL: load_i8_stride7_vf4: 448; AVX: # %bb.0: 449; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 450; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 451; AVX-NEXT: vmovdqa (%rdi), %xmm0 452; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 453; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] 454; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 455; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 456; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u] 457; 
AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 458; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 459; AVX-NEXT: vmovd {{.*#+}} xmm4 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] 460; AVX-NEXT: vpshufb %xmm4, %xmm0, %xmm5 461; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 462; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] 463; AVX-NEXT: vmovd {{.*#+}} xmm6 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] 464; AVX-NEXT: vpshufb %xmm6, %xmm0, %xmm7 465; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 466; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] 467; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm4 468; AVX-NEXT: vmovd {{.*#+}} xmm8 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] 469; AVX-NEXT: vpshufb %xmm8, %xmm0, %xmm9 470; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] 471; AVX-NEXT: vpshufb %xmm6, %xmm1, %xmm6 472; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 473; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] 474; AVX-NEXT: vpshufb %xmm8, %xmm1, %xmm1 475; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 476; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 477; AVX-NEXT: vmovd %xmm2, (%rsi) 478; AVX-NEXT: vmovd %xmm3, (%rdx) 479; AVX-NEXT: vmovd %xmm5, (%rcx) 480; AVX-NEXT: vmovd %xmm7, (%r8) 481; AVX-NEXT: vmovd %xmm4, (%r9) 482; AVX-NEXT: vmovd %xmm6, (%r10) 483; AVX-NEXT: vmovd %xmm0, (%rax) 484; AVX-NEXT: retq 485; 486; AVX2-LABEL: load_i8_stride7_vf4: 487; AVX2: # %bb.0: 488; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 489; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 490; AVX2-NEXT: vmovdqa (%rdi), %xmm0 491; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 492; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] 493; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 494; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 495; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u] 496; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 497; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3 498; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9] 499; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm5 500; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 501; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] 502; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10] 503; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm7 504; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 505; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] 506; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm4 507; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] 508; AVX2-NEXT: vpshufb %xmm8, %xmm0, %xmm9 509; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] 510; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm6 511; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 512; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] 513; AVX2-NEXT: vpshufb %xmm8, %xmm1, %xmm1 514; AVX2-NEXT: 
vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 515; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 516; AVX2-NEXT: vmovd %xmm2, (%rsi) 517; AVX2-NEXT: vmovd %xmm3, (%rdx) 518; AVX2-NEXT: vmovd %xmm5, (%rcx) 519; AVX2-NEXT: vmovd %xmm7, (%r8) 520; AVX2-NEXT: vmovd %xmm4, (%r9) 521; AVX2-NEXT: vmovd %xmm6, (%r10) 522; AVX2-NEXT: vmovd %xmm0, (%rax) 523; AVX2-NEXT: retq 524; 525; AVX2-FP-LABEL: load_i8_stride7_vf4: 526; AVX2-FP: # %bb.0: 527; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 528; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10 529; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 530; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1 531; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] 532; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 533; AVX2-FP-NEXT: vpor %xmm2, %xmm3, %xmm2 534; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u] 535; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 536; AVX2-FP-NEXT: vpor %xmm3, %xmm4, %xmm3 537; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9] 538; AVX2-FP-NEXT: vpshufb %xmm4, %xmm0, %xmm5 539; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 540; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] 541; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10] 542; AVX2-FP-NEXT: vpshufb %xmm6, %xmm0, %xmm7 543; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 544; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] 545; AVX2-FP-NEXT: vpshufb %xmm4, %xmm1, %xmm4 546; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] 547; AVX2-FP-NEXT: vpshufb %xmm8, %xmm0, %xmm9 548; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] 549; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm6 550; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 551; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] 552; AVX2-FP-NEXT: vpshufb %xmm8, %xmm1, %xmm1 553; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 554; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 555; AVX2-FP-NEXT: vmovd %xmm2, (%rsi) 556; AVX2-FP-NEXT: vmovd %xmm3, (%rdx) 557; AVX2-FP-NEXT: vmovd %xmm5, (%rcx) 558; AVX2-FP-NEXT: vmovd %xmm7, (%r8) 559; AVX2-FP-NEXT: vmovd %xmm4, (%r9) 560; AVX2-FP-NEXT: vmovd %xmm6, (%r10) 561; AVX2-FP-NEXT: vmovd %xmm0, (%rax) 562; AVX2-FP-NEXT: retq 563; 564; AVX2-FCP-LABEL: load_i8_stride7_vf4: 565; AVX2-FCP: # %bb.0: 566; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 567; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 568; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 569; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 570; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] 571; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 572; AVX2-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 573; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u] 574; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 575; AVX2-FCP-NEXT: vpor %xmm3, 
%xmm4, %xmm3 576; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9] 577; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm5 578; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 579; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] 580; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10] 581; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm7 582; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 583; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] 584; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm4 585; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] 586; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm9 587; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] 588; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm6 589; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 590; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] 591; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm1 592; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 593; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 594; AVX2-FCP-NEXT: vmovd %xmm2, (%rsi) 595; AVX2-FCP-NEXT: vmovd %xmm3, (%rdx) 596; AVX2-FCP-NEXT: vmovd %xmm5, (%rcx) 597; AVX2-FCP-NEXT: vmovd %xmm7, (%r8) 598; AVX2-FCP-NEXT: vmovd %xmm4, (%r9) 599; AVX2-FCP-NEXT: vmovd %xmm6, (%r10) 600; AVX2-FCP-NEXT: vmovd %xmm0, (%rax) 601; AVX2-FCP-NEXT: retq 602; 603; AVX512-LABEL: load_i8_stride7_vf4: 604; AVX512: # %bb.0: 605; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 606; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 607; AVX512-NEXT: vmovdqa (%rdi), %xmm0 608; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 609; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] 610; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 611; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2 612; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u] 613; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 614; AVX512-NEXT: vpor %xmm3, %xmm4, %xmm3 615; AVX512-NEXT: vmovd {{.*#+}} xmm4 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] 616; AVX512-NEXT: vpshufb %xmm4, %xmm0, %xmm5 617; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 618; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] 619; AVX512-NEXT: vmovd {{.*#+}} xmm6 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] 620; AVX512-NEXT: vpshufb %xmm6, %xmm0, %xmm7 621; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 622; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] 623; AVX512-NEXT: vpshufb %xmm4, %xmm1, %xmm4 624; AVX512-NEXT: vmovd {{.*#+}} xmm8 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] 625; AVX512-NEXT: vpshufb %xmm8, %xmm0, %xmm9 626; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] 627; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm6 628; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 629; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = 
xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] 630; AVX512-NEXT: vpshufb %xmm8, %xmm1, %xmm1 631; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 632; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 633; AVX512-NEXT: vmovd %xmm2, (%rsi) 634; AVX512-NEXT: vmovd %xmm3, (%rdx) 635; AVX512-NEXT: vmovd %xmm5, (%rcx) 636; AVX512-NEXT: vmovd %xmm7, (%r8) 637; AVX512-NEXT: vmovd %xmm4, (%r9) 638; AVX512-NEXT: vmovd %xmm6, (%r10) 639; AVX512-NEXT: vmovd %xmm0, (%rax) 640; AVX512-NEXT: retq 641; 642; AVX512-FCP-LABEL: load_i8_stride7_vf4: 643; AVX512-FCP: # %bb.0: 644; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 645; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 646; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 647; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 648; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] 649; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 650; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 651; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u] 652; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 653; AVX512-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 654; AVX512-FCP-NEXT: vmovd {{.*#+}} xmm4 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] 655; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm5 656; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 657; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] 658; AVX512-FCP-NEXT: vmovd {{.*#+}} xmm6 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] 659; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm7 660; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 661; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] 662; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm4 663; AVX512-FCP-NEXT: vmovd {{.*#+}} xmm8 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] 664; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm9 665; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] 666; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm6 667; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 668; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] 669; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm1 670; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 671; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 672; AVX512-FCP-NEXT: vmovd %xmm2, (%rsi) 673; AVX512-FCP-NEXT: vmovd %xmm3, (%rdx) 674; AVX512-FCP-NEXT: vmovd %xmm5, (%rcx) 675; AVX512-FCP-NEXT: vmovd %xmm7, (%r8) 676; AVX512-FCP-NEXT: vmovd %xmm4, (%r9) 677; AVX512-FCP-NEXT: vmovd %xmm6, (%r10) 678; AVX512-FCP-NEXT: vmovd %xmm0, (%rax) 679; AVX512-FCP-NEXT: retq 680; 681; AVX512DQ-LABEL: load_i8_stride7_vf4: 682; AVX512DQ: # %bb.0: 683; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 684; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 685; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 686; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1 687; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] 688; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 689; AVX512DQ-NEXT: vpor %xmm2, %xmm3, 
%xmm2 690; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u] 691; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 692; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3 693; AVX512DQ-NEXT: vmovd {{.*#+}} xmm4 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] 694; AVX512DQ-NEXT: vpshufb %xmm4, %xmm0, %xmm5 695; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 696; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] 697; AVX512DQ-NEXT: vmovd {{.*#+}} xmm6 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] 698; AVX512DQ-NEXT: vpshufb %xmm6, %xmm0, %xmm7 699; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 700; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] 701; AVX512DQ-NEXT: vpshufb %xmm4, %xmm1, %xmm4 702; AVX512DQ-NEXT: vmovd {{.*#+}} xmm8 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] 703; AVX512DQ-NEXT: vpshufb %xmm8, %xmm0, %xmm9 704; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] 705; AVX512DQ-NEXT: vpshufb %xmm6, %xmm1, %xmm6 706; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 707; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] 708; AVX512DQ-NEXT: vpshufb %xmm8, %xmm1, %xmm1 709; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 710; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 711; AVX512DQ-NEXT: vmovd %xmm2, (%rsi) 712; AVX512DQ-NEXT: vmovd %xmm3, (%rdx) 713; AVX512DQ-NEXT: vmovd %xmm5, (%rcx) 714; AVX512DQ-NEXT: vmovd %xmm7, (%r8) 715; AVX512DQ-NEXT: vmovd %xmm4, (%r9) 716; AVX512DQ-NEXT: vmovd %xmm6, (%r10) 717; AVX512DQ-NEXT: vmovd %xmm0, (%rax) 718; AVX512DQ-NEXT: retq 719; 720; AVX512DQ-FCP-LABEL: load_i8_stride7_vf4: 721; AVX512DQ-FCP: # %bb.0: 722; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 723; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 724; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 725; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 726; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] 727; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 728; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 729; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u] 730; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 731; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 732; AVX512DQ-FCP-NEXT: vmovd {{.*#+}} xmm4 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] 733; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm5 734; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 735; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] 736; AVX512DQ-FCP-NEXT: vmovd {{.*#+}} xmm6 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] 737; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm7 738; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 739; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] 740; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm4 741; AVX512DQ-FCP-NEXT: vmovd {{.*#+}} xmm8 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] 742; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm9 743; 
AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] 744; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm6 745; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 746; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] 747; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm1 748; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 749; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 750; AVX512DQ-FCP-NEXT: vmovd %xmm2, (%rsi) 751; AVX512DQ-FCP-NEXT: vmovd %xmm3, (%rdx) 752; AVX512DQ-FCP-NEXT: vmovd %xmm5, (%rcx) 753; AVX512DQ-FCP-NEXT: vmovd %xmm7, (%r8) 754; AVX512DQ-FCP-NEXT: vmovd %xmm4, (%r9) 755; AVX512DQ-FCP-NEXT: vmovd %xmm6, (%r10) 756; AVX512DQ-FCP-NEXT: vmovd %xmm0, (%rax) 757; AVX512DQ-FCP-NEXT: retq 758; 759; AVX512BW-LABEL: load_i8_stride7_vf4: 760; AVX512BW: # %bb.0: 761; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 762; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 763; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 764; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 765; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] 766; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 767; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2 768; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u] 769; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 770; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3 771; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9] 772; AVX512BW-NEXT: vpshufb %xmm4, %xmm0, %xmm5 773; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 774; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] 775; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10] 776; AVX512BW-NEXT: vpshufb %xmm6, %xmm0, %xmm7 777; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 778; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] 779; AVX512BW-NEXT: vpshufb %xmm4, %xmm1, %xmm4 780; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] 781; AVX512BW-NEXT: vpshufb %xmm8, %xmm0, %xmm9 782; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] 783; AVX512BW-NEXT: vpshufb %xmm6, %xmm1, %xmm6 784; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 785; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] 786; AVX512BW-NEXT: vpshufb %xmm8, %xmm1, %xmm1 787; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 788; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 789; AVX512BW-NEXT: vmovd %xmm2, (%rsi) 790; AVX512BW-NEXT: vmovd %xmm3, (%rdx) 791; AVX512BW-NEXT: vmovd %xmm5, (%rcx) 792; AVX512BW-NEXT: vmovd %xmm7, (%r8) 793; AVX512BW-NEXT: vmovd %xmm4, (%r9) 794; AVX512BW-NEXT: vmovd %xmm6, (%r10) 795; AVX512BW-NEXT: vmovd %xmm0, (%rax) 796; AVX512BW-NEXT: retq 797; 798; AVX512BW-FCP-LABEL: load_i8_stride7_vf4: 799; AVX512BW-FCP: # %bb.0: 800; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 801; 
AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 802; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 803; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 804; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] 805; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 806; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 807; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u] 808; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 809; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 810; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9] 811; AVX512BW-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm5 812; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 813; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] 814; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10] 815; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm7 816; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 817; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] 818; AVX512BW-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm4 819; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] 820; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm9 821; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] 822; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm6 823; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 824; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] 825; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm1 826; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 827; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 828; AVX512BW-FCP-NEXT: vmovd %xmm2, (%rsi) 829; AVX512BW-FCP-NEXT: vmovd %xmm3, (%rdx) 830; AVX512BW-FCP-NEXT: vmovd %xmm5, (%rcx) 831; AVX512BW-FCP-NEXT: vmovd %xmm7, (%r8) 832; AVX512BW-FCP-NEXT: vmovd %xmm4, (%r9) 833; AVX512BW-FCP-NEXT: vmovd %xmm6, (%r10) 834; AVX512BW-FCP-NEXT: vmovd %xmm0, (%rax) 835; AVX512BW-FCP-NEXT: retq 836; 837; AVX512DQ-BW-LABEL: load_i8_stride7_vf4: 838; AVX512DQ-BW: # %bb.0: 839; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 840; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 841; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 842; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm1 843; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] 844; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 845; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm3, %xmm2 846; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u] 847; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 848; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm4, %xmm3 849; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9] 850; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm0, %xmm5 851; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 852; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] 853; 
AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10] 854; AVX512DQ-BW-NEXT: vpshufb %xmm6, %xmm0, %xmm7 855; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 856; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] 857; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm1, %xmm4 858; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] 859; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm0, %xmm9 860; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] 861; AVX512DQ-BW-NEXT: vpshufb %xmm6, %xmm1, %xmm6 862; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 863; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] 864; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm1, %xmm1 865; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 866; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 867; AVX512DQ-BW-NEXT: vmovd %xmm2, (%rsi) 868; AVX512DQ-BW-NEXT: vmovd %xmm3, (%rdx) 869; AVX512DQ-BW-NEXT: vmovd %xmm5, (%rcx) 870; AVX512DQ-BW-NEXT: vmovd %xmm7, (%r8) 871; AVX512DQ-BW-NEXT: vmovd %xmm4, (%r9) 872; AVX512DQ-BW-NEXT: vmovd %xmm6, (%r10) 873; AVX512DQ-BW-NEXT: vmovd %xmm0, (%rax) 874; AVX512DQ-BW-NEXT: retq 875; 876; AVX512DQ-BW-FCP-LABEL: load_i8_stride7_vf4: 877; AVX512DQ-BW-FCP: # %bb.0: 878; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 879; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 880; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 881; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 882; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] 883; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 884; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 885; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u] 886; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 887; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 888; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9] 889; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm5 890; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 891; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] 892; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10] 893; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm7 894; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 895; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] 896; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm4 897; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] 898; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm9 899; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] 900; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm6 901; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 902; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = 
xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] 903; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm1 904; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 905; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 906; AVX512DQ-BW-FCP-NEXT: vmovd %xmm2, (%rsi) 907; AVX512DQ-BW-FCP-NEXT: vmovd %xmm3, (%rdx) 908; AVX512DQ-BW-FCP-NEXT: vmovd %xmm5, (%rcx) 909; AVX512DQ-BW-FCP-NEXT: vmovd %xmm7, (%r8) 910; AVX512DQ-BW-FCP-NEXT: vmovd %xmm4, (%r9) 911; AVX512DQ-BW-FCP-NEXT: vmovd %xmm6, (%r10) 912; AVX512DQ-BW-FCP-NEXT: vmovd %xmm0, (%rax) 913; AVX512DQ-BW-FCP-NEXT: retq 914 %wide.vec = load <28 x i8>, ptr %in.vec, align 64 915 %strided.vec0 = shufflevector <28 x i8> %wide.vec, <28 x i8> poison, <4 x i32> <i32 0, i32 7, i32 14, i32 21> 916 %strided.vec1 = shufflevector <28 x i8> %wide.vec, <28 x i8> poison, <4 x i32> <i32 1, i32 8, i32 15, i32 22> 917 %strided.vec2 = shufflevector <28 x i8> %wide.vec, <28 x i8> poison, <4 x i32> <i32 2, i32 9, i32 16, i32 23> 918 %strided.vec3 = shufflevector <28 x i8> %wide.vec, <28 x i8> poison, <4 x i32> <i32 3, i32 10, i32 17, i32 24> 919 %strided.vec4 = shufflevector <28 x i8> %wide.vec, <28 x i8> poison, <4 x i32> <i32 4, i32 11, i32 18, i32 25> 920 %strided.vec5 = shufflevector <28 x i8> %wide.vec, <28 x i8> poison, <4 x i32> <i32 5, i32 12, i32 19, i32 26> 921 %strided.vec6 = shufflevector <28 x i8> %wide.vec, <28 x i8> poison, <4 x i32> <i32 6, i32 13, i32 20, i32 27> 922 store <4 x i8> %strided.vec0, ptr %out.vec0, align 64 923 store <4 x i8> %strided.vec1, ptr %out.vec1, align 64 924 store <4 x i8> %strided.vec2, ptr %out.vec2, align 64 925 store <4 x i8> %strided.vec3, ptr %out.vec3, align 64 926 store <4 x i8> %strided.vec4, ptr %out.vec4, align 64 927 store <4 x i8> %strided.vec5, ptr %out.vec5, align 64 928 store <4 x i8> %strided.vec6, ptr %out.vec6, align 64 929 ret void 930} 931 932define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { 933; SSE-LABEL: load_i8_stride7_vf8: 934; SSE: # %bb.0: 935; SSE-NEXT: movdqa (%rdi), %xmm3 936; SSE-NEXT: movdqa 16(%rdi), %xmm11 937; SSE-NEXT: movdqa 32(%rdi), %xmm6 938; SSE-NEXT: movdqa 48(%rdi), %xmm0 939; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,65535,0,65535] 940; SSE-NEXT: movdqa %xmm3, %xmm1 941; SSE-NEXT: pand %xmm2, %xmm1 942; SSE-NEXT: pandn %xmm11, %xmm2 943; SSE-NEXT: por %xmm1, %xmm2 944; SSE-NEXT: pxor %xmm1, %xmm1 945; SSE-NEXT: movdqa %xmm2, %xmm5 946; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] 947; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,65535,0,65535,0,65535] 948; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 949; SSE-NEXT: pxor %xmm4, %xmm4 950; SSE-NEXT: pand %xmm7, %xmm2 951; SSE-NEXT: pandn %xmm5, %xmm7 952; SSE-NEXT: por %xmm2, %xmm7 953; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,1,3] 954; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] 955; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,1,1] 956; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm2[0,3,2,1,4,5,6,7] 957; SSE-NEXT: packuswb %xmm7, %xmm7 958; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255] 959; 
SSE-NEXT: pand %xmm2, %xmm7 960; SSE-NEXT: movdqa {{.*#+}} xmm9 = [0,65535,65535,65535,65535,65535,65535,65535] 961; SSE-NEXT: movdqa %xmm6, %xmm5 962; SSE-NEXT: pand %xmm9, %xmm5 963; SSE-NEXT: pandn %xmm0, %xmm9 964; SSE-NEXT: por %xmm5, %xmm9 965; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3],xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] 966; SSE-NEXT: movdqa %xmm6, %xmm8 967; SSE-NEXT: movss {{.*#+}} xmm8 = xmm0[0],xmm8[1,2,3] 968; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,65535] 969; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 970; SSE-NEXT: movdqa %xmm11, %xmm10 971; SSE-NEXT: movdqa %xmm11, %xmm1 972; SSE-NEXT: pand %xmm5, %xmm10 973; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,0,65535,65535,65535,65535,65535] 974; SSE-NEXT: movdqa %xmm6, %xmm4 975; SSE-NEXT: pand %xmm12, %xmm4 976; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 977; SSE-NEXT: pandn %xmm0, %xmm12 978; SSE-NEXT: movaps %xmm0, %xmm14 979; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm6[0,0] 980; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm6[2,3] 981; SSE-NEXT: pand %xmm5, %xmm0 982; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 983; SSE-NEXT: pandn %xmm6, %xmm5 984; SSE-NEXT: movdqa %xmm6, %xmm15 985; SSE-NEXT: pxor %xmm0, %xmm0 986; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15] 987; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3] 988; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,6,5,6,7] 989; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] 990; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm6[0,1,2,3,4,4,5,6] 991; SSE-NEXT: packuswb %xmm9, %xmm9 992; SSE-NEXT: movdqa %xmm2, %xmm11 993; SSE-NEXT: movdqa %xmm2, %xmm13 994; SSE-NEXT: pandn %xmm9, %xmm13 995; SSE-NEXT: por %xmm7, %xmm13 996; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,0,65535] 997; SSE-NEXT: movdqa %xmm7, %xmm9 998; SSE-NEXT: movdqa %xmm1, %xmm4 999; SSE-NEXT: pandn %xmm1, %xmm9 1000; SSE-NEXT: movdqa %xmm3, %xmm2 1001; SSE-NEXT: pand %xmm7, %xmm3 1002; SSE-NEXT: por %xmm9, %xmm3 1003; SSE-NEXT: movdqa %xmm3, %xmm9 1004; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3],xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] 1005; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,0,65535] 1006; SSE-NEXT: movdqa %xmm0, %xmm1 1007; SSE-NEXT: pandn %xmm9, %xmm1 1008; SSE-NEXT: pxor %xmm6, %xmm6 1009; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] 1010; SSE-NEXT: pand %xmm0, %xmm3 1011; SSE-NEXT: por %xmm1, %xmm3 1012; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] 1013; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] 1014; SSE-NEXT: movdqa %xmm8, %xmm9 1015; SSE-NEXT: pand %xmm1, %xmm9 1016; SSE-NEXT: pandn %xmm15, %xmm1 1017; SSE-NEXT: por %xmm9, %xmm1 1018; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] 1019; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] 1020; SSE-NEXT: packuswb %xmm1, %xmm1 1021; SSE-NEXT: movdqa %xmm11, %xmm9 1022; 
SSE-NEXT: pandn %xmm1, %xmm9 1023; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,3,2,3] 1024; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] 1025; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] 1026; SSE-NEXT: packuswb %xmm1, %xmm1 1027; SSE-NEXT: pand %xmm11, %xmm1 1028; SSE-NEXT: por %xmm1, %xmm9 1029; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 1030; SSE-NEXT: pandn %xmm2, %xmm1 1031; SSE-NEXT: por %xmm1, %xmm10 1032; SSE-NEXT: movdqa %xmm10, %xmm1 1033; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] 1034; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3],xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] 1035; SSE-NEXT: pand %xmm0, %xmm10 1036; SSE-NEXT: pandn %xmm1, %xmm0 1037; SSE-NEXT: por %xmm10, %xmm0 1038; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 1039; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] 1040; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,3,3] 1041; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] 1042; SSE-NEXT: packuswb %xmm0, %xmm0 1043; SSE-NEXT: pand %xmm11, %xmm0 1044; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,65535] 1045; SSE-NEXT: pand %xmm1, %xmm8 1046; SSE-NEXT: pandn %xmm15, %xmm1 1047; SSE-NEXT: por %xmm8, %xmm1 1048; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] 1049; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] 1050; SSE-NEXT: packuswb %xmm1, %xmm1 1051; SSE-NEXT: movdqa %xmm11, %xmm8 1052; SSE-NEXT: pandn %xmm1, %xmm8 1053; SSE-NEXT: por %xmm0, %xmm8 1054; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,0,65535,65535] 1055; SSE-NEXT: movdqa %xmm4, %xmm1 1056; SSE-NEXT: pand %xmm0, %xmm1 1057; SSE-NEXT: pandn %xmm2, %xmm0 1058; SSE-NEXT: movdqa %xmm2, %xmm10 1059; SSE-NEXT: por %xmm1, %xmm0 1060; SSE-NEXT: movdqa %xmm0, %xmm1 1061; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] 1062; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] 1063; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,0,65535,65535,65535,65535] 1064; SSE-NEXT: pand %xmm3, %xmm0 1065; SSE-NEXT: pandn %xmm1, %xmm3 1066; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 1067; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdi 1068; SSE-NEXT: por %xmm0, %xmm3 1069; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[3,2,1,0,4,5,6,7] 1070; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] 1071; SSE-NEXT: packuswb %xmm0, %xmm0 1072; SSE-NEXT: pand %xmm11, %xmm0 1073; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload 1074; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] 1075; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,7,5,6,7] 1076; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] 1077; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[0,3,2,3,4,5,6,7] 1078; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] 1079; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,7,6] 1080; SSE-NEXT: packuswb %xmm3, %xmm3 1081; SSE-NEXT: pandn %xmm3, %xmm11 1082; SSE-NEXT: por %xmm0, %xmm11 1083; SSE-NEXT: 
movdqa %xmm11, %xmm6 1084; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,65535,0,65535,65535] 1085; SSE-NEXT: movdqa %xmm4, %xmm2 1086; SSE-NEXT: movdqa %xmm4, %xmm3 1087; SSE-NEXT: pand %xmm0, %xmm3 1088; SSE-NEXT: movdqa %xmm10, %xmm11 1089; SSE-NEXT: pandn %xmm10, %xmm0 1090; SSE-NEXT: por %xmm3, %xmm0 1091; SSE-NEXT: movdqa %xmm0, %xmm3 1092; SSE-NEXT: pxor %xmm4, %xmm4 1093; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] 1094; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] 1095; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] 1096; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] 1097; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 1098; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] 1099; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] 1100; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,5] 1101; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,0,65535] 1102; SSE-NEXT: pand %xmm3, %xmm1 1103; SSE-NEXT: pandn %xmm15, %xmm3 1104; SSE-NEXT: por %xmm1, %xmm3 1105; SSE-NEXT: packuswb %xmm3, %xmm0 1106; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,3,2,3] 1107; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,3,2,3] 1108; SSE-NEXT: movdqa %xmm11, %xmm3 1109; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] 1110; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1111; SSE-NEXT: movdqa %xmm1, %xmm0 1112; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] 1113; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] 1114; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 1115; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] 1116; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] 1117; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] 1118; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 1119; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,65535,0] 1120; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3],xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7] 1121; SSE-NEXT: pand %xmm0, %xmm14 1122; SSE-NEXT: pandn %xmm15, %xmm0 1123; SSE-NEXT: por %xmm14, %xmm0 1124; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] 1125; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] 1126; SSE-NEXT: packuswb %xmm0, %xmm1 1127; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,3,2,3] 1128; SSE-NEXT: movdqa %xmm2, %xmm0 1129; SSE-NEXT: pand %xmm7, %xmm0 1130; SSE-NEXT: pandn %xmm3, %xmm7 1131; SSE-NEXT: por %xmm0, %xmm7 1132; SSE-NEXT: movdqa %xmm7, %xmm0 1133; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] 1134; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] 1135; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] 1136; SSE-NEXT: pshufd {{.*#+}} xmm1 
= xmm7[0,1,2,1] 1137; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7] 1138; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1139; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 1140; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] 1141; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535] 1142; SSE-NEXT: pand %xmm1, %xmm5 1143; SSE-NEXT: pandn %xmm15, %xmm1 1144; SSE-NEXT: por %xmm5, %xmm1 1145; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] 1146; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] 1147; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] 1148; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] 1149; SSE-NEXT: packuswb %xmm1, %xmm0 1150; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] 1151; SSE-NEXT: movq %xmm13, (%rsi) 1152; SSE-NEXT: movq %xmm9, (%rdx) 1153; SSE-NEXT: movq %xmm8, (%rcx) 1154; SSE-NEXT: movq %xmm6, (%r8) 1155; SSE-NEXT: movq %xmm10, (%r9) 1156; SSE-NEXT: movq %xmm11, (%rdi) 1157; SSE-NEXT: movq %xmm0, (%rax) 1158; SSE-NEXT: retq 1159; 1160; AVX-LABEL: load_i8_stride7_vf8: 1161; AVX: # %bb.0: 1162; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 1163; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 1164; AVX-NEXT: vmovdqa (%rdi), %xmm0 1165; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 1166; AVX-NEXT: vmovdqa 32(%rdi), %xmm2 1167; AVX-NEXT: vmovdqa 48(%rdi), %xmm3 1168; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[5,12,u,u,u,u,u,u,u,u,u,u,u] 1169; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] 1170; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4 1171; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2] 1172; AVX-NEXT: vbroadcastss {{.*#+}} xmm6 = [0,0,7,14,0,0,7,14,0,0,7,14,0,0,7,14] 1173; AVX-NEXT: vpshufb %xmm6, %xmm5, %xmm5 1174; AVX-NEXT: vmovddup {{.*#+}} xmm7 = [255,255,255,255,255,0,0,0,255,255,255,255,255,0,0,0] 1175; AVX-NEXT: # xmm7 = mem[0,0] 1176; AVX-NEXT: vpblendvb %xmm7, %xmm4, %xmm5, %xmm4 1177; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[6,13,u,u,u,u,u,u,u,u,u,u,u] 1178; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] 1179; AVX-NEXT: vpor %xmm5, %xmm8, %xmm5 1180; AVX-NEXT: vpalignr {{.*#+}} xmm8 = xmm2[4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3] 1181; AVX-NEXT: vpshufb %xmm6, %xmm8, %xmm8 1182; AVX-NEXT: vpblendvb %xmm7, %xmm5, %xmm8, %xmm5 1183; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] 1184; AVX-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u,u,u,u,u] 1185; AVX-NEXT: vpor %xmm8, %xmm9, %xmm8 1186; AVX-NEXT: vpalignr {{.*#+}} xmm9 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] 1187; AVX-NEXT: vpshufb %xmm6, %xmm9, %xmm9 1188; AVX-NEXT: vpblendvb %xmm7, %xmm8, %xmm9, %xmm8 1189; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] 1190; AVX-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm1[1,8,15,u,u,u,u,u,u,u,u,u,u,u] 1191; AVX-NEXT: vpor %xmm9, %xmm10, %xmm9 1192; AVX-NEXT: vpalignr {{.*#+}} xmm10 = xmm2[6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5] 1193; AVX-NEXT: vpshufb %xmm6, %xmm10, %xmm6 1194; AVX-NEXT: vpblendvb %xmm7, %xmm9, %xmm6, %xmm6 1195; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] 1196; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[u,u,u,u,0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] 1197; 
AVX-NEXT: vpor %xmm7, %xmm9, %xmm7 1198; AVX-NEXT: vmovd {{.*#+}} xmm9 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] 1199; AVX-NEXT: vpshufb %xmm9, %xmm0, %xmm10 1200; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 1201; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] 1202; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3],xmm10[4,5,6,7] 1203; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u] 1204; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm2[u,u,u,u,1,8,15],zero,xmm2[u,u,u,u,u,u,u,u] 1205; AVX-NEXT: vpor %xmm10, %xmm11, %xmm10 1206; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 1207; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 1208; AVX-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] 1209; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3],xmm11[4,5,6,7] 1210; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,0,7,u,u,u,u,u,u,u,u,u,u,u,u] 1211; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,2,9,u,u,u,u,u,u,u,u,u,u,u,u] 1212; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] 1213; AVX-NEXT: vpshufb %xmm9, %xmm1, %xmm1 1214; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 1215; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1216; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] 1217; AVX-NEXT: vmovq %xmm4, (%rsi) 1218; AVX-NEXT: vmovq %xmm5, (%rdx) 1219; AVX-NEXT: vmovq %xmm8, (%rcx) 1220; AVX-NEXT: vmovq %xmm6, (%r8) 1221; AVX-NEXT: vmovq %xmm7, (%r9) 1222; AVX-NEXT: vmovq %xmm10, (%r10) 1223; AVX-NEXT: vmovq %xmm0, (%rax) 1224; AVX-NEXT: retq 1225; 1226; AVX2-LABEL: load_i8_stride7_vf8: 1227; AVX2: # %bb.0: 1228; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 1229; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 1230; AVX2-NEXT: vmovdqa (%rdi), %ymm0 1231; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 1232; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] 1233; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 1234; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 1235; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] 1236; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] 1237; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 1238; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] 1239; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 1240; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] 1241; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] 1242; AVX2-NEXT: vpor %xmm4, %xmm3, %xmm3 1243; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] 1244; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm4 1245; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 1246; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] 1247; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] 1248; AVX2-NEXT: vpor %xmm5, %xmm4, %xmm4 1249; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] 1250; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 1251; 
AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] 1252; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] 1253; AVX2-NEXT: vpor %xmm6, %xmm5, %xmm5 1254; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0,0] 1255; AVX2-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm6 1256; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7 1257; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] 1258; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] 1259; AVX2-NEXT: vpor %xmm7, %xmm6, %xmm6 1260; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] 1261; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm8 1262; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] 1263; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] 1264; AVX2-NEXT: vpor %xmm7, %xmm8, %xmm7 1265; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0] 1266; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm0 1267; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1268; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] 1269; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] 1270; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1271; AVX2-NEXT: vmovq %xmm2, (%rsi) 1272; AVX2-NEXT: vmovq %xmm3, (%rdx) 1273; AVX2-NEXT: vmovq %xmm4, (%rcx) 1274; AVX2-NEXT: vmovq %xmm5, (%r8) 1275; AVX2-NEXT: vmovq %xmm6, (%r9) 1276; AVX2-NEXT: vmovq %xmm7, (%r10) 1277; AVX2-NEXT: vmovq %xmm0, (%rax) 1278; AVX2-NEXT: vzeroupper 1279; AVX2-NEXT: retq 1280; 1281; AVX2-FP-LABEL: load_i8_stride7_vf8: 1282; AVX2-FP: # %bb.0: 1283; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1284; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10 1285; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0 1286; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1 1287; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] 1288; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 1289; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 1290; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] 1291; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] 1292; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2 1293; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] 1294; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4 1295; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] 1296; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] 1297; AVX2-FP-NEXT: vpor %xmm4, %xmm3, %xmm3 1298; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] 1299; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm4 1300; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm5 1301; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] 1302; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] 1303; AVX2-FP-NEXT: vpor %xmm5, %xmm4, %xmm4 1304; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = 
ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] 1305; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm6 1306; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] 1307; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] 1308; AVX2-FP-NEXT: vpor %xmm6, %xmm5, %xmm5 1309; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0,0] 1310; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm6 1311; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm7 1312; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] 1313; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] 1314; AVX2-FP-NEXT: vpor %xmm7, %xmm6, %xmm6 1315; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] 1316; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm8 1317; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] 1318; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] 1319; AVX2-FP-NEXT: vpor %xmm7, %xmm8, %xmm7 1320; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0] 1321; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm0 1322; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1 1323; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] 1324; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] 1325; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1326; AVX2-FP-NEXT: vmovq %xmm2, (%rsi) 1327; AVX2-FP-NEXT: vmovq %xmm3, (%rdx) 1328; AVX2-FP-NEXT: vmovq %xmm4, (%rcx) 1329; AVX2-FP-NEXT: vmovq %xmm5, (%r8) 1330; AVX2-FP-NEXT: vmovq %xmm6, (%r9) 1331; AVX2-FP-NEXT: vmovq %xmm7, (%r10) 1332; AVX2-FP-NEXT: vmovq %xmm0, (%rax) 1333; AVX2-FP-NEXT: vzeroupper 1334; AVX2-FP-NEXT: retq 1335; 1336; AVX2-FCP-LABEL: load_i8_stride7_vf8: 1337; AVX2-FCP: # %bb.0: 1338; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1339; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 1340; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0 1341; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 1342; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] 1343; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 1344; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 1345; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] 1346; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] 1347; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 1348; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] 1349; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 1350; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] 1351; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] 1352; AVX2-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 1353; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] 1354; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm4 1355; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 1356; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = 
zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] 1357; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] 1358; AVX2-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 1359; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] 1360; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 1361; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] 1362; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] 1363; AVX2-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 1364; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0,0] 1365; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm6 1366; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 1367; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] 1368; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] 1369; AVX2-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 1370; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] 1371; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 1372; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] 1373; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] 1374; AVX2-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 1375; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0] 1376; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm0 1377; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 1378; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] 1379; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] 1380; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1381; AVX2-FCP-NEXT: vmovq %xmm2, (%rsi) 1382; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx) 1383; AVX2-FCP-NEXT: vmovq %xmm4, (%rcx) 1384; AVX2-FCP-NEXT: vmovq %xmm5, (%r8) 1385; AVX2-FCP-NEXT: vmovq %xmm6, (%r9) 1386; AVX2-FCP-NEXT: vmovq %xmm7, (%r10) 1387; AVX2-FCP-NEXT: vmovq %xmm0, (%rax) 1388; AVX2-FCP-NEXT: vzeroupper 1389; AVX2-FCP-NEXT: retq 1390; 1391; AVX512-LABEL: load_i8_stride7_vf8: 1392; AVX512: # %bb.0: 1393; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 1394; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 1395; AVX512-NEXT: vmovdqa (%rdi), %ymm0 1396; AVX512-NEXT: vmovdqa 32(%rdi), %ymm1 1397; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535,65535] 1398; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm1 ^ (ymm2 & (ymm0 ^ ymm1)) 1399; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 1400; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] 1401; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] 1402; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2 1403; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] 1404; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4 1405; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] 1406; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] 1407; AVX512-NEXT: vpor %xmm4, %xmm3, %xmm3 1408; 
AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535] 1409; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1)) 1410; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm5 1411; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] 1412; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] 1413; AVX512-NEXT: vpor %xmm5, %xmm4, %xmm4 1414; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] 1415; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6 1416; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] 1417; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] 1418; AVX512-NEXT: vpor %xmm6, %xmm5, %xmm5 1419; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] 1420; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm0 ^ (ymm6 & (ymm1 ^ ymm0)) 1421; AVX512-NEXT: vextracti128 $1, %ymm6, %xmm7 1422; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] 1423; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] 1424; AVX512-NEXT: vpor %xmm7, %xmm6, %xmm6 1425; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] 1426; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm8 1427; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] 1428; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] 1429; AVX512-NEXT: vpor %xmm7, %xmm8, %xmm7 1430; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) 1431; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 1432; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] 1433; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] 1434; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1435; AVX512-NEXT: vmovq %xmm2, (%rsi) 1436; AVX512-NEXT: vmovq %xmm3, (%rdx) 1437; AVX512-NEXT: vmovq %xmm4, (%rcx) 1438; AVX512-NEXT: vmovq %xmm5, (%r8) 1439; AVX512-NEXT: vmovq %xmm6, (%r9) 1440; AVX512-NEXT: vmovq %xmm7, (%r10) 1441; AVX512-NEXT: vmovq %xmm0, (%rax) 1442; AVX512-NEXT: vzeroupper 1443; AVX512-NEXT: retq 1444; 1445; AVX512-FCP-LABEL: load_i8_stride7_vf8: 1446; AVX512-FCP: # %bb.0: 1447; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1448; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 1449; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0 1450; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 1451; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535,65535] 1452; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm1 ^ (ymm2 & (ymm0 ^ ymm1)) 1453; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 1454; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] 1455; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] 1456; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 1457; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] 1458; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 
1459; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] 1460; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] 1461; AVX512-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 1462; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535] 1463; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1)) 1464; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 1465; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] 1466; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] 1467; AVX512-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 1468; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] 1469; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 1470; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] 1471; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] 1472; AVX512-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 1473; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] 1474; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm0 ^ (ymm6 & (ymm1 ^ ymm0)) 1475; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 1476; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] 1477; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] 1478; AVX512-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 1479; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] 1480; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 1481; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] 1482; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] 1483; AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 1484; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) 1485; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 1486; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] 1487; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] 1488; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1489; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi) 1490; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx) 1491; AVX512-FCP-NEXT: vmovq %xmm4, (%rcx) 1492; AVX512-FCP-NEXT: vmovq %xmm5, (%r8) 1493; AVX512-FCP-NEXT: vmovq %xmm6, (%r9) 1494; AVX512-FCP-NEXT: vmovq %xmm7, (%r10) 1495; AVX512-FCP-NEXT: vmovq %xmm0, (%rax) 1496; AVX512-FCP-NEXT: vzeroupper 1497; AVX512-FCP-NEXT: retq 1498; 1499; AVX512DQ-LABEL: load_i8_stride7_vf8: 1500; AVX512DQ: # %bb.0: 1501; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 1502; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 1503; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 1504; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1 1505; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535,65535] 1506; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm1 ^ (ymm2 & (ymm0 ^ ymm1)) 1507; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 1508; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = 
zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] 1509; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] 1510; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 1511; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] 1512; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm4 1513; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] 1514; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] 1515; AVX512DQ-NEXT: vpor %xmm4, %xmm3, %xmm3 1516; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535] 1517; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1)) 1518; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm5 1519; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] 1520; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] 1521; AVX512DQ-NEXT: vpor %xmm5, %xmm4, %xmm4 1522; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] 1523; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6 1524; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] 1525; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] 1526; AVX512DQ-NEXT: vpor %xmm6, %xmm5, %xmm5 1527; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] 1528; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm0 ^ (ymm6 & (ymm1 ^ ymm0)) 1529; AVX512DQ-NEXT: vextracti128 $1, %ymm6, %xmm7 1530; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] 1531; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] 1532; AVX512DQ-NEXT: vpor %xmm7, %xmm6, %xmm6 1533; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] 1534; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm8 1535; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] 1536; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] 1537; AVX512DQ-NEXT: vpor %xmm7, %xmm8, %xmm7 1538; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) 1539; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 1540; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] 1541; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] 1542; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1543; AVX512DQ-NEXT: vmovq %xmm2, (%rsi) 1544; AVX512DQ-NEXT: vmovq %xmm3, (%rdx) 1545; AVX512DQ-NEXT: vmovq %xmm4, (%rcx) 1546; AVX512DQ-NEXT: vmovq %xmm5, (%r8) 1547; AVX512DQ-NEXT: vmovq %xmm6, (%r9) 1548; AVX512DQ-NEXT: vmovq %xmm7, (%r10) 1549; AVX512DQ-NEXT: vmovq %xmm0, (%rax) 1550; AVX512DQ-NEXT: vzeroupper 1551; AVX512DQ-NEXT: retq 1552; 1553; AVX512DQ-FCP-LABEL: load_i8_stride7_vf8: 1554; AVX512DQ-FCP: # %bb.0: 1555; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1556; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 1557; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0 1558; AVX512DQ-FCP-NEXT: 
vmovdqa 32(%rdi), %ymm1 1559; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535,65535] 1560; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm1 ^ (ymm2 & (ymm0 ^ ymm1)) 1561; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 1562; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] 1563; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] 1564; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 1565; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15] 1566; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 1567; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] 1568; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] 1569; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 1570; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535] 1571; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1)) 1572; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 1573; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] 1574; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] 1575; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 1576; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7] 1577; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 1578; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] 1579; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] 1580; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 1581; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] 1582; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm0 ^ (ymm6 & (ymm1 ^ ymm0)) 1583; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 1584; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] 1585; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] 1586; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 1587; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15] 1588; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 1589; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] 1590; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] 1591; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 1592; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) 1593; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 1594; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] 1595; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] 1596; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1597; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi) 1598; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx) 1599; AVX512DQ-FCP-NEXT: vmovq 
%xmm4, (%rcx) 1600; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r8) 1601; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r9) 1602; AVX512DQ-FCP-NEXT: vmovq %xmm7, (%r10) 1603; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rax) 1604; AVX512DQ-FCP-NEXT: vzeroupper 1605; AVX512DQ-FCP-NEXT: retq 1606; 1607; AVX512BW-LABEL: load_i8_stride7_vf8: 1608; AVX512BW: # %bb.0: 1609; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 1610; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 1611; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 1612; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm0 1613; AVX512BW-NEXT: movw $290, %di # imm = 0x122 1614; AVX512BW-NEXT: kmovd %edi, %k1 1615; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm2 {%k1} 1616; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 1617; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] 1618; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] 1619; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 1620; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15] 1621; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm4 1622; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] 1623; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] 1624; AVX512BW-NEXT: vpor %xmm4, %xmm3, %xmm3 1625; AVX512BW-NEXT: movw $580, %di # imm = 0x244 1626; AVX512BW-NEXT: kmovd %edi, %k1 1627; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k1} 1628; AVX512BW-NEXT: vextracti128 $1, %ymm4, %xmm5 1629; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] 1630; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] 1631; AVX512BW-NEXT: vpor %xmm5, %xmm4, %xmm4 1632; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7] 1633; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm6 1634; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] 1635; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] 1636; AVX512BW-NEXT: vpor %xmm6, %xmm5, %xmm5 1637; AVX512BW-NEXT: movw $4644, %di # imm = 0x1224 1638; AVX512BW-NEXT: kmovd %edi, %k1 1639; AVX512BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm6 {%k1} 1640; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm7 1641; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] 1642; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] 1643; AVX512BW-NEXT: vpor %xmm7, %xmm6, %xmm6 1644; AVX512BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15] 1645; AVX512BW-NEXT: vextracti128 $1, %ymm7, %xmm8 1646; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] 1647; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] 1648; AVX512BW-NEXT: vpor %xmm7, %xmm8, %xmm7 1649; AVX512BW-NEXT: movw $9288, %di # imm = 0x2448 1650; AVX512BW-NEXT: kmovd %edi, %k1 1651; AVX512BW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} 1652; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 1653; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] 1654; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] 1655; 
AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1656; AVX512BW-NEXT: vmovq %xmm2, (%rsi) 1657; AVX512BW-NEXT: vmovq %xmm3, (%rdx) 1658; AVX512BW-NEXT: vmovq %xmm4, (%rcx) 1659; AVX512BW-NEXT: vmovq %xmm5, (%r8) 1660; AVX512BW-NEXT: vmovq %xmm6, (%r9) 1661; AVX512BW-NEXT: vmovq %xmm7, (%r10) 1662; AVX512BW-NEXT: vmovq %xmm0, (%rax) 1663; AVX512BW-NEXT: vzeroupper 1664; AVX512BW-NEXT: retq 1665; 1666; AVX512BW-FCP-LABEL: load_i8_stride7_vf8: 1667; AVX512BW-FCP: # %bb.0: 1668; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1669; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 1670; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 1671; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm0 1672; AVX512BW-FCP-NEXT: movw $290, %di # imm = 0x122 1673; AVX512BW-FCP-NEXT: kmovd %edi, %k1 1674; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm2 {%k1} 1675; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 1676; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] 1677; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] 1678; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 1679; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15] 1680; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 1681; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] 1682; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] 1683; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 1684; AVX512BW-FCP-NEXT: movw $580, %di # imm = 0x244 1685; AVX512BW-FCP-NEXT: kmovd %edi, %k1 1686; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k1} 1687; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 1688; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] 1689; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] 1690; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 1691; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7] 1692; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 1693; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] 1694; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] 1695; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 1696; AVX512BW-FCP-NEXT: movw $4644, %di # imm = 0x1224 1697; AVX512BW-FCP-NEXT: kmovd %edi, %k1 1698; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm6 {%k1} 1699; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 1700; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] 1701; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] 1702; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 1703; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15] 1704; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 1705; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] 1706; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] 1707; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 1708; 
AVX512BW-FCP-NEXT: movw $9288, %di # imm = 0x2448 1709; AVX512BW-FCP-NEXT: kmovd %edi, %k1 1710; AVX512BW-FCP-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} 1711; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 1712; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] 1713; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] 1714; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1715; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi) 1716; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx) 1717; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rcx) 1718; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r8) 1719; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r9) 1720; AVX512BW-FCP-NEXT: vmovq %xmm7, (%r10) 1721; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rax) 1722; AVX512BW-FCP-NEXT: vzeroupper 1723; AVX512BW-FCP-NEXT: retq 1724; 1725; AVX512DQ-BW-LABEL: load_i8_stride7_vf8: 1726; AVX512DQ-BW: # %bb.0: 1727; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 1728; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 1729; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1 1730; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm0 1731; AVX512DQ-BW-NEXT: movw $290, %di # imm = 0x122 1732; AVX512DQ-BW-NEXT: kmovd %edi, %k1 1733; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm2 {%k1} 1734; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3 1735; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] 1736; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] 1737; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 1738; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15] 1739; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm4 1740; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] 1741; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] 1742; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm3, %xmm3 1743; AVX512DQ-BW-NEXT: movw $580, %di # imm = 0x244 1744; AVX512DQ-BW-NEXT: kmovd %edi, %k1 1745; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k1} 1746; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm4, %xmm5 1747; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] 1748; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] 1749; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm4, %xmm4 1750; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7] 1751; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm5, %xmm6 1752; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] 1753; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] 1754; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm5, %xmm5 1755; AVX512DQ-BW-NEXT: movw $4644, %di # imm = 0x1224 1756; AVX512DQ-BW-NEXT: kmovd %edi, %k1 1757; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm6 {%k1} 1758; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm6, %xmm7 1759; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] 1760; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u] 1761; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm6, %xmm6 1762; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm7 = 
ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15] 1763; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm7, %xmm8 1764; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] 1765; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] 1766; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm8, %xmm7 1767; AVX512DQ-BW-NEXT: movw $9288, %di # imm = 0x2448 1768; AVX512DQ-BW-NEXT: kmovd %edi, %k1 1769; AVX512DQ-BW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} 1770; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm1 1771; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] 1772; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u] 1773; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1774; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi) 1775; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx) 1776; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rcx) 1777; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r8) 1778; AVX512DQ-BW-NEXT: vmovq %xmm6, (%r9) 1779; AVX512DQ-BW-NEXT: vmovq %xmm7, (%r10) 1780; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rax) 1781; AVX512DQ-BW-NEXT: vzeroupper 1782; AVX512DQ-BW-NEXT: retq 1783; 1784; AVX512DQ-BW-FCP-LABEL: load_i8_stride7_vf8: 1785; AVX512DQ-BW-FCP: # %bb.0: 1786; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1787; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 1788; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 1789; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm0 1790; AVX512DQ-BW-FCP-NEXT: movw $290, %di # imm = 0x122 1791; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 1792; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm2 {%k1} 1793; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 1794; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] 1795; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u] 1796; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 1797; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15] 1798; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 1799; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] 1800; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] 1801; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 1802; AVX512DQ-BW-FCP-NEXT: movw $580, %di # imm = 0x244 1803; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 1804; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k1} 1805; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 1806; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] 1807; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u] 1808; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 1809; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7] 1810; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 1811; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] 1812; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] 1813; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 1814; AVX512DQ-BW-FCP-NEXT: movw $4644, %di # imm 
= 0x1224
; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1
; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm6 {%k1}
; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6
; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15]
; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
; AVX512DQ-BW-FCP-NEXT: movw $9288, %di # imm = 0x2448
; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1}
; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rcx)
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r8)
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r9)
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm7, (%r10)
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
  %wide.vec = load <56 x i8>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <56 x i8> %wide.vec, <56 x i8> poison, <8 x i32> <i32 0, i32 7, i32 14, i32 21, i32 28, i32 35, i32 42, i32 49>
  %strided.vec1 = shufflevector <56 x i8> %wide.vec, <56 x i8> poison, <8 x i32> <i32 1, i32 8, i32 15, i32 22, i32 29, i32 36, i32 43, i32 50>
  %strided.vec2 = shufflevector <56 x i8> %wide.vec, <56 x i8> poison, <8 x i32> <i32 2, i32 9, i32 16, i32 23, i32 30, i32 37, i32 44, i32 51>
  %strided.vec3 = shufflevector <56 x i8> %wide.vec, <56 x i8> poison, <8 x i32> <i32 3, i32 10, i32 17, i32 24, i32 31, i32 38, i32 45, i32 52>
  %strided.vec4 = shufflevector <56 x i8> %wide.vec, <56 x i8> poison, <8 x i32> <i32 4, i32 11, i32 18, i32 25, i32 32, i32 39, i32 46, i32 53>
  %strided.vec5 = shufflevector <56 x i8> %wide.vec, <56 x i8> poison, <8 x i32> <i32 5, i32 12, i32 19, i32 26, i32 33, i32 40, i32 47, i32 54>
  %strided.vec6 = shufflevector <56 x i8> %wide.vec, <56 x i8> poison, <8 x i32> <i32 6, i32 13, i32 20, i32 27, i32 34, i32 41, i32 48, i32 55>
  store <8 x i8> %strided.vec0, ptr %out.vec0, align 64
  store <8 x i8> %strided.vec1, ptr %out.vec1, align 64
  store <8 x i8> %strided.vec2, ptr %out.vec2, align 64
  store <8 x i8> %strided.vec3, ptr %out.vec3, align 64
  store <8 x i8> %strided.vec4, ptr %out.vec4, align 64
  store <8 x i8> %strided.vec5, ptr %out.vec5, align 64
  store <8 x i8> %strided.vec6, ptr %out.vec6, align 64
  ret void
}

define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
1861; SSE-LABEL: load_i8_stride7_vf16: 1862; SSE: # %bb.0: 1863; SSE-NEXT: subq $168, %rsp 1864; SSE-NEXT: movdqa 96(%rdi), %xmm15 1865; SSE-NEXT: movdqa 80(%rdi), %xmm4 1866; SSE-NEXT: movdqa 64(%rdi), %xmm7 1867; SSE-NEXT: movdqa (%rdi), %xmm6 1868; SSE-NEXT: movdqa 16(%rdi), %xmm9 1869; SSE-NEXT: movdqa 32(%rdi), %xmm12 1870; SSE-NEXT: movdqa 48(%rdi), %xmm8 1871; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,0,65535,65535] 1872; SSE-NEXT: movdqa %xmm2, %xmm0 1873; SSE-NEXT: pandn %xmm12, %xmm0 1874; SSE-NEXT: movdqa %xmm8, %xmm1 1875; SSE-NEXT: pand %xmm2, %xmm1 1876; SSE-NEXT: por %xmm0, %xmm1 1877; SSE-NEXT: pxor %xmm13, %xmm13 1878; SSE-NEXT: movdqa %xmm1, %xmm0 1879; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15] 1880; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3],xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] 1881; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 1882; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] 1883; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] 1884; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,6] 1885; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] 1886; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] 1887; SSE-NEXT: packuswb %xmm0, %xmm1 1888; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] 1889; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,65535,65535,65535,0,65535] 1890; SSE-NEXT: movdqa %xmm10, %xmm0 1891; SSE-NEXT: pandn %xmm9, %xmm0 1892; SSE-NEXT: movdqa %xmm6, %xmm3 1893; SSE-NEXT: movdqa %xmm6, %xmm11 1894; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1895; SSE-NEXT: pand %xmm10, %xmm3 1896; SSE-NEXT: por %xmm0, %xmm3 1897; SSE-NEXT: movdqa %xmm3, %xmm0 1898; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15] 1899; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,0,65535,0,65535] 1900; SSE-NEXT: movdqa %xmm14, %xmm6 1901; SSE-NEXT: pandn %xmm0, %xmm6 1902; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3],xmm3[4],xmm13[4],xmm3[5],xmm13[5],xmm3[6],xmm13[6],xmm3[7],xmm13[7] 1903; SSE-NEXT: pand %xmm14, %xmm3 1904; SSE-NEXT: por %xmm6, %xmm3 1905; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,1,3] 1906; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] 1907; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,1] 1908; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] 1909; SSE-NEXT: packuswb %xmm0, %xmm0 1910; SSE-NEXT: pand %xmm2, %xmm0 1911; SSE-NEXT: pandn %xmm1, %xmm2 1912; SSE-NEXT: por %xmm2, %xmm0 1913; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,65535,65535,0,65535] 1914; SSE-NEXT: movdqa %xmm3, %xmm1 1915; SSE-NEXT: pandn %xmm7, %xmm1 1916; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1917; SSE-NEXT: movdqa %xmm4, %xmm2 1918; SSE-NEXT: movdqa %xmm4, %xmm5 1919; SSE-NEXT: pand %xmm3, %xmm2 1920; SSE-NEXT: movdqa %xmm3, %xmm13 1921; SSE-NEXT: por %xmm1, %xmm2 1922; SSE-NEXT: movdqa %xmm2, %xmm1 1923; SSE-NEXT: pxor %xmm6, %xmm6 1924; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] 1925; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] 1926; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] 1927; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] 1928; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] 1929; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 1930; SSE-NEXT: movdqa %xmm15, %xmm2 1931; SSE-NEXT: movdqa %xmm15, %xmm3 1932; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] 1933; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1934; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] 1935; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1936; SSE-NEXT: pxor %xmm15, %xmm15 1937; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] 1938; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] 1939; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] 1940; SSE-NEXT: packuswb %xmm2, %xmm2 1941; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,65535,65535,0] 1942; SSE-NEXT: movdqa %xmm4, %xmm3 1943; SSE-NEXT: pandn %xmm2, %xmm3 1944; SSE-NEXT: packuswb %xmm1, %xmm1 1945; SSE-NEXT: pand %xmm4, %xmm1 1946; SSE-NEXT: por %xmm1, %xmm3 1947; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,0,0] 1948; SSE-NEXT: pand %xmm1, %xmm0 1949; SSE-NEXT: pandn %xmm3, %xmm1 1950; SSE-NEXT: por %xmm0, %xmm1 1951; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1952; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,0,65535,65535] 1953; SSE-NEXT: movdqa %xmm2, %xmm0 1954; SSE-NEXT: pandn %xmm12, %xmm0 1955; SSE-NEXT: movdqa %xmm8, %xmm1 1956; SSE-NEXT: pand %xmm2, %xmm1 1957; SSE-NEXT: por %xmm0, %xmm1 1958; SSE-NEXT: movdqa %xmm1, %xmm0 1959; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] 1960; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] 1961; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] 1962; SSE-NEXT: pand %xmm2, %xmm1 1963; SSE-NEXT: pandn %xmm0, %xmm2 1964; SSE-NEXT: por %xmm1, %xmm2 1965; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,2,1] 1966; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] 1967; SSE-NEXT: psrld $16, %xmm0 1968; SSE-NEXT: packuswb %xmm0, %xmm1 1969; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255] 1970; SSE-NEXT: movdqa %xmm4, %xmm0 1971; SSE-NEXT: pandn %xmm1, %xmm0 1972; SSE-NEXT: movdqa %xmm13, %xmm1 1973; SSE-NEXT: pandn %xmm9, %xmm1 1974; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1975; SSE-NEXT: movdqa %xmm11, %xmm2 1976; SSE-NEXT: pand %xmm13, %xmm2 1977; SSE-NEXT: movdqa %xmm13, %xmm11 1978; SSE-NEXT: por %xmm1, %xmm2 1979; SSE-NEXT: movdqa %xmm2, %xmm1 1980; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] 1981; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,65535,65535,0,65535] 1982; SSE-NEXT: movdqa %xmm6, %xmm3 1983; SSE-NEXT: pandn %xmm1, %xmm3 1984; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] 1985; SSE-NEXT: pand %xmm6, %xmm2 1986; SSE-NEXT: por %xmm3, %xmm2 1987; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,3,2,3] 1988; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] 1989; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm1[0,1,2,3,5,5,5,5] 1990; SSE-NEXT: packuswb %xmm13, %xmm13 1991; SSE-NEXT: pand %xmm4, %xmm13 1992; SSE-NEXT: por %xmm0, %xmm13 1993; SSE-NEXT: movdqa %xmm10, %xmm0 1994; SSE-NEXT: pandn %xmm5, %xmm0 1995; SSE-NEXT: movdqa %xmm5, %xmm6 1996; SSE-NEXT: movdqa %xmm7, %xmm1 1997; SSE-NEXT: pand %xmm10, %xmm1 1998; SSE-NEXT: por %xmm0, %xmm1 1999; SSE-NEXT: movdqa %xmm1, %xmm0 2000; SSE-NEXT: pxor %xmm2, %xmm2 2001; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] 2002; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 2003; SSE-NEXT: pand %xmm14, %xmm1 2004; SSE-NEXT: pandn %xmm0, %xmm14 2005; SSE-NEXT: por %xmm1, %xmm14 2006; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2007; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2008; SSE-NEXT: movdqa %xmm12, %xmm0 2009; SSE-NEXT: pand %xmm10, %xmm0 2010; SSE-NEXT: pandn %xmm8, %xmm10 2011; SSE-NEXT: por %xmm0, %xmm10 2012; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,0,65535,65535,0,65535,65535,65535] 2013; SSE-NEXT: movdqa %xmm9, %xmm7 2014; SSE-NEXT: pand %xmm14, %xmm7 2015; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2016; SSE-NEXT: movdqa %xmm5, %xmm15 2017; SSE-NEXT: pand %xmm14, %xmm15 2018; SSE-NEXT: movdqa %xmm11, %xmm3 2019; SSE-NEXT: pandn %xmm8, %xmm3 2020; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2021; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm8[0,2,2,3] 2022; SSE-NEXT: movdqa %xmm8, %xmm4 2023; SSE-NEXT: pand %xmm14, %xmm8 2024; SSE-NEXT: movdqa %xmm14, %xmm9 2025; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2026; SSE-NEXT: pandn %xmm12, %xmm14 2027; SSE-NEXT: por %xmm8, %xmm14 2028; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2029; SSE-NEXT: movdqa %xmm0, %xmm5 2030; SSE-NEXT: pslld $16, %xmm5 2031; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 2032; SSE-NEXT: movdqa %xmm8, %xmm3 2033; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 2034; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] 2035; SSE-NEXT: movdqa %xmm8, %xmm1 2036; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 2037; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2038; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2039; SSE-NEXT: pxor %xmm1, %xmm1 2040; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] 2041; 
SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,7,5,6,7] 2042; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] 2043; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm2[0,1,2,3,6,4,6,5] 2044; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,65535,0,65535] 2045; SSE-NEXT: pand %xmm12, %xmm10 2046; SSE-NEXT: movdqa %xmm8, %xmm2 2047; SSE-NEXT: pand %xmm12, %xmm2 2048; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2049; SSE-NEXT: movdqa %xmm12, (%rsp) # 16-byte Spill 2050; SSE-NEXT: pandn %xmm0, %xmm12 2051; SSE-NEXT: movdqa %xmm0, %xmm2 2052; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2053; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3],xmm14[4],xmm1[4],xmm14[5],xmm1[5],xmm14[6],xmm1[6],xmm14[7],xmm1[7] 2054; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] 2055; SSE-NEXT: pand %xmm0, %xmm14 2056; SSE-NEXT: pand %xmm0, %xmm2 2057; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2058; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2059; SSE-NEXT: pandn %xmm8, %xmm0 2060; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2061; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] 2062; SSE-NEXT: packuswb %xmm8, %xmm5 2063; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,65535,0] 2064; SSE-NEXT: movdqa %xmm0, %xmm8 2065; SSE-NEXT: pandn %xmm5, %xmm8 2066; SSE-NEXT: pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 2067; SSE-NEXT: # xmm5 = mem[0,3,2,3] 2068; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,0,3,2,4,5,6,7] 2069; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,4,6,7] 2070; SSE-NEXT: packuswb %xmm5, %xmm5 2071; SSE-NEXT: pand %xmm0, %xmm5 2072; SSE-NEXT: por %xmm5, %xmm8 2073; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] 2074; SSE-NEXT: movdqa %xmm5, %xmm0 2075; SSE-NEXT: pandn %xmm8, %xmm0 2076; SSE-NEXT: pand %xmm5, %xmm13 2077; SSE-NEXT: por %xmm13, %xmm0 2078; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2079; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,0,65535] 2080; SSE-NEXT: movdqa %xmm2, %xmm8 2081; SSE-NEXT: pandn %xmm6, %xmm8 2082; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2083; SSE-NEXT: pand %xmm2, %xmm0 2084; SSE-NEXT: por %xmm8, %xmm0 2085; SSE-NEXT: movdqa %xmm0, %xmm8 2086; SSE-NEXT: pxor %xmm6, %xmm6 2087; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] 2088; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,65535,65535,0,65535] 2089; SSE-NEXT: movdqa %xmm13, %xmm1 2090; SSE-NEXT: pandn %xmm8, %xmm1 2091; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] 2092; SSE-NEXT: pxor %xmm8, %xmm8 2093; SSE-NEXT: pand %xmm13, %xmm0 2094; SSE-NEXT: por %xmm1, %xmm0 2095; SSE-NEXT: packuswb %xmm3, %xmm1 2096; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,65535,65535,0] 2097; SSE-NEXT: movdqa %xmm6, %xmm3 2098; SSE-NEXT: pandn %xmm1, %xmm3 2099; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] 2100; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] 2101; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] 2102; SSE-NEXT: packuswb %xmm0, %xmm0 2103; 
SSE-NEXT: pand %xmm6, %xmm0 2104; SSE-NEXT: por %xmm0, %xmm3 2105; SSE-NEXT: movdqa %xmm5, %xmm0 2106; SSE-NEXT: pandn %xmm3, %xmm0 2107; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 2108; SSE-NEXT: pandn %xmm6, %xmm9 2109; SSE-NEXT: por %xmm9, %xmm7 2110; SSE-NEXT: movdqa %xmm7, %xmm1 2111; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] 2112; SSE-NEXT: movdqa %xmm13, %xmm3 2113; SSE-NEXT: pandn %xmm1, %xmm3 2114; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] 2115; SSE-NEXT: pand %xmm13, %xmm7 2116; SSE-NEXT: por %xmm3, %xmm7 2117; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 2118; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,3,2,3] 2119; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1] 2120; SSE-NEXT: movdqa %xmm11, %xmm1 2121; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] 2122; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] 2123; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,0,65535,65535,65535] 2124; SSE-NEXT: pand %xmm3, %xmm11 2125; SSE-NEXT: pandn %xmm1, %xmm3 2126; SSE-NEXT: por %xmm11, %xmm3 2127; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] 2128; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] 2129; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] 2130; SSE-NEXT: packuswb %xmm1, %xmm3 2131; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255] 2132; SSE-NEXT: movdqa %xmm13, %xmm8 2133; SSE-NEXT: pandn %xmm3, %xmm8 2134; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,2,1,3] 2135; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] 2136; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,3,3] 2137; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] 2138; SSE-NEXT: packuswb %xmm1, %xmm1 2139; SSE-NEXT: pand %xmm13, %xmm1 2140; SSE-NEXT: por %xmm1, %xmm8 2141; SSE-NEXT: pand %xmm5, %xmm8 2142; SSE-NEXT: por %xmm0, %xmm8 2143; SSE-NEXT: movdqa %xmm2, %xmm0 2144; SSE-NEXT: pandn %xmm9, %xmm0 2145; SSE-NEXT: pand %xmm2, %xmm4 2146; SSE-NEXT: por %xmm0, %xmm4 2147; SSE-NEXT: movdqa %xmm4, %xmm0 2148; SSE-NEXT: pxor %xmm1, %xmm1 2149; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 2150; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] 2151; SSE-NEXT: pxor %xmm2, %xmm2 2152; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] 2153; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,3,2,3,4,5,6,7] 2154; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] 2155; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,7,6] 2156; SSE-NEXT: psrlq $48, %xmm0 2157; SSE-NEXT: packuswb %xmm0, %xmm3 2158; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,0,65535,65535] 2159; SSE-NEXT: movdqa %xmm1, %xmm0 2160; SSE-NEXT: movdqa %xmm6, %xmm7 2161; SSE-NEXT: pandn %xmm6, %xmm0 2162; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm9 # 16-byte Reload 2163; SSE-NEXT: movdqa %xmm9, %xmm4 2164; SSE-NEXT: pand %xmm1, %xmm4 2165; SSE-NEXT: por %xmm0, %xmm4 2166; SSE-NEXT: movdqa %xmm4, %xmm0 2167; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 2168; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,65535,65535,65535] 2169; SSE-NEXT: movdqa %xmm1, %xmm6 2170; SSE-NEXT: pandn %xmm0, %xmm6 2171; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] 2172; SSE-NEXT: pand %xmm1, %xmm4 2173; SSE-NEXT: por %xmm6, %xmm4 2174; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[3,2,1,0,4,5,6,7] 2175; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,7,7,7,7] 2176; SSE-NEXT: packuswb %xmm4, %xmm4 2177; SSE-NEXT: pand %xmm13, %xmm4 2178; SSE-NEXT: pandn %xmm3, %xmm13 2179; SSE-NEXT: por %xmm13, %xmm4 2180; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 2181; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2182; SSE-NEXT: pandn %xmm6, %xmm0 2183; SSE-NEXT: por %xmm0, %xmm15 2184; SSE-NEXT: movdqa %xmm15, %xmm0 2185; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] 2186; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1],xmm15[2],xmm2[2],xmm15[3],xmm2[3],xmm15[4],xmm2[4],xmm15[5],xmm2[5],xmm15[6],xmm2[6],xmm15[7],xmm2[7] 2187; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,0,65535] 2188; SSE-NEXT: pand %xmm3, %xmm15 2189; SSE-NEXT: pandn %xmm0, %xmm3 2190; SSE-NEXT: por %xmm15, %xmm3 2191; SSE-NEXT: movdqa %xmm3, %xmm11 2192; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 2193; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[2,1,2,3,4,5,6,7] 2194; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 2195; SSE-NEXT: packuswb %xmm0, %xmm0 2196; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,65535,65535,0] 2197; SSE-NEXT: movdqa %xmm2, %xmm3 2198; SSE-NEXT: pandn %xmm0, %xmm3 2199; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,2,1,0,4,5,6,7] 2200; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] 2201; SSE-NEXT: packuswb %xmm0, %xmm0 2202; SSE-NEXT: pand %xmm2, %xmm0 2203; SSE-NEXT: por %xmm0, %xmm3 2204; SSE-NEXT: movdqa %xmm5, %xmm15 2205; SSE-NEXT: pandn %xmm3, %xmm15 2206; SSE-NEXT: pand %xmm5, %xmm4 2207; SSE-NEXT: por %xmm4, %xmm15 2208; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,65535,0,65535,65535] 2209; SSE-NEXT: movdqa %xmm0, %xmm3 2210; SSE-NEXT: pandn %xmm7, %xmm3 2211; SSE-NEXT: movdqa %xmm9, %xmm4 2212; SSE-NEXT: pand %xmm0, %xmm4 2213; SSE-NEXT: por %xmm3, %xmm4 2214; SSE-NEXT: movdqa %xmm4, %xmm3 2215; SSE-NEXT: pxor %xmm0, %xmm0 2216; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] 2217; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] 2218; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] 2219; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] 2220; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] 2221; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] 2222; 
SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 2223; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 2224; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload 2225; SSE-NEXT: pandn %xmm2, %xmm3 2226; SSE-NEXT: por %xmm3, %xmm10 2227; SSE-NEXT: packuswb %xmm2, %xmm10 2228; SSE-NEXT: packuswb %xmm4, %xmm4 2229; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,1,3,3] 2230; SSE-NEXT: movss {{.*#+}} xmm2 = xmm4[0],xmm2[1,2,3] 2231; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 2232; SSE-NEXT: movdqa %xmm7, %xmm3 2233; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,65535,0,65535,65535] 2234; SSE-NEXT: pand %xmm4, %xmm3 2235; SSE-NEXT: pandn %xmm6, %xmm4 2236; SSE-NEXT: movdqa %xmm6, %xmm11 2237; SSE-NEXT: por %xmm3, %xmm4 2238; SSE-NEXT: movdqa %xmm4, %xmm3 2239; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] 2240; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] 2241; SSE-NEXT: pxor %xmm10, %xmm10 2242; SSE-NEXT: pand %xmm1, %xmm4 2243; SSE-NEXT: pandn %xmm3, %xmm1 2244; SSE-NEXT: por %xmm4, %xmm1 2245; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] 2246; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] 2247; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] 2248; SSE-NEXT: packuswb %xmm1, %xmm1 2249; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,65535,0] 2250; SSE-NEXT: pand %xmm0, %xmm1 2251; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,1,2,1] 2252; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] 2253; SSE-NEXT: packuswb %xmm3, %xmm3 2254; SSE-NEXT: pandn %xmm3, %xmm0 2255; SSE-NEXT: por %xmm1, %xmm0 2256; SSE-NEXT: movdqa %xmm5, %xmm1 2257; SSE-NEXT: pandn %xmm0, %xmm1 2258; SSE-NEXT: andps %xmm5, %xmm2 2259; SSE-NEXT: por %xmm2, %xmm1 2260; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 2261; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,0,65535,65535,0,65535] 2262; SSE-NEXT: pand %xmm13, %xmm2 2263; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 2264; SSE-NEXT: movdqa %xmm2, %xmm3 2265; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] 2266; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] 2267; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,65535,65535,65535,65535,0] 2268; SSE-NEXT: pand %xmm4, %xmm2 2269; SSE-NEXT: pandn %xmm3, %xmm4 2270; SSE-NEXT: por %xmm2, %xmm4 2271; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,0,3] 2272; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm2[0,1,2,3,5,4,7,6] 2273; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 2274; SSE-NEXT: packuswb %xmm3, %xmm6 2275; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,2,2,3] 2276; SSE-NEXT: pand %xmm13, %xmm9 2277; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 2278; SSE-NEXT: pandn %xmm3, %xmm13 2279; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] 2280; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] 
2281; SSE-NEXT: movdqa %xmm2, %xmm3 2282; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] 2283; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] 2284; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] 2285; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] 2286; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] 2287; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] 2288; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] 2289; SSE-NEXT: packuswb %xmm2, %xmm2 2290; SSE-NEXT: movss {{.*#+}} xmm6 = xmm2[0],xmm6[1,2,3] 2291; SSE-NEXT: movdqa %xmm7, %xmm2 2292; SSE-NEXT: movdqa %xmm7, %xmm0 2293; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,0,65535,65535] 2294; SSE-NEXT: pand %xmm3, %xmm2 2295; SSE-NEXT: pandn %xmm11, %xmm3 2296; SSE-NEXT: por %xmm2, %xmm3 2297; SSE-NEXT: movdqa %xmm3, %xmm2 2298; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] 2299; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] 2300; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,0,65535,65535,65535] 2301; SSE-NEXT: pand %xmm4, %xmm3 2302; SSE-NEXT: pandn %xmm2, %xmm4 2303; SSE-NEXT: por %xmm3, %xmm4 2304; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload 2305; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,1,0,3] 2306; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] 2307; SSE-NEXT: packuswb %xmm2, %xmm2 2308; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] 2309; SSE-NEXT: movdqa %xmm3, %xmm7 2310; SSE-NEXT: pandn %xmm2, %xmm7 2311; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,1,0,3] 2312; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,3,2,4,5,6,7] 2313; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] 2314; SSE-NEXT: packuswb %xmm2, %xmm2 2315; SSE-NEXT: pand %xmm3, %xmm2 2316; SSE-NEXT: por %xmm2, %xmm7 2317; SSE-NEXT: movdqa %xmm5, %xmm2 2318; SSE-NEXT: pandn %xmm7, %xmm2 2319; SSE-NEXT: andps %xmm5, %xmm6 2320; SSE-NEXT: por %xmm6, %xmm2 2321; SSE-NEXT: movdqa %xmm13, %xmm7 2322; SSE-NEXT: por %xmm9, %xmm7 2323; SSE-NEXT: movdqa %xmm7, %xmm4 2324; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] 2325; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,6,4,6,7] 2326; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm10[8],xmm7[9],xmm10[9],xmm7[10],xmm10[10],xmm7[11],xmm10[11],xmm7[12],xmm10[12],xmm7[13],xmm10[13],xmm7[14],xmm10[14],xmm7[15],xmm10[15] 2327; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,1,2,1] 2328; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] 2329; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] 2330; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2331; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15] 2332; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 2333; SSE-NEXT: pandn %xmm4, %xmm9 2334; SSE-NEXT: movdqa %xmm4, %xmm7 2335; SSE-NEXT: por %xmm9, %xmm14 2336; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[0,2,1,3] 2337; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] 2338; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] 2339; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5] 2340; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[3,3,3,3] 2341; SSE-NEXT: packuswb %xmm7, %xmm4 2342; SSE-NEXT: packuswb %xmm6, %xmm6 2343; SSE-NEXT: movss {{.*#+}} xmm4 = xmm6[0],xmm4[1,2,3] 2344; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 2345; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload 2346; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm11[1,3,2,3] 2347; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,2,2,3] 2348; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] 2349; SSE-NEXT: movdqa %xmm7, %xmm6 2350; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] 2351; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm10[8],xmm7[9],xmm10[9],xmm7[10],xmm10[10],xmm7[11],xmm10[11],xmm7[12],xmm10[12],xmm7[13],xmm10[13],xmm7[14],xmm10[14],xmm7[15],xmm10[15] 2352; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,65535,0,65535,65535] 2353; SSE-NEXT: pand %xmm9, %xmm7 2354; SSE-NEXT: pandn %xmm6, %xmm9 2355; SSE-NEXT: por %xmm7, %xmm9 2356; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[2,1,1,1] 2357; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7] 2358; SSE-NEXT: packuswb %xmm6, %xmm6 2359; SSE-NEXT: pand %xmm3, %xmm6 2360; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[0,1,0,3] 2361; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7] 2362; SSE-NEXT: packuswb %xmm7, %xmm7 2363; SSE-NEXT: pandn %xmm7, %xmm3 2364; SSE-NEXT: por %xmm3, %xmm6 2365; SSE-NEXT: andps %xmm5, %xmm4 2366; SSE-NEXT: pandn %xmm6, %xmm5 2367; SSE-NEXT: por %xmm4, %xmm5 2368; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 2369; SSE-NEXT: movaps %xmm3, (%rsi) 2370; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2371; SSE-NEXT: movaps %xmm0, (%rdx) 2372; SSE-NEXT: movdqa %xmm8, (%rcx) 2373; SSE-NEXT: movdqa %xmm15, (%r8) 2374; SSE-NEXT: movdqa %xmm1, (%r9) 2375; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 2376; SSE-NEXT: movdqa %xmm2, (%rax) 2377; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 2378; SSE-NEXT: movdqa %xmm5, (%rax) 2379; SSE-NEXT: addq $168, %rsp 2380; SSE-NEXT: retq 2381; 2382; AVX-LABEL: load_i8_stride7_vf16: 2383; AVX: # %bb.0: 2384; AVX-NEXT: vmovdqa (%rdi), %xmm2 2385; AVX-NEXT: vmovdqa 16(%rdi), %xmm7 2386; AVX-NEXT: vmovdqa 32(%rdi), %xmm3 2387; AVX-NEXT: vmovdqa 48(%rdi), %xmm4 2388; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm7[5,12,u,u,u,u,u,u,u,u,u,u,u] 2389; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u,u,u,u,u] 2390; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 2391; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[u,u,u,u,u,3,10],zero,zero,zero,xmm3[u,u,u,u,u,u] 2392; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u],zero,zero,xmm4[1,8,15,u,u,u,u,u,u] 2393; AVX-NEXT: vpor %xmm1, %xmm5, %xmm1 2394; AVX-NEXT: vmovq {{.*#+}} xmm5 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] 2395; AVX-NEXT: vpblendvb %xmm5, %xmm0, %xmm1, %xmm0 2396; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2397; AVX-NEXT: vmovdqa 80(%rdi), %xmm1 2398; AVX-NEXT: vmovdqa 64(%rdi), %xmm5 2399; AVX-NEXT: vmovdqa 96(%rdi), %xmm6 2400; 
AVX-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm7[6,13,u,u,u,u,u,u,u,u,u,u,u] 2401; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[1,8,15],zero,zero,xmm2[u,u,u,u,u,u,u,u,u,u,u] 2402; AVX-NEXT: vpor %xmm8, %xmm9, %xmm8 2403; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u],zero,zero,xmm4[2,9,u,u,u,u,u,u,u] 2404; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,u,u,u,u,4,11],zero,zero,xmm3[u,u,u,u,u,u,u] 2405; AVX-NEXT: vpor %xmm9, %xmm10, %xmm9 2406; AVX-NEXT: vmovq {{.*#+}} xmm11 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] 2407; AVX-NEXT: vpblendvb %xmm11, %xmm8, %xmm9, %xmm8 2408; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,5,12,u,u,u,u,u,u,u,u,u,u] 2409; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,u,u,0,7,14,u,u,u,u,u,u,u,u] 2410; AVX-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] 2411; AVX-NEXT: vpxor %xmm12, %xmm12, %xmm12 2412; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,6],xmm12[7] 2413; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm6[3,10] 2414; AVX-NEXT: vpor %xmm10, %xmm9, %xmm10 2415; AVX-NEXT: vpmovsxwq {{.*#+}} xmm9 = [18446744073709551615,255] 2416; AVX-NEXT: vpblendvb %xmm9, %xmm8, %xmm10, %xmm0 2417; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2418; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u,u,u,u,u] 2419; AVX-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm7[0,7,14,u,u,u,u,u,u,u,u,u,u,u] 2420; AVX-NEXT: vpor %xmm10, %xmm13, %xmm10 2421; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm4[u,u,u,u,u],zero,zero,xmm4[3,10,u,u,u,u,u,u,u] 2422; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,5,12],zero,zero,xmm3[u,u,u,u,u,u,u] 2423; AVX-NEXT: vpor %xmm13, %xmm14, %xmm13 2424; AVX-NEXT: vpblendvb %xmm11, %xmm10, %xmm13, %xmm10 2425; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm1[u,u,u,u,u,u,u,u,u,u,u,u,6,13,u,u] 2426; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15] 2427; AVX-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] 2428; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,6],xmm12[7] 2429; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm6[4,11] 2430; AVX-NEXT: vpor %xmm14, %xmm13, %xmm13 2431; AVX-NEXT: vpblendvb %xmm9, %xmm10, %xmm13, %xmm10 2432; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u,u,u,u,u,u,u,u] 2433; AVX-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm7[1,8,15,u,u,u,u,u,u,u,u,u,u,u] 2434; AVX-NEXT: vpor %xmm13, %xmm14, %xmm13 2435; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u],zero,zero,xmm4[4,11,u,u,u,u,u,u,u] 2436; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm3[u,u,u,u,u,6,13],zero,zero,xmm3[u,u,u,u,u,u,u] 2437; AVX-NEXT: vpor %xmm14, %xmm15, %xmm14 2438; AVX-NEXT: vpblendvb %xmm11, %xmm13, %xmm14, %xmm11 2439; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm5[u,u] 2440; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[u,u,u,u,u,u,u,u,u],zero,zero,xmm1[0,7,14,u,u] 2441; AVX-NEXT: vpor %xmm13, %xmm14, %xmm13 2442; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,6],xmm12[7] 2443; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm6[5,12] 2444; AVX-NEXT: vpor %xmm14, %xmm13, %xmm13 2445; AVX-NEXT: vpblendvb %xmm9, %xmm11, %xmm13, %xmm11 2446; AVX-NEXT: vmovd {{.*#+}} xmm13 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] 2447; AVX-NEXT: vpshufb %xmm13, %xmm2, %xmm14 2448; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 2449; AVX-NEXT: vpunpcklwd {{.*#+}} xmm14 = 
xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] 2450; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm4[u,u,u,u],zero,zero,zero,xmm4[5,12,u,u,u,u,u,u,u] 2451; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm3[u,u,u,u,0,7,14],zero,zero,xmm3[u,u,u,u,u,u,u] 2452; AVX-NEXT: vpor %xmm0, %xmm15, %xmm0 2453; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3,4,5,6,7] 2454; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm5[u,u] 2455; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm1[u,u,u,u,u,u,u,u,u],zero,zero,xmm1[1,8,15,u,u] 2456; AVX-NEXT: vpor %xmm14, %xmm15, %xmm14 2457; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1,2,3,4,5,6],xmm12[7] 2458; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm6[6,13] 2459; AVX-NEXT: vpor %xmm14, %xmm12, %xmm12 2460; AVX-NEXT: vpblendvb %xmm9, %xmm0, %xmm12, %xmm12 2461; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm7[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 2462; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 2463; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] 2464; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u],zero,zero,zero,xmm4[6,13,u,u,u,u,u,u,u] 2465; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm3[u,u,u,u,1,8,15],zero,zero,xmm3[u,u,u,u,u,u,u] 2466; AVX-NEXT: vpor %xmm14, %xmm15, %xmm14 2467; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm14[2,3,4,5,6,7] 2468; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[u,u,u,u,u,u,u,u,u],zero,zero,xmm1[2,9,u,u,u] 2469; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm5[u,u,u] 2470; AVX-NEXT: vpor %xmm14, %xmm15, %xmm14 2471; AVX-NEXT: vmovddup {{.*#+}} xmm15 = [0,9,10,11,12,128,128,128,0,9,10,11,12,128,128,128] 2472; AVX-NEXT: # xmm15 = mem[0,0] 2473; AVX-NEXT: vpshufb %xmm15, %xmm14, %xmm14 2474; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm6[0,7,14] 2475; AVX-NEXT: vpor %xmm8, %xmm14, %xmm8 2476; AVX-NEXT: vpblendvb %xmm9, %xmm0, %xmm8, %xmm0 2477; AVX-NEXT: vpshufb %xmm13, %xmm7, %xmm7 2478; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 2479; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] 2480; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u] 2481; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u] 2482; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 2483; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3,4,5,6,7] 2484; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,u,u],zero,zero,xmm1[3,10,u,u,u] 2485; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm5[u,u,u] 2486; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 2487; AVX-NEXT: vpshufb %xmm15, %xmm3, %xmm3 2488; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm6[1,8,15] 2489; AVX-NEXT: vpor %xmm4, %xmm3, %xmm3 2490; AVX-NEXT: vpblendvb %xmm9, %xmm2, %xmm3, %xmm2 2491; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,4,11,u,u,u,u,u,u,u,u,u,u] 2492; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] 2493; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 2494; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7,8,9] 2495; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,0,1,2,3,8,15] 2496; AVX-NEXT: vpblendw $31, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 2497; AVX-NEXT: # xmm1 = 
mem[0,1,2,3,4],xmm1[5,6,7] 2498; AVX-NEXT: vmovdqa %xmm1, (%rsi) 2499; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2500; AVX-NEXT: vmovaps %xmm1, (%rdx) 2501; AVX-NEXT: vmovdqa %xmm10, (%rcx) 2502; AVX-NEXT: vmovdqa %xmm11, (%r8) 2503; AVX-NEXT: vmovdqa %xmm12, (%r9) 2504; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 2505; AVX-NEXT: vmovdqa %xmm0, (%rax) 2506; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 2507; AVX-NEXT: vmovdqa %xmm2, (%rax) 2508; AVX-NEXT: retq 2509; 2510; AVX2-LABEL: load_i8_stride7_vf16: 2511; AVX2: # %bb.0: 2512; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 2513; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 2514; AVX2-NEXT: vmovdqa (%rdi), %ymm0 2515; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 2516; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] 2517; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 2518; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 2519; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u,u,u,u,u] 2520; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u,u,u] 2521; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm3 2522; AVX2-NEXT: vmovdqa 96(%rdi), %xmm9 2523; AVX2-NEXT: vmovdqa 64(%rdi), %xmm10 2524; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0],xmm10[1],xmm9[2],xmm10[3] 2525; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9] 2526; AVX2-NEXT: vmovdqa 80(%rdi), %xmm2 2527; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero 2528; AVX2-NEXT: vpor %xmm5, %xmm4, %xmm4 2529; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7] 2530; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] 2531; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm4 2532; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 2533; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u,u,u] 2534; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,8,15],zero,zero,xmm4[4,11],zero,zero,xmm4[u,u,u,u,u,u,u] 2535; AVX2-NEXT: vpor %xmm5, %xmm4, %xmm6 2536; AVX2-NEXT: vmovdqa 96(%rdi), %xmm4 2537; AVX2-NEXT: vmovdqa 64(%rdi), %xmm5 2538; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0],xmm4[1],xmm5[2,3,4],xmm4[5],xmm5[6,7] 2539; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10] 2540; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,12],zero,zero 2541; AVX2-NEXT: vpor %xmm7, %xmm8, %xmm8 2542; AVX2-NEXT: vpmovsxwq {{.*#+}} xmm7 = [18446744073709551615,255] 2543; AVX2-NEXT: vpblendvb %xmm7, %xmm6, %xmm8, %xmm6 2544; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] 2545; AVX2-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm8 2546; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u,u,u] 2547; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm8 2548; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u,u,u,u,u] 2549; AVX2-NEXT: vpor %xmm11, %xmm8, %xmm8 2550; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] 2551; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm11[4,11] 2552; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[6,13],zero,zero 2553; AVX2-NEXT: vpor %xmm12, %xmm11, %xmm11 2554; AVX2-NEXT: vpblendvb %xmm7, %xmm8, %xmm11, %xmm8 2555; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = 
[0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535] 2556; AVX2-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm11 2557; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[3,10],zero,zero,zero,xmm11[6,13],zero,zero,xmm11[u,u,u,u,u,u,u] 2558; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm11 2559; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[1,8,15],zero,zero,xmm11[4,11,u,u,u,u,u,u,u] 2560; AVX2-NEXT: vpor %xmm12, %xmm11, %xmm11 2561; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0],xmm9[1],xmm10[2],xmm9[3] 2562; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm10[5,12] 2563; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,7,14],zero,zero 2564; AVX2-NEXT: vpor %xmm12, %xmm9, %xmm9 2565; AVX2-NEXT: vpblendvb %xmm7, %xmm11, %xmm9, %xmm9 2566; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] 2567; AVX2-NEXT: vpblendvb %ymm11, %ymm1, %ymm0, %ymm11 2568; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm12 2569; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u] 2570; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[4,11],zero,zero,xmm11[0,7,14],zero,zero,xmm11[u,u,u,u,u,u,u] 2571; AVX2-NEXT: vpor %xmm12, %xmm11, %xmm11 2572; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm10[6,13] 2573; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,8,15],zero,zero 2574; AVX2-NEXT: vpor %xmm12, %xmm10, %xmm10 2575; AVX2-NEXT: vpblendvb %xmm7, %xmm11, %xmm10, %xmm10 2576; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] 2577; AVX2-NEXT: vpblendvb %ymm11, %ymm1, %ymm0, %ymm11 2578; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm12 2579; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u] 2580; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[5,12],zero,zero,xmm11[1,8,15],zero,zero,xmm11[u,u,u,u,u,u,u] 2581; AVX2-NEXT: vpor %xmm12, %xmm11, %xmm11 2582; AVX2-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0],xmm5[1,2],xmm4[3],xmm5[4,5,6],xmm4[7] 2583; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm12[0,7,14] 2584; AVX2-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[2,9],zero,zero,zero 2585; AVX2-NEXT: vpor %xmm13, %xmm12, %xmm12 2586; AVX2-NEXT: vpblendvb %xmm7, %xmm11, %xmm12, %xmm11 2587; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] 2588; AVX2-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0 2589; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u] 2590; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 2591; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u] 2592; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 2593; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm5[1,2,3],xmm4[4],xmm5[5,6],xmm4[7] 2594; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm1[1,8,15] 2595; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[3,10],zero,zero,zero 2596; AVX2-NEXT: vpor %xmm2, %xmm1, %xmm1 2597; AVX2-NEXT: vpblendvb %xmm7, %xmm0, %xmm1, %xmm0 2598; AVX2-NEXT: vmovdqa %xmm3, (%rsi) 2599; AVX2-NEXT: vmovdqa %xmm6, (%rdx) 2600; AVX2-NEXT: vmovdqa %xmm8, (%rcx) 2601; AVX2-NEXT: vmovdqa %xmm9, (%r8) 2602; AVX2-NEXT: vmovdqa %xmm10, (%r9) 2603; AVX2-NEXT: vmovdqa %xmm11, (%r10) 2604; AVX2-NEXT: vmovdqa %xmm0, (%rax) 2605; AVX2-NEXT: vzeroupper 2606; AVX2-NEXT: retq 2607; 
2608; AVX2-FP-LABEL: load_i8_stride7_vf16: 2609; AVX2-FP: # %bb.0: 2610; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 2611; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10 2612; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0 2613; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1 2614; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] 2615; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 2616; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 2617; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u,u,u,u,u] 2618; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u,u,u] 2619; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm3 2620; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm9 2621; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm10 2622; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0],xmm10[1],xmm9[2],xmm10[3] 2623; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9] 2624; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm2 2625; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero 2626; AVX2-FP-NEXT: vpor %xmm5, %xmm4, %xmm4 2627; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7] 2628; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] 2629; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm4 2630; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm5 2631; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u,u,u] 2632; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,8,15],zero,zero,xmm4[4,11],zero,zero,xmm4[u,u,u,u,u,u,u] 2633; AVX2-FP-NEXT: vpor %xmm5, %xmm4, %xmm6 2634; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm4 2635; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm5 2636; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0],xmm4[1],xmm5[2,3,4],xmm4[5],xmm5[6,7] 2637; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10] 2638; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,12],zero,zero 2639; AVX2-FP-NEXT: vpor %xmm7, %xmm8, %xmm8 2640; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} xmm7 = [18446744073709551615,255] 2641; AVX2-FP-NEXT: vpblendvb %xmm7, %xmm6, %xmm8, %xmm6 2642; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] 2643; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm8 2644; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u,u,u] 2645; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm8 2646; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u,u,u,u,u] 2647; AVX2-FP-NEXT: vpor %xmm11, %xmm8, %xmm8 2648; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm11 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] 2649; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm11[4,11] 2650; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[6,13],zero,zero 2651; AVX2-FP-NEXT: vpor %xmm12, %xmm11, %xmm11 2652; AVX2-FP-NEXT: vpblendvb %xmm7, %xmm8, %xmm11, %xmm8 2653; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535] 2654; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm11 2655; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[3,10],zero,zero,zero,xmm11[6,13],zero,zero,xmm11[u,u,u,u,u,u,u] 2656; AVX2-FP-NEXT: vextracti128 $1, %ymm11, %xmm11 2657; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = 
zero,zero,xmm11[1,8,15],zero,zero,xmm11[4,11,u,u,u,u,u,u,u] 2658; AVX2-FP-NEXT: vpor %xmm12, %xmm11, %xmm11 2659; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0],xmm9[1],xmm10[2],xmm9[3] 2660; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm10[5,12] 2661; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,7,14],zero,zero 2662; AVX2-FP-NEXT: vpor %xmm12, %xmm9, %xmm9 2663; AVX2-FP-NEXT: vpblendvb %xmm7, %xmm11, %xmm9, %xmm9 2664; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] 2665; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm1, %ymm0, %ymm11 2666; AVX2-FP-NEXT: vextracti128 $1, %ymm11, %xmm12 2667; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u] 2668; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[4,11],zero,zero,xmm11[0,7,14],zero,zero,xmm11[u,u,u,u,u,u,u] 2669; AVX2-FP-NEXT: vpor %xmm12, %xmm11, %xmm11 2670; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm10[6,13] 2671; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,8,15],zero,zero 2672; AVX2-FP-NEXT: vpor %xmm12, %xmm10, %xmm10 2673; AVX2-FP-NEXT: vpblendvb %xmm7, %xmm11, %xmm10, %xmm10 2674; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] 2675; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm1, %ymm0, %ymm11 2676; AVX2-FP-NEXT: vextracti128 $1, %ymm11, %xmm12 2677; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u] 2678; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[5,12],zero,zero,xmm11[1,8,15],zero,zero,xmm11[u,u,u,u,u,u,u] 2679; AVX2-FP-NEXT: vpor %xmm12, %xmm11, %xmm11 2680; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0],xmm5[1,2],xmm4[3],xmm5[4,5,6],xmm4[7] 2681; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm12[0,7,14] 2682; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[2,9],zero,zero,zero 2683; AVX2-FP-NEXT: vpor %xmm13, %xmm12, %xmm12 2684; AVX2-FP-NEXT: vpblendvb %xmm7, %xmm11, %xmm12, %xmm11 2685; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] 2686; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0 2687; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u] 2688; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm0 2689; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u] 2690; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0 2691; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm5[1,2,3],xmm4[4],xmm5[5,6],xmm4[7] 2692; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm1[1,8,15] 2693; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[3,10],zero,zero,zero 2694; AVX2-FP-NEXT: vpor %xmm2, %xmm1, %xmm1 2695; AVX2-FP-NEXT: vpblendvb %xmm7, %xmm0, %xmm1, %xmm0 2696; AVX2-FP-NEXT: vmovdqa %xmm3, (%rsi) 2697; AVX2-FP-NEXT: vmovdqa %xmm6, (%rdx) 2698; AVX2-FP-NEXT: vmovdqa %xmm8, (%rcx) 2699; AVX2-FP-NEXT: vmovdqa %xmm9, (%r8) 2700; AVX2-FP-NEXT: vmovdqa %xmm10, (%r9) 2701; AVX2-FP-NEXT: vmovdqa %xmm11, (%r10) 2702; AVX2-FP-NEXT: vmovdqa %xmm0, (%rax) 2703; AVX2-FP-NEXT: vzeroupper 2704; AVX2-FP-NEXT: retq 2705; 2706; AVX2-FCP-LABEL: load_i8_stride7_vf16: 2707; AVX2-FCP: # %bb.0: 2708; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 2709; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 
2710; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0 2711; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 2712; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] 2713; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 2714; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 2715; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u,u,u,u,u] 2716; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u,u,u] 2717; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm3 2718; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm9 2719; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm10 2720; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0],xmm10[1],xmm9[2],xmm10[3] 2721; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9] 2722; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm2 2723; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero 2724; AVX2-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 2725; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7] 2726; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] 2727; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm4 2728; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 2729; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u,u,u] 2730; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,8,15],zero,zero,xmm4[4,11],zero,zero,xmm4[u,u,u,u,u,u,u] 2731; AVX2-FCP-NEXT: vpor %xmm5, %xmm4, %xmm6 2732; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm4 2733; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm5 2734; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0],xmm4[1],xmm5[2,3,4],xmm4[5],xmm5[6,7] 2735; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10] 2736; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,12],zero,zero 2737; AVX2-FCP-NEXT: vpor %xmm7, %xmm8, %xmm8 2738; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} xmm7 = [18446744073709551615,255] 2739; AVX2-FCP-NEXT: vpblendvb %xmm7, %xmm6, %xmm8, %xmm6 2740; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] 2741; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm8 2742; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u,u,u] 2743; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8 2744; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u,u,u,u,u] 2745; AVX2-FCP-NEXT: vpor %xmm11, %xmm8, %xmm8 2746; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] 2747; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm11[4,11] 2748; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[6,13],zero,zero 2749; AVX2-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 2750; AVX2-FCP-NEXT: vpblendvb %xmm7, %xmm8, %xmm11, %xmm8 2751; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535] 2752; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm11 2753; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[3,10],zero,zero,zero,xmm11[6,13],zero,zero,xmm11[u,u,u,u,u,u,u] 2754; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11 2755; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[1,8,15],zero,zero,xmm11[4,11,u,u,u,u,u,u,u] 2756; AVX2-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 2757; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = 
xmm10[0],xmm9[1],xmm10[2],xmm9[3] 2758; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm10[5,12] 2759; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,7,14],zero,zero 2760; AVX2-FCP-NEXT: vpor %xmm12, %xmm9, %xmm9 2761; AVX2-FCP-NEXT: vpblendvb %xmm7, %xmm11, %xmm9, %xmm9 2762; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] 2763; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm1, %ymm0, %ymm11 2764; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 2765; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u] 2766; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[4,11],zero,zero,xmm11[0,7,14],zero,zero,xmm11[u,u,u,u,u,u,u] 2767; AVX2-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 2768; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm10[6,13] 2769; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,8,15],zero,zero 2770; AVX2-FCP-NEXT: vpor %xmm12, %xmm10, %xmm10 2771; AVX2-FCP-NEXT: vpblendvb %xmm7, %xmm11, %xmm10, %xmm10 2772; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] 2773; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm1, %ymm0, %ymm11 2774; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 2775; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u] 2776; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[5,12],zero,zero,xmm11[1,8,15],zero,zero,xmm11[u,u,u,u,u,u,u] 2777; AVX2-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 2778; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0],xmm5[1,2],xmm4[3],xmm5[4,5,6],xmm4[7] 2779; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm12[0,7,14] 2780; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[2,9],zero,zero,zero 2781; AVX2-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12 2782; AVX2-FCP-NEXT: vpblendvb %xmm7, %xmm11, %xmm12, %xmm11 2783; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] 2784; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0 2785; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u] 2786; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 2787; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u] 2788; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 2789; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm5[1,2,3],xmm4[4],xmm5[5,6],xmm4[7] 2790; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm1[1,8,15] 2791; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[3,10],zero,zero,zero 2792; AVX2-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 2793; AVX2-FCP-NEXT: vpblendvb %xmm7, %xmm0, %xmm1, %xmm0 2794; AVX2-FCP-NEXT: vmovdqa %xmm3, (%rsi) 2795; AVX2-FCP-NEXT: vmovdqa %xmm6, (%rdx) 2796; AVX2-FCP-NEXT: vmovdqa %xmm8, (%rcx) 2797; AVX2-FCP-NEXT: vmovdqa %xmm9, (%r8) 2798; AVX2-FCP-NEXT: vmovdqa %xmm10, (%r9) 2799; AVX2-FCP-NEXT: vmovdqa %xmm11, (%r10) 2800; AVX2-FCP-NEXT: vmovdqa %xmm0, (%rax) 2801; AVX2-FCP-NEXT: vzeroupper 2802; AVX2-FCP-NEXT: retq 2803; 2804; AVX512-LABEL: load_i8_stride7_vf16: 2805; AVX512: # %bb.0: 2806; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 2807; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 2808; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128] 2809; AVX512-NEXT: 
vmovdqa 80(%rdi), %xmm0 2810; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm4 2811; AVX512-NEXT: vmovdqa (%rdi), %ymm1 2812; AVX512-NEXT: vmovdqa 32(%rdi), %ymm2 2813; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] 2814; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm2 ^ (ymm5 & (ymm1 ^ ymm2)) 2815; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6 2816; AVX512-NEXT: vpshufb %xmm3, %xmm6, %xmm3 2817; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4],xmm4[5,6,7] 2818; AVX512-NEXT: vmovdqa 96(%rdi), %xmm3 2819; AVX512-NEXT: vmovdqa 64(%rdi), %xmm4 2820; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] 2821; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [0,7,14,128,128,3,10,128,128,128,6,13,128,128,2,9] 2822; AVX512-NEXT: vpshufb %xmm8, %xmm7, %xmm7 2823; AVX512-NEXT: vpshufb %xmm8, %xmm5, %xmm5 2824; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7] 2825; AVX512-NEXT: vpor %xmm6, %xmm5, %xmm5 2826; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] 2827; AVX512-NEXT: vmovdqa %ymm8, %ymm6 2828; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm2 ^ (ymm6 & (ymm1 ^ ymm2)) 2829; AVX512-NEXT: vextracti128 $1, %ymm6, %xmm7 2830; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u] 2831; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u] 2832; AVX512-NEXT: vpor %xmm7, %xmm6, %xmm9 2833; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0],xmm3[1],xmm4[2,3,4],xmm3[5],xmm4[6,7] 2834; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10] 2835; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero 2836; AVX512-NEXT: vpor %xmm7, %xmm6, %xmm6 2837; AVX512-NEXT: vpmovsxwq {{.*#+}} xmm7 = [18446744073709551615,255] 2838; AVX512-NEXT: vpternlogq {{.*#+}} xmm6 = xmm6 ^ (xmm7 & (xmm6 ^ xmm9)) 2839; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] 2840; AVX512-NEXT: vmovdqa %ymm9, %ymm10 2841; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm2 ^ (ymm10 & (ymm1 ^ ymm2)) 2842; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u,u,u,u,u] 2843; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm10 2844; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[0,7,14],zero,zero,xmm10[3,10,u,u,u,u,u,u,u] 2845; AVX512-NEXT: vpor %xmm11, %xmm10, %xmm10 2846; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] 2847; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm11[4,11] 2848; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero 2849; AVX512-NEXT: vpor %xmm12, %xmm11, %xmm11 2850; AVX512-NEXT: vpternlogq {{.*#+}} xmm11 = xmm11 ^ (xmm7 & (xmm11 ^ xmm10)) 2851; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] 2852; AVX512-NEXT: vmovdqa %ymm10, %ymm12 2853; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm2 ^ (ymm12 & (ymm1 ^ ymm2)) 2854; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[3,10],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[u,u,u,u,u,u,u] 2855; AVX512-NEXT: vextracti128 $1, %ymm12, %xmm12 2856; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[1,8,15],zero,zero,xmm12[4,11,u,u,u,u,u,u,u] 2857; AVX512-NEXT: vpor %xmm13, %xmm12, %xmm12 
2858; AVX512-NEXT: vpblendd {{.*#+}} xmm13 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] 2859; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm13[5,12] 2860; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero 2861; AVX512-NEXT: vpor %xmm15, %xmm14, %xmm14 2862; AVX512-NEXT: vpternlogq {{.*#+}} xmm14 = xmm14 ^ (xmm7 & (xmm14 ^ xmm12)) 2863; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm1 ^ (ymm8 & (ymm2 ^ ymm1)) 2864; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm12 2865; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u] 2866; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u] 2867; AVX512-NEXT: vpor %xmm12, %xmm8, %xmm8 2868; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm13[6,13] 2869; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero 2870; AVX512-NEXT: vpor %xmm13, %xmm12, %xmm12 2871; AVX512-NEXT: vpternlogq {{.*#+}} xmm12 = xmm12 ^ (xmm7 & (xmm12 ^ xmm8)) 2872; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm1 ^ (ymm9 & (ymm2 ^ ymm1)) 2873; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm8 2874; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u] 2875; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u] 2876; AVX512-NEXT: vpor %xmm8, %xmm9, %xmm8 2877; AVX512-NEXT: vpblendw {{.*#+}} xmm9 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6],xmm3[7] 2878; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm9[0,7,14] 2879; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero 2880; AVX512-NEXT: vpor %xmm13, %xmm9, %xmm9 2881; AVX512-NEXT: vpternlogq {{.*#+}} xmm9 = xmm9 ^ (xmm7 & (xmm9 ^ xmm8)) 2882; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm1 ^ (ymm10 & (ymm2 ^ ymm1)) 2883; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[6,13],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u] 2884; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm2 2885; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u] 2886; AVX512-NEXT: vpor %xmm1, %xmm2, %xmm1 2887; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6],xmm3[7] 2888; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15] 2889; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[3,10],zero,zero,zero 2890; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0 2891; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 ^ (xmm7 & (xmm0 ^ xmm1)) 2892; AVX512-NEXT: vmovdqa %xmm5, (%rsi) 2893; AVX512-NEXT: vmovdqa %xmm6, (%rdx) 2894; AVX512-NEXT: vmovdqa %xmm11, (%rcx) 2895; AVX512-NEXT: vmovdqa %xmm14, (%r8) 2896; AVX512-NEXT: vmovdqa %xmm12, (%r9) 2897; AVX512-NEXT: vmovdqa %xmm9, (%r10) 2898; AVX512-NEXT: vmovdqa %xmm0, (%rax) 2899; AVX512-NEXT: vzeroupper 2900; AVX512-NEXT: retq 2901; 2902; AVX512-FCP-LABEL: load_i8_stride7_vf16: 2903; AVX512-FCP: # %bb.0: 2904; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 2905; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 2906; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128] 2907; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm0 2908; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm4 2909; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm1 2910; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 2911; AVX512-FCP-NEXT: vmovdqa {{.*#+}} 
ymm5 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] 2912; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm2 ^ (ymm5 & (ymm1 ^ ymm2)) 2913; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 2914; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm3 2915; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4],xmm4[5,6,7] 2916; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm3 2917; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 2918; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] 2919; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [0,7,14,128,128,3,10,128,128,128,6,13,128,128,2,9] 2920; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm7 2921; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm5 2922; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7] 2923; AVX512-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 2924; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] 2925; AVX512-FCP-NEXT: vmovdqa %ymm8, %ymm6 2926; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm2 ^ (ymm6 & (ymm1 ^ ymm2)) 2927; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 2928; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u] 2929; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u] 2930; AVX512-FCP-NEXT: vpor %xmm7, %xmm6, %xmm9 2931; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0],xmm3[1],xmm4[2,3,4],xmm3[5],xmm4[6,7] 2932; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10] 2933; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero 2934; AVX512-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 2935; AVX512-FCP-NEXT: vpmovsxwq {{.*#+}} xmm7 = [18446744073709551615,255] 2936; AVX512-FCP-NEXT: vpternlogq {{.*#+}} xmm6 = xmm6 ^ (xmm7 & (xmm6 ^ xmm9)) 2937; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] 2938; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm10 2939; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm2 ^ (ymm10 & (ymm1 ^ ymm2)) 2940; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u,u,u,u,u] 2941; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10 2942; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[0,7,14],zero,zero,xmm10[3,10,u,u,u,u,u,u,u] 2943; AVX512-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10 2944; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] 2945; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm11[4,11] 2946; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero 2947; AVX512-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 2948; AVX512-FCP-NEXT: vpternlogq {{.*#+}} xmm11 = xmm11 ^ (xmm7 & (xmm11 ^ xmm10)) 2949; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] 2950; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm12 2951; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm2 ^ (ymm12 & (ymm1 ^ ymm2)) 2952; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[3,10],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[u,u,u,u,u,u,u] 2953; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12 2954; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[1,8,15],zero,zero,xmm12[4,11,u,u,u,u,u,u,u] 2955; AVX512-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12 2956; 
AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] 2957; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm13[5,12] 2958; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero 2959; AVX512-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 2960; AVX512-FCP-NEXT: vpternlogq {{.*#+}} xmm14 = xmm14 ^ (xmm7 & (xmm14 ^ xmm12)) 2961; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm1 ^ (ymm8 & (ymm2 ^ ymm1)) 2962; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm12 2963; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u] 2964; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u] 2965; AVX512-FCP-NEXT: vpor %xmm12, %xmm8, %xmm8 2966; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm13[6,13] 2967; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero 2968; AVX512-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12 2969; AVX512-FCP-NEXT: vpternlogq {{.*#+}} xmm12 = xmm12 ^ (xmm7 & (xmm12 ^ xmm8)) 2970; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm1 ^ (ymm9 & (ymm2 ^ ymm1)) 2971; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm8 2972; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u] 2973; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u] 2974; AVX512-FCP-NEXT: vpor %xmm8, %xmm9, %xmm8 2975; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6],xmm3[7] 2976; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm9[0,7,14] 2977; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero 2978; AVX512-FCP-NEXT: vpor %xmm13, %xmm9, %xmm9 2979; AVX512-FCP-NEXT: vpternlogq {{.*#+}} xmm9 = xmm9 ^ (xmm7 & (xmm9 ^ xmm8)) 2980; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm1 ^ (ymm10 & (ymm2 ^ ymm1)) 2981; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[6,13],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u] 2982; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm2 2983; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u] 2984; AVX512-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 2985; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6],xmm3[7] 2986; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15] 2987; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[3,10],zero,zero,zero 2988; AVX512-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0 2989; AVX512-FCP-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 ^ (xmm7 & (xmm0 ^ xmm1)) 2990; AVX512-FCP-NEXT: vmovdqa %xmm5, (%rsi) 2991; AVX512-FCP-NEXT: vmovdqa %xmm6, (%rdx) 2992; AVX512-FCP-NEXT: vmovdqa %xmm11, (%rcx) 2993; AVX512-FCP-NEXT: vmovdqa %xmm14, (%r8) 2994; AVX512-FCP-NEXT: vmovdqa %xmm12, (%r9) 2995; AVX512-FCP-NEXT: vmovdqa %xmm9, (%r10) 2996; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rax) 2997; AVX512-FCP-NEXT: vzeroupper 2998; AVX512-FCP-NEXT: retq 2999; 3000; AVX512DQ-LABEL: load_i8_stride7_vf16: 3001; AVX512DQ: # %bb.0: 3002; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 3003; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 3004; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128] 3005; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm0 3006; AVX512DQ-NEXT: vpshufb 
%xmm3, %xmm0, %xmm4 3007; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1 3008; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm2 3009; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] 3010; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm2 ^ (ymm5 & (ymm1 ^ ymm2)) 3011; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6 3012; AVX512DQ-NEXT: vpshufb %xmm3, %xmm6, %xmm3 3013; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4],xmm4[5,6,7] 3014; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm3 3015; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm4 3016; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm7 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] 3017; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm8 = [0,7,14,128,128,3,10,128,128,128,6,13,128,128,2,9] 3018; AVX512DQ-NEXT: vpshufb %xmm8, %xmm7, %xmm7 3019; AVX512DQ-NEXT: vpshufb %xmm8, %xmm5, %xmm5 3020; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7] 3021; AVX512DQ-NEXT: vpor %xmm6, %xmm5, %xmm5 3022; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] 3023; AVX512DQ-NEXT: vmovdqa %ymm8, %ymm6 3024; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm2 ^ (ymm6 & (ymm1 ^ ymm2)) 3025; AVX512DQ-NEXT: vextracti128 $1, %ymm6, %xmm7 3026; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u] 3027; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u] 3028; AVX512DQ-NEXT: vpor %xmm7, %xmm6, %xmm9 3029; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0],xmm3[1],xmm4[2,3,4],xmm3[5],xmm4[6,7] 3030; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10] 3031; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero 3032; AVX512DQ-NEXT: vpor %xmm7, %xmm6, %xmm6 3033; AVX512DQ-NEXT: vpmovsxwq {{.*#+}} xmm7 = [18446744073709551615,255] 3034; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm6 = xmm6 ^ (xmm7 & (xmm6 ^ xmm9)) 3035; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] 3036; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm10 3037; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm2 ^ (ymm10 & (ymm1 ^ ymm2)) 3038; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u,u,u,u,u] 3039; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm10 3040; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[0,7,14],zero,zero,xmm10[3,10,u,u,u,u,u,u,u] 3041; AVX512DQ-NEXT: vpor %xmm11, %xmm10, %xmm10 3042; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] 3043; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm11[4,11] 3044; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero 3045; AVX512DQ-NEXT: vpor %xmm12, %xmm11, %xmm11 3046; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm11 = xmm11 ^ (xmm7 & (xmm11 ^ xmm10)) 3047; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] 3048; AVX512DQ-NEXT: vmovdqa %ymm10, %ymm12 3049; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm2 ^ (ymm12 & (ymm1 ^ ymm2)) 3050; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[3,10],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[u,u,u,u,u,u,u] 3051; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm12 3052; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[1,8,15],zero,zero,xmm12[4,11,u,u,u,u,u,u,u] 3053; 
AVX512DQ-NEXT: vpor %xmm13, %xmm12, %xmm12 3054; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm13 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] 3055; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm13[5,12] 3056; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero 3057; AVX512DQ-NEXT: vpor %xmm15, %xmm14, %xmm14 3058; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm14 = xmm14 ^ (xmm7 & (xmm14 ^ xmm12)) 3059; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm1 ^ (ymm8 & (ymm2 ^ ymm1)) 3060; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm12 3061; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u] 3062; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u] 3063; AVX512DQ-NEXT: vpor %xmm12, %xmm8, %xmm8 3064; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm13[6,13] 3065; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero 3066; AVX512DQ-NEXT: vpor %xmm13, %xmm12, %xmm12 3067; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm12 = xmm12 ^ (xmm7 & (xmm12 ^ xmm8)) 3068; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm1 ^ (ymm9 & (ymm2 ^ ymm1)) 3069; AVX512DQ-NEXT: vextracti128 $1, %ymm9, %xmm8 3070; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u] 3071; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u] 3072; AVX512DQ-NEXT: vpor %xmm8, %xmm9, %xmm8 3073; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm9 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6],xmm3[7] 3074; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm9[0,7,14] 3075; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero 3076; AVX512DQ-NEXT: vpor %xmm13, %xmm9, %xmm9 3077; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm9 = xmm9 ^ (xmm7 & (xmm9 ^ xmm8)) 3078; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm1 ^ (ymm10 & (ymm2 ^ ymm1)) 3079; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[6,13],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u] 3080; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm2 3081; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u] 3082; AVX512DQ-NEXT: vpor %xmm1, %xmm2, %xmm1 3083; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6],xmm3[7] 3084; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15] 3085; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[3,10],zero,zero,zero 3086; AVX512DQ-NEXT: vpor %xmm0, %xmm2, %xmm0 3087; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 ^ (xmm7 & (xmm0 ^ xmm1)) 3088; AVX512DQ-NEXT: vmovdqa %xmm5, (%rsi) 3089; AVX512DQ-NEXT: vmovdqa %xmm6, (%rdx) 3090; AVX512DQ-NEXT: vmovdqa %xmm11, (%rcx) 3091; AVX512DQ-NEXT: vmovdqa %xmm14, (%r8) 3092; AVX512DQ-NEXT: vmovdqa %xmm12, (%r9) 3093; AVX512DQ-NEXT: vmovdqa %xmm9, (%r10) 3094; AVX512DQ-NEXT: vmovdqa %xmm0, (%rax) 3095; AVX512DQ-NEXT: vzeroupper 3096; AVX512DQ-NEXT: retq 3097; 3098; AVX512DQ-FCP-LABEL: load_i8_stride7_vf16: 3099; AVX512DQ-FCP: # %bb.0: 3100; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 3101; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 3102; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128] 3103; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm0 3104; AVX512DQ-FCP-NEXT: vpshufb %xmm3, 
%xmm0, %xmm4 3105; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm1 3106; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 3107; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] 3108; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm2 ^ (ymm5 & (ymm1 ^ ymm2)) 3109; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 3110; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm3 3111; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4],xmm4[5,6,7] 3112; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm3 3113; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 3114; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] 3115; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [0,7,14,128,128,3,10,128,128,128,6,13,128,128,2,9] 3116; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm7 3117; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm5 3118; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7] 3119; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 3120; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] 3121; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, %ymm6 3122; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm2 ^ (ymm6 & (ymm1 ^ ymm2)) 3123; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 3124; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u] 3125; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u] 3126; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm6, %xmm9 3127; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0],xmm3[1],xmm4[2,3,4],xmm3[5],xmm4[6,7] 3128; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10] 3129; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero 3130; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 3131; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} xmm7 = [18446744073709551615,255] 3132; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} xmm6 = xmm6 ^ (xmm7 & (xmm6 ^ xmm9)) 3133; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] 3134; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm10 3135; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm2 ^ (ymm10 & (ymm1 ^ ymm2)) 3136; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u,u,u,u,u] 3137; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10 3138; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[0,7,14],zero,zero,xmm10[3,10,u,u,u,u,u,u,u] 3139; AVX512DQ-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10 3140; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] 3141; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm11[4,11] 3142; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero 3143; AVX512DQ-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 3144; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} xmm11 = xmm11 ^ (xmm7 & (xmm11 ^ xmm10)) 3145; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] 3146; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm12 3147; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm2 ^ (ymm12 & (ymm1 ^ ymm2)) 3148; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[3,10],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[u,u,u,u,u,u,u] 
3149; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12 3150; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[1,8,15],zero,zero,xmm12[4,11,u,u,u,u,u,u,u] 3151; AVX512DQ-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12 3152; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] 3153; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm13[5,12] 3154; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero 3155; AVX512DQ-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 3156; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} xmm14 = xmm14 ^ (xmm7 & (xmm14 ^ xmm12)) 3157; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm1 ^ (ymm8 & (ymm2 ^ ymm1)) 3158; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm12 3159; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u] 3160; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u] 3161; AVX512DQ-FCP-NEXT: vpor %xmm12, %xmm8, %xmm8 3162; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm13[6,13] 3163; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero 3164; AVX512DQ-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12 3165; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} xmm12 = xmm12 ^ (xmm7 & (xmm12 ^ xmm8)) 3166; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm1 ^ (ymm9 & (ymm2 ^ ymm1)) 3167; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm8 3168; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u] 3169; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u] 3170; AVX512DQ-FCP-NEXT: vpor %xmm8, %xmm9, %xmm8 3171; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6],xmm3[7] 3172; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm9[0,7,14] 3173; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero 3174; AVX512DQ-FCP-NEXT: vpor %xmm13, %xmm9, %xmm9 3175; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} xmm9 = xmm9 ^ (xmm7 & (xmm9 ^ xmm8)) 3176; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm1 ^ (ymm10 & (ymm2 ^ ymm1)) 3177; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[6,13],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u] 3178; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm2 3179; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u] 3180; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 3181; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6],xmm3[7] 3182; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15] 3183; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[3,10],zero,zero,zero 3184; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0 3185; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 ^ (xmm7 & (xmm0 ^ xmm1)) 3186; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, (%rsi) 3187; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, (%rdx) 3188; AVX512DQ-FCP-NEXT: vmovdqa %xmm11, (%rcx) 3189; AVX512DQ-FCP-NEXT: vmovdqa %xmm14, (%r8) 3190; AVX512DQ-FCP-NEXT: vmovdqa %xmm12, (%r9) 3191; AVX512DQ-FCP-NEXT: vmovdqa %xmm9, (%r10) 3192; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rax) 3193; AVX512DQ-FCP-NEXT: vzeroupper 3194; AVX512DQ-FCP-NEXT: retq 3195; 3196; AVX512BW-LABEL: 
load_i8_stride7_vf16: 3197; AVX512BW: # %bb.0: 3198; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 3199; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 3200; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128] 3201; AVX512BW-NEXT: vmovdqa 80(%rdi), %xmm0 3202; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm4 3203; AVX512BW-NEXT: vmovdqa (%rdi), %ymm2 3204; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1 3205; AVX512BW-NEXT: movw $-28382, %r11w # imm = 0x9122 3206; AVX512BW-NEXT: kmovd %r11d, %k1 3207; AVX512BW-NEXT: vpblendmw %ymm1, %ymm2, %ymm5 {%k1} 3208; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm6 3209; AVX512BW-NEXT: vpshufb %xmm3, %xmm6, %xmm3 3210; AVX512BW-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4],xmm4[5,6,7] 3211; AVX512BW-NEXT: vmovdqa 96(%rdi), %xmm3 3212; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm4 3213; AVX512BW-NEXT: vpblendd {{.*#+}} xmm7 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] 3214; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm8 = [0,7,14,128,128,3,10,128,128,128,6,13,128,128,2,9] 3215; AVX512BW-NEXT: vpshufb %xmm8, %xmm7, %xmm7 3216; AVX512BW-NEXT: vpshufb %xmm8, %xmm5, %xmm5 3217; AVX512BW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7] 3218; AVX512BW-NEXT: vpor %xmm6, %xmm5, %xmm5 3219; AVX512BW-NEXT: movw $4644, %di # imm = 0x1224 3220; AVX512BW-NEXT: kmovd %edi, %k2 3221; AVX512BW-NEXT: vpblendmw %ymm1, %ymm2, %ymm6 {%k2} 3222; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm7 3223; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u] 3224; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u] 3225; AVX512BW-NEXT: vpor %xmm7, %xmm6, %xmm6 3226; AVX512BW-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0],xmm3[1],xmm4[2,3,4],xmm3[5],xmm4[6,7] 3227; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10] 3228; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero 3229; AVX512BW-NEXT: vpor %xmm7, %xmm8, %xmm7 3230; AVX512BW-NEXT: movw $-512, %di # imm = 0xFE00 3231; AVX512BW-NEXT: kmovd %edi, %k1 3232; AVX512BW-NEXT: vmovdqu8 %xmm7, %xmm6 {%k1} 3233; AVX512BW-NEXT: movw $8772, %di # imm = 0x2244 3234; AVX512BW-NEXT: kmovd %edi, %k3 3235; AVX512BW-NEXT: vpblendmw %ymm1, %ymm2, %ymm7 {%k3} 3236; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u,u,u] 3237; AVX512BW-NEXT: vextracti128 $1, %ymm7, %xmm7 3238; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u,u,u] 3239; AVX512BW-NEXT: vpor %xmm7, %xmm8, %xmm7 3240; AVX512BW-NEXT: vpblendw {{.*#+}} xmm8 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] 3241; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm8[4,11] 3242; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero 3243; AVX512BW-NEXT: vpor %xmm9, %xmm8, %xmm8 3244; AVX512BW-NEXT: vmovdqu8 %xmm8, %xmm7 {%k1} 3245; AVX512BW-NEXT: movw $9288, %di # imm = 0x2448 3246; AVX512BW-NEXT: kmovd %edi, %k4 3247; AVX512BW-NEXT: vpblendmw %ymm1, %ymm2, %ymm8 {%k4} 3248; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u,u,u] 3249; AVX512BW-NEXT: vextracti128 $1, %ymm8, %xmm8 3250; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u,u,u] 3251; AVX512BW-NEXT: vpor %xmm9, %xmm8, %xmm8 3252; AVX512BW-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] 
3253; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm9[5,12] 3254; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero 3255; AVX512BW-NEXT: vpor %xmm11, %xmm10, %xmm10 3256; AVX512BW-NEXT: vmovdqu8 %xmm10, %xmm8 {%k1} 3257; AVX512BW-NEXT: vpblendmw %ymm2, %ymm1, %ymm10 {%k2} 3258; AVX512BW-NEXT: vextracti128 $1, %ymm10, %xmm11 3259; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u] 3260; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u,u,u] 3261; AVX512BW-NEXT: vpor %xmm11, %xmm10, %xmm10 3262; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm9[6,13] 3263; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero 3264; AVX512BW-NEXT: vpor %xmm11, %xmm9, %xmm9 3265; AVX512BW-NEXT: vmovdqu8 %xmm9, %xmm10 {%k1} 3266; AVX512BW-NEXT: vpblendmw %ymm2, %ymm1, %ymm9 {%k3} 3267; AVX512BW-NEXT: vextracti128 $1, %ymm9, %xmm11 3268; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u] 3269; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u] 3270; AVX512BW-NEXT: vpor %xmm11, %xmm9, %xmm9 3271; AVX512BW-NEXT: vpblendw {{.*#+}} xmm11 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6],xmm3[7] 3272; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm11[0,7,14] 3273; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero 3274; AVX512BW-NEXT: vpor %xmm12, %xmm11, %xmm11 3275; AVX512BW-NEXT: vmovdqu8 %xmm11, %xmm9 {%k1} 3276; AVX512BW-NEXT: vmovdqu16 %ymm2, %ymm1 {%k4} 3277; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u] 3278; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1 3279; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u] 3280; AVX512BW-NEXT: vpor %xmm2, %xmm1, %xmm1 3281; AVX512BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6],xmm3[7] 3282; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15] 3283; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[3,10],zero,zero,zero 3284; AVX512BW-NEXT: vpor %xmm0, %xmm2, %xmm0 3285; AVX512BW-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1} 3286; AVX512BW-NEXT: vmovdqa %xmm5, (%rsi) 3287; AVX512BW-NEXT: vmovdqa %xmm6, (%rdx) 3288; AVX512BW-NEXT: vmovdqa %xmm7, (%rcx) 3289; AVX512BW-NEXT: vmovdqa %xmm8, (%r8) 3290; AVX512BW-NEXT: vmovdqa %xmm10, (%r9) 3291; AVX512BW-NEXT: vmovdqa %xmm9, (%r10) 3292; AVX512BW-NEXT: vmovdqa %xmm1, (%rax) 3293; AVX512BW-NEXT: vzeroupper 3294; AVX512BW-NEXT: retq 3295; 3296; AVX512BW-FCP-LABEL: load_i8_stride7_vf16: 3297; AVX512BW-FCP: # %bb.0: 3298; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 3299; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 3300; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128] 3301; AVX512BW-FCP-NEXT: vmovdqa 80(%rdi), %xmm0 3302; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm4 3303; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm2 3304; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 3305; AVX512BW-FCP-NEXT: movw $-28382, %r11w # imm = 0x9122 3306; AVX512BW-FCP-NEXT: kmovd %r11d, %k1 3307; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm2, %ymm5 {%k1} 3308; AVX512BW-FCP-NEXT: vextracti128 
$1, %ymm5, %xmm6 3309; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm3 3310; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4],xmm4[5,6,7] 3311; AVX512BW-FCP-NEXT: vmovdqa 96(%rdi), %xmm3 3312; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 3313; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] 3314; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [0,7,14,128,128,3,10,128,128,128,6,13,128,128,2,9] 3315; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm7 3316; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm5 3317; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7] 3318; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 3319; AVX512BW-FCP-NEXT: movw $4644, %di # imm = 0x1224 3320; AVX512BW-FCP-NEXT: kmovd %edi, %k2 3321; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm2, %ymm6 {%k2} 3322; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 3323; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u] 3324; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u] 3325; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 3326; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0],xmm3[1],xmm4[2,3,4],xmm3[5],xmm4[6,7] 3327; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10] 3328; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero 3329; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 3330; AVX512BW-FCP-NEXT: movw $-512, %di # imm = 0xFE00 3331; AVX512BW-FCP-NEXT: kmovd %edi, %k1 3332; AVX512BW-FCP-NEXT: vmovdqu8 %xmm7, %xmm6 {%k1} 3333; AVX512BW-FCP-NEXT: movw $8772, %di # imm = 0x2244 3334; AVX512BW-FCP-NEXT: kmovd %edi, %k3 3335; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm2, %ymm7 {%k3} 3336; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u,u,u] 3337; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 3338; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u,u,u] 3339; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 3340; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] 3341; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm8[4,11] 3342; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero 3343; AVX512BW-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8 3344; AVX512BW-FCP-NEXT: vmovdqu8 %xmm8, %xmm7 {%k1} 3345; AVX512BW-FCP-NEXT: movw $9288, %di # imm = 0x2448 3346; AVX512BW-FCP-NEXT: kmovd %edi, %k4 3347; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm2, %ymm8 {%k4} 3348; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u,u,u] 3349; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8 3350; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u,u,u] 3351; AVX512BW-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8 3352; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] 3353; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm9[5,12] 3354; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero 3355; AVX512BW-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10 3356; AVX512BW-FCP-NEXT: vmovdqu8 %xmm10, %xmm8 {%k1} 3357; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm1, %ymm10 {%k2} 3358; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11 
3359; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u] 3360; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u,u,u] 3361; AVX512BW-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10 3362; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm9[6,13] 3363; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero 3364; AVX512BW-FCP-NEXT: vpor %xmm11, %xmm9, %xmm9 3365; AVX512BW-FCP-NEXT: vmovdqu8 %xmm9, %xmm10 {%k1} 3366; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm1, %ymm9 {%k3} 3367; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm9, %xmm11 3368; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u] 3369; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u] 3370; AVX512BW-FCP-NEXT: vpor %xmm11, %xmm9, %xmm9 3371; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6],xmm3[7] 3372; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm11[0,7,14] 3373; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero 3374; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 3375; AVX512BW-FCP-NEXT: vmovdqu8 %xmm11, %xmm9 {%k1} 3376; AVX512BW-FCP-NEXT: vmovdqu16 %ymm2, %ymm1 {%k4} 3377; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u] 3378; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 3379; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u] 3380; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 3381; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6],xmm3[7] 3382; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15] 3383; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[3,10],zero,zero,zero 3384; AVX512BW-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0 3385; AVX512BW-FCP-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1} 3386; AVX512BW-FCP-NEXT: vmovdqa %xmm5, (%rsi) 3387; AVX512BW-FCP-NEXT: vmovdqa %xmm6, (%rdx) 3388; AVX512BW-FCP-NEXT: vmovdqa %xmm7, (%rcx) 3389; AVX512BW-FCP-NEXT: vmovdqa %xmm8, (%r8) 3390; AVX512BW-FCP-NEXT: vmovdqa %xmm10, (%r9) 3391; AVX512BW-FCP-NEXT: vmovdqa %xmm9, (%r10) 3392; AVX512BW-FCP-NEXT: vmovdqa %xmm1, (%rax) 3393; AVX512BW-FCP-NEXT: vzeroupper 3394; AVX512BW-FCP-NEXT: retq 3395; 3396; AVX512DQ-BW-LABEL: load_i8_stride7_vf16: 3397; AVX512DQ-BW: # %bb.0: 3398; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 3399; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 3400; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128] 3401; AVX512DQ-BW-NEXT: vmovdqa 80(%rdi), %xmm0 3402; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm0, %xmm4 3403; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm2 3404; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm1 3405; AVX512DQ-BW-NEXT: movw $-28382, %r11w # imm = 0x9122 3406; AVX512DQ-BW-NEXT: kmovd %r11d, %k1 3407; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm2, %ymm5 {%k1} 3408; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm5, %xmm6 3409; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm6, %xmm3 3410; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4],xmm4[5,6,7] 3411; AVX512DQ-BW-NEXT: vmovdqa 96(%rdi), %xmm3 3412; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %xmm4 3413; AVX512DQ-BW-NEXT: vpblendd 
{{.*#+}} xmm7 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] 3414; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm8 = [0,7,14,128,128,3,10,128,128,128,6,13,128,128,2,9] 3415; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm7, %xmm7 3416; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm5, %xmm5 3417; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7] 3418; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm5, %xmm5 3419; AVX512DQ-BW-NEXT: movw $4644, %di # imm = 0x1224 3420; AVX512DQ-BW-NEXT: kmovd %edi, %k2 3421; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm2, %ymm6 {%k2} 3422; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm6, %xmm7 3423; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u] 3424; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u] 3425; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm6, %xmm6 3426; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0],xmm3[1],xmm4[2,3,4],xmm3[5],xmm4[6,7] 3427; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10] 3428; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero 3429; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm8, %xmm7 3430; AVX512DQ-BW-NEXT: movw $-512, %di # imm = 0xFE00 3431; AVX512DQ-BW-NEXT: kmovd %edi, %k1 3432; AVX512DQ-BW-NEXT: vmovdqu8 %xmm7, %xmm6 {%k1} 3433; AVX512DQ-BW-NEXT: movw $8772, %di # imm = 0x2244 3434; AVX512DQ-BW-NEXT: kmovd %edi, %k3 3435; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm2, %ymm7 {%k3} 3436; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u,u,u] 3437; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm7, %xmm7 3438; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u,u,u] 3439; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm8, %xmm7 3440; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm8 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] 3441; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm8[4,11] 3442; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero 3443; AVX512DQ-BW-NEXT: vpor %xmm9, %xmm8, %xmm8 3444; AVX512DQ-BW-NEXT: vmovdqu8 %xmm8, %xmm7 {%k1} 3445; AVX512DQ-BW-NEXT: movw $9288, %di # imm = 0x2448 3446; AVX512DQ-BW-NEXT: kmovd %edi, %k4 3447; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm2, %ymm8 {%k4} 3448; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u,u,u] 3449; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm8, %xmm8 3450; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u,u,u] 3451; AVX512DQ-BW-NEXT: vpor %xmm9, %xmm8, %xmm8 3452; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] 3453; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm9[5,12] 3454; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero 3455; AVX512DQ-BW-NEXT: vpor %xmm11, %xmm10, %xmm10 3456; AVX512DQ-BW-NEXT: vmovdqu8 %xmm10, %xmm8 {%k1} 3457; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm1, %ymm10 {%k2} 3458; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm10, %xmm11 3459; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u] 3460; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u,u,u] 3461; AVX512DQ-BW-NEXT: vpor %xmm11, %xmm10, %xmm10 3462; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = 
xmm9[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm9[6,13] 3463; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero 3464; AVX512DQ-BW-NEXT: vpor %xmm11, %xmm9, %xmm9 3465; AVX512DQ-BW-NEXT: vmovdqu8 %xmm9, %xmm10 {%k1} 3466; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm1, %ymm9 {%k3} 3467; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm9, %xmm11 3468; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u] 3469; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u] 3470; AVX512DQ-BW-NEXT: vpor %xmm11, %xmm9, %xmm9 3471; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm11 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6],xmm3[7] 3472; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm11[0,7,14] 3473; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero 3474; AVX512DQ-BW-NEXT: vpor %xmm12, %xmm11, %xmm11 3475; AVX512DQ-BW-NEXT: vmovdqu8 %xmm11, %xmm9 {%k1} 3476; AVX512DQ-BW-NEXT: vmovdqu16 %ymm2, %ymm1 {%k4} 3477; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u] 3478; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm1, %xmm1 3479; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u] 3480; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm1, %xmm1 3481; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6],xmm3[7] 3482; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15] 3483; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[3,10],zero,zero,zero 3484; AVX512DQ-BW-NEXT: vpor %xmm0, %xmm2, %xmm0 3485; AVX512DQ-BW-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1} 3486; AVX512DQ-BW-NEXT: vmovdqa %xmm5, (%rsi) 3487; AVX512DQ-BW-NEXT: vmovdqa %xmm6, (%rdx) 3488; AVX512DQ-BW-NEXT: vmovdqa %xmm7, (%rcx) 3489; AVX512DQ-BW-NEXT: vmovdqa %xmm8, (%r8) 3490; AVX512DQ-BW-NEXT: vmovdqa %xmm10, (%r9) 3491; AVX512DQ-BW-NEXT: vmovdqa %xmm9, (%r10) 3492; AVX512DQ-BW-NEXT: vmovdqa %xmm1, (%rax) 3493; AVX512DQ-BW-NEXT: vzeroupper 3494; AVX512DQ-BW-NEXT: retq 3495; 3496; AVX512DQ-BW-FCP-LABEL: load_i8_stride7_vf16: 3497; AVX512DQ-BW-FCP: # %bb.0: 3498; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 3499; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 3500; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128] 3501; AVX512DQ-BW-FCP-NEXT: vmovdqa 80(%rdi), %xmm0 3502; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm4 3503; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm2 3504; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 3505; AVX512DQ-BW-FCP-NEXT: movw $-28382, %r11w # imm = 0x9122 3506; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k1 3507; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm2, %ymm5 {%k1} 3508; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 3509; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm3 3510; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4],xmm4[5,6,7] 3511; AVX512DQ-BW-FCP-NEXT: vmovdqa 96(%rdi), %xmm3 3512; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 3513; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] 3514; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [0,7,14,128,128,3,10,128,128,128,6,13,128,128,2,9] 3515; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm7 3516; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm5 3517; AVX512DQ-BW-FCP-NEXT: 
vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7] 3518; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 3519; AVX512DQ-BW-FCP-NEXT: movw $4644, %di # imm = 0x1224 3520; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2 3521; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm2, %ymm6 {%k2} 3522; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 3523; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u] 3524; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u] 3525; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 3526; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0],xmm3[1],xmm4[2,3,4],xmm3[5],xmm4[6,7] 3527; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10] 3528; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero 3529; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 3530; AVX512DQ-BW-FCP-NEXT: movw $-512, %di # imm = 0xFE00 3531; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 3532; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm7, %xmm6 {%k1} 3533; AVX512DQ-BW-FCP-NEXT: movw $8772, %di # imm = 0x2244 3534; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k3 3535; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm2, %ymm7 {%k3} 3536; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u,u,u] 3537; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 3538; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u,u,u] 3539; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 3540; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] 3541; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm8[4,11] 3542; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero 3543; AVX512DQ-BW-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8 3544; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm8, %xmm7 {%k1} 3545; AVX512DQ-BW-FCP-NEXT: movw $9288, %di # imm = 0x2448 3546; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k4 3547; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm2, %ymm8 {%k4} 3548; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u,u,u] 3549; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8 3550; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u,u,u] 3551; AVX512DQ-BW-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8 3552; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] 3553; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm9[5,12] 3554; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero 3555; AVX512DQ-BW-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10 3556; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm10, %xmm8 {%k1} 3557; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm1, %ymm10 {%k2} 3558; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11 3559; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u] 3560; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u,u,u] 3561; AVX512DQ-BW-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10 3562; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm9[6,13] 3563; AVX512DQ-BW-FCP-NEXT: vpshufb 
{{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero 3564; AVX512DQ-BW-FCP-NEXT: vpor %xmm11, %xmm9, %xmm9 3565; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm9, %xmm10 {%k1} 3566; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm1, %ymm9 {%k3} 3567; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm9, %xmm11 3568; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u] 3569; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u] 3570; AVX512DQ-BW-FCP-NEXT: vpor %xmm11, %xmm9, %xmm9 3571; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6],xmm3[7] 3572; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm11[0,7,14] 3573; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero 3574; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 3575; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm11, %xmm9 {%k1} 3576; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm2, %ymm1 {%k4} 3577; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u] 3578; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 3579; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u] 3580; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 3581; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6],xmm3[7] 3582; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15] 3583; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[3,10],zero,zero,zero 3584; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0 3585; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1} 3586; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm5, (%rsi) 3587; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm6, (%rdx) 3588; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm7, (%rcx) 3589; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm8, (%r8) 3590; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm10, (%r9) 3591; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm9, (%r10) 3592; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm1, (%rax) 3593; AVX512DQ-BW-FCP-NEXT: vzeroupper 3594; AVX512DQ-BW-FCP-NEXT: retq 3595 %wide.vec = load <112 x i8>, ptr %in.vec, align 64 3596 %strided.vec0 = shufflevector <112 x i8> %wide.vec, <112 x i8> poison, <16 x i32> <i32 0, i32 7, i32 14, i32 21, i32 28, i32 35, i32 42, i32 49, i32 56, i32 63, i32 70, i32 77, i32 84, i32 91, i32 98, i32 105> 3597 %strided.vec1 = shufflevector <112 x i8> %wide.vec, <112 x i8> poison, <16 x i32> <i32 1, i32 8, i32 15, i32 22, i32 29, i32 36, i32 43, i32 50, i32 57, i32 64, i32 71, i32 78, i32 85, i32 92, i32 99, i32 106> 3598 %strided.vec2 = shufflevector <112 x i8> %wide.vec, <112 x i8> poison, <16 x i32> <i32 2, i32 9, i32 16, i32 23, i32 30, i32 37, i32 44, i32 51, i32 58, i32 65, i32 72, i32 79, i32 86, i32 93, i32 100, i32 107> 3599 %strided.vec3 = shufflevector <112 x i8> %wide.vec, <112 x i8> poison, <16 x i32> <i32 3, i32 10, i32 17, i32 24, i32 31, i32 38, i32 45, i32 52, i32 59, i32 66, i32 73, i32 80, i32 87, i32 94, i32 101, i32 108> 3600 %strided.vec4 = shufflevector <112 x i8> %wide.vec, <112 x i8> poison, <16 x i32> <i32 4, i32 11, i32 18, i32 25, i32 32, i32 39, i32 46, i32 53, i32 60, i32 67, i32 74, i32 81, i32 88, i32 95, i32 102, i32 109> 3601 %strided.vec5 = shufflevector <112 x i8> %wide.vec, <112 x i8> poison, <16 x i32> <i32 5, i32 12, i32 19, i32 26, i32 33, i32 40, i32 47, i32 54, 
i32 61, i32 68, i32 75, i32 82, i32 89, i32 96, i32 103, i32 110> 3602 %strided.vec6 = shufflevector <112 x i8> %wide.vec, <112 x i8> poison, <16 x i32> <i32 6, i32 13, i32 20, i32 27, i32 34, i32 41, i32 48, i32 55, i32 62, i32 69, i32 76, i32 83, i32 90, i32 97, i32 104, i32 111> 3603 store <16 x i8> %strided.vec0, ptr %out.vec0, align 64 3604 store <16 x i8> %strided.vec1, ptr %out.vec1, align 64 3605 store <16 x i8> %strided.vec2, ptr %out.vec2, align 64 3606 store <16 x i8> %strided.vec3, ptr %out.vec3, align 64 3607 store <16 x i8> %strided.vec4, ptr %out.vec4, align 64 3608 store <16 x i8> %strided.vec5, ptr %out.vec5, align 64 3609 store <16 x i8> %strided.vec6, ptr %out.vec6, align 64 3610 ret void 3611} 3612 3613define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { 3614; SSE-LABEL: load_i8_stride7_vf32: 3615; SSE: # %bb.0: 3616; SSE-NEXT: subq $648, %rsp # imm = 0x288 3617; SSE-NEXT: movdqa 208(%rdi), %xmm14 3618; SSE-NEXT: movdqa 192(%rdi), %xmm5 3619; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3620; SSE-NEXT: movdqa 176(%rdi), %xmm13 3621; SSE-NEXT: movdqa 112(%rdi), %xmm4 3622; SSE-NEXT: movdqa 128(%rdi), %xmm11 3623; SSE-NEXT: movdqa 160(%rdi), %xmm7 3624; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3625; SSE-NEXT: movdqa 144(%rdi), %xmm1 3626; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3627; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,0,65535,65535] 3628; SSE-NEXT: movdqa %xmm2, %xmm0 3629; SSE-NEXT: pandn %xmm1, %xmm0 3630; SSE-NEXT: movdqa %xmm7, %xmm1 3631; SSE-NEXT: pand %xmm2, %xmm1 3632; SSE-NEXT: movdqa %xmm2, %xmm9 3633; SSE-NEXT: por %xmm0, %xmm1 3634; SSE-NEXT: pxor %xmm10, %xmm10 3635; SSE-NEXT: movdqa %xmm1, %xmm0 3636; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15] 3637; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] 3638; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 3639; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] 3640; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] 3641; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,5,6] 3642; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] 3643; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] 3644; SSE-NEXT: packuswb %xmm0, %xmm2 3645; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] 3646; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,0,65535,65535,65535,0,65535] 3647; SSE-NEXT: movdqa %xmm7, %xmm1 3648; SSE-NEXT: pandn %xmm11, %xmm1 3649; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3650; SSE-NEXT: movdqa %xmm4, %xmm3 3651; SSE-NEXT: movdqa %xmm4, %xmm12 3652; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3653; SSE-NEXT: pand %xmm7, %xmm3 3654; SSE-NEXT: movdqa %xmm7, %xmm8 3655; SSE-NEXT: por %xmm1, %xmm3 3656; SSE-NEXT: movdqa %xmm3, %xmm1 3657; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15] 3658; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,65535,0,65535,0,65535] 3659; SSE-NEXT: 
movdqa %xmm7, %xmm4 3660; SSE-NEXT: pandn %xmm1, %xmm4 3661; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3],xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] 3662; SSE-NEXT: pand %xmm7, %xmm3 3663; SSE-NEXT: movdqa %xmm7, %xmm15 3664; SSE-NEXT: por %xmm4, %xmm3 3665; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,1,3] 3666; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] 3667; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,1,1] 3668; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] 3669; SSE-NEXT: packuswb %xmm1, %xmm1 3670; SSE-NEXT: pand %xmm0, %xmm1 3671; SSE-NEXT: movdqa %xmm0, %xmm3 3672; SSE-NEXT: pandn %xmm2, %xmm3 3673; SSE-NEXT: por %xmm3, %xmm1 3674; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,0,65535] 3675; SSE-NEXT: movdqa %xmm7, %xmm2 3676; SSE-NEXT: pandn %xmm13, %xmm2 3677; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3678; SSE-NEXT: movdqa %xmm5, %xmm3 3679; SSE-NEXT: pand %xmm7, %xmm3 3680; SSE-NEXT: por %xmm2, %xmm3 3681; SSE-NEXT: movdqa %xmm3, %xmm2 3682; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] 3683; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7] 3684; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] 3685; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] 3686; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] 3687; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] 3688; SSE-NEXT: movdqa %xmm14, %xmm3 3689; SSE-NEXT: movdqa %xmm14, %xmm4 3690; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15] 3691; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3692; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3],xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] 3693; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3694; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] 3695; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] 3696; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] 3697; SSE-NEXT: packuswb %xmm3, %xmm3 3698; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,65535,65535,0] 3699; SSE-NEXT: movdqa %xmm5, %xmm4 3700; SSE-NEXT: pandn %xmm3, %xmm4 3701; SSE-NEXT: packuswb %xmm2, %xmm2 3702; SSE-NEXT: pand %xmm5, %xmm2 3703; SSE-NEXT: movdqa %xmm5, %xmm6 3704; SSE-NEXT: por %xmm2, %xmm4 3705; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,0,0] 3706; SSE-NEXT: movdqa %xmm5, %xmm2 3707; SSE-NEXT: pandn %xmm4, %xmm2 3708; SSE-NEXT: pand %xmm5, %xmm1 3709; SSE-NEXT: por %xmm1, %xmm2 3710; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3711; SSE-NEXT: movdqa 32(%rdi), %xmm2 3712; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3713; SSE-NEXT: movdqa %xmm9, %xmm1 3714; SSE-NEXT: pandn %xmm2, %xmm1 3715; SSE-NEXT: movdqa 48(%rdi), %xmm2 3716; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3717; SSE-NEXT: pand %xmm9, %xmm2 3718; SSE-NEXT: por %xmm1, %xmm2 3719; 
SSE-NEXT: movdqa %xmm2, %xmm1 3720; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15] 3721; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] 3722; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 3723; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] 3724; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] 3725; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6] 3726; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] 3727; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] 3728; SSE-NEXT: packuswb %xmm1, %xmm2 3729; SSE-NEXT: movdqa 16(%rdi), %xmm14 3730; SSE-NEXT: movdqa %xmm8, %xmm1 3731; SSE-NEXT: pandn %xmm14, %xmm1 3732; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3733; SSE-NEXT: movdqa (%rdi), %xmm4 3734; SSE-NEXT: movdqa %xmm4, %xmm3 3735; SSE-NEXT: movdqa %xmm4, %xmm9 3736; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3737; SSE-NEXT: pand %xmm8, %xmm3 3738; SSE-NEXT: por %xmm1, %xmm3 3739; SSE-NEXT: movdqa %xmm3, %xmm1 3740; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15] 3741; SSE-NEXT: movdqa %xmm15, %xmm4 3742; SSE-NEXT: pandn %xmm1, %xmm4 3743; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3],xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] 3744; SSE-NEXT: pand %xmm15, %xmm3 3745; SSE-NEXT: por %xmm4, %xmm3 3746; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,1,3] 3747; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] 3748; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,1,1] 3749; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] 3750; SSE-NEXT: packuswb %xmm1, %xmm1 3751; SSE-NEXT: pand %xmm0, %xmm1 3752; SSE-NEXT: pandn %xmm2, %xmm0 3753; SSE-NEXT: por %xmm0, %xmm1 3754; SSE-NEXT: movdqa 64(%rdi), %xmm15 3755; SSE-NEXT: movdqa %xmm7, %xmm0 3756; SSE-NEXT: pandn %xmm15, %xmm0 3757; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3758; SSE-NEXT: movdqa 80(%rdi), %xmm8 3759; SSE-NEXT: movdqa %xmm8, %xmm2 3760; SSE-NEXT: pand %xmm7, %xmm2 3761; SSE-NEXT: por %xmm0, %xmm2 3762; SSE-NEXT: movdqa %xmm2, %xmm0 3763; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] 3764; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] 3765; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm10[8],xmm2[9],xmm10[9],xmm2[10],xmm10[10],xmm2[11],xmm10[11],xmm2[12],xmm10[12],xmm2[13],xmm10[13],xmm2[14],xmm10[14],xmm2[15],xmm10[15] 3766; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] 3767; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] 3768; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 3769; SSE-NEXT: movdqa 96(%rdi), %xmm2 3770; SSE-NEXT: movdqa %xmm2, %xmm3 3771; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] 3772; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3773; SSE-NEXT: punpcklbw 
{{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] 3774; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3775; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] 3776; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] 3777; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] 3778; SSE-NEXT: packuswb %xmm2, %xmm2 3779; SSE-NEXT: movdqa %xmm6, %xmm3 3780; SSE-NEXT: pandn %xmm2, %xmm3 3781; SSE-NEXT: packuswb %xmm0, %xmm0 3782; SSE-NEXT: pand %xmm6, %xmm0 3783; SSE-NEXT: por %xmm0, %xmm3 3784; SSE-NEXT: pand %xmm5, %xmm1 3785; SSE-NEXT: pandn %xmm3, %xmm5 3786; SSE-NEXT: por %xmm1, %xmm5 3787; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3788; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,0,65535,65535] 3789; SSE-NEXT: movdqa %xmm2, %xmm0 3790; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 3791; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 3792; SSE-NEXT: pand %xmm2, %xmm1 3793; SSE-NEXT: por %xmm0, %xmm1 3794; SSE-NEXT: movdqa %xmm1, %xmm2 3795; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm10[8],xmm2[9],xmm10[9],xmm2[10],xmm10[10],xmm2[11],xmm10[11],xmm2[12],xmm10[12],xmm2[13],xmm10[13],xmm2[14],xmm10[14],xmm2[15],xmm10[15] 3796; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,65535,65535] 3797; SSE-NEXT: movdqa %xmm0, %xmm3 3798; SSE-NEXT: pandn %xmm2, %xmm3 3799; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] 3800; SSE-NEXT: pand %xmm0, %xmm1 3801; SSE-NEXT: por %xmm3, %xmm1 3802; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] 3803; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] 3804; SSE-NEXT: psrld $16, %xmm2 3805; SSE-NEXT: packuswb %xmm2, %xmm1 3806; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255] 3807; SSE-NEXT: movdqa %xmm5, %xmm2 3808; SSE-NEXT: pandn %xmm1, %xmm2 3809; SSE-NEXT: movdqa %xmm7, %xmm1 3810; SSE-NEXT: pandn %xmm11, %xmm1 3811; SSE-NEXT: movdqa %xmm12, %xmm3 3812; SSE-NEXT: pand %xmm7, %xmm3 3813; SSE-NEXT: movdqa %xmm7, %xmm12 3814; SSE-NEXT: por %xmm1, %xmm3 3815; SSE-NEXT: movdqa %xmm3, %xmm1 3816; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] 3817; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,0,65535] 3818; SSE-NEXT: movdqa %xmm7, %xmm4 3819; SSE-NEXT: pandn %xmm1, %xmm4 3820; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] 3821; SSE-NEXT: pand %xmm7, %xmm3 3822; SSE-NEXT: por %xmm4, %xmm3 3823; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,3,2,3] 3824; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] 3825; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] 3826; SSE-NEXT: packuswb %xmm1, %xmm1 3827; SSE-NEXT: pand %xmm5, %xmm1 3828; SSE-NEXT: movdqa %xmm5, %xmm7 3829; SSE-NEXT: por %xmm2, %xmm1 3830; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,65535,0,65535] 3831; SSE-NEXT: movdqa %xmm4, %xmm2 3832; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 3833; SSE-NEXT: movdqa %xmm13, %xmm3 3834; SSE-NEXT: pand 
%xmm4, %xmm3 3835; SSE-NEXT: movdqa %xmm4, %xmm13 3836; SSE-NEXT: por %xmm2, %xmm3 3837; SSE-NEXT: movdqa %xmm3, %xmm2 3838; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm10[8],xmm2[9],xmm10[9],xmm2[10],xmm10[10],xmm2[11],xmm10[11],xmm2[12],xmm10[12],xmm2[13],xmm10[13],xmm2[14],xmm10[14],xmm2[15],xmm10[15] 3839; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,65535,0,65535,0,65535] 3840; SSE-NEXT: movdqa %xmm11, %xmm4 3841; SSE-NEXT: pandn %xmm2, %xmm4 3842; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3],xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] 3843; SSE-NEXT: pand %xmm11, %xmm3 3844; SSE-NEXT: por %xmm4, %xmm3 3845; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 3846; SSE-NEXT: pslld $16, %xmm2 3847; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 3848; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] 3849; SSE-NEXT: packuswb %xmm4, %xmm2 3850; SSE-NEXT: movdqa %xmm6, %xmm4 3851; SSE-NEXT: pandn %xmm2, %xmm4 3852; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,3,2,3] 3853; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,3,2,4,5,6,7] 3854; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] 3855; SSE-NEXT: packuswb %xmm2, %xmm2 3856; SSE-NEXT: pand %xmm6, %xmm2 3857; SSE-NEXT: por %xmm2, %xmm4 3858; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] 3859; SSE-NEXT: movdqa %xmm3, %xmm2 3860; SSE-NEXT: pandn %xmm4, %xmm2 3861; SSE-NEXT: pand %xmm3, %xmm1 3862; SSE-NEXT: por %xmm1, %xmm2 3863; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3864; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535] 3865; SSE-NEXT: movdqa %xmm5, %xmm1 3866; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 3867; SSE-NEXT: pandn %xmm6, %xmm1 3868; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 3869; SSE-NEXT: movdqa %xmm10, %xmm2 3870; SSE-NEXT: pand %xmm5, %xmm2 3871; SSE-NEXT: por %xmm1, %xmm2 3872; SSE-NEXT: movdqa %xmm2, %xmm1 3873; SSE-NEXT: pxor %xmm3, %xmm3 3874; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] 3875; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] 3876; SSE-NEXT: pxor %xmm5, %xmm5 3877; SSE-NEXT: pand %xmm0, %xmm2 3878; SSE-NEXT: pandn %xmm1, %xmm0 3879; SSE-NEXT: por %xmm2, %xmm0 3880; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] 3881; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,7,6] 3882; SSE-NEXT: psrld $16, %xmm1 3883; SSE-NEXT: packuswb %xmm1, %xmm0 3884; SSE-NEXT: movdqa %xmm7, %xmm4 3885; SSE-NEXT: movdqa %xmm7, %xmm1 3886; SSE-NEXT: pandn %xmm0, %xmm1 3887; SSE-NEXT: movdqa %xmm12, %xmm0 3888; SSE-NEXT: pandn %xmm14, %xmm0 3889; SSE-NEXT: movdqa %xmm9, %xmm2 3890; SSE-NEXT: pand %xmm12, %xmm2 3891; SSE-NEXT: por %xmm0, %xmm2 3892; SSE-NEXT: movdqa %xmm2, %xmm0 3893; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] 3894; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,0,65535] 3895; SSE-NEXT: movdqa %xmm7, %xmm3 3896; SSE-NEXT: pandn %xmm0, %xmm3 3897; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = 
xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] 3898; SSE-NEXT: pand %xmm7, %xmm2 3899; SSE-NEXT: por %xmm3, %xmm2 3900; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,3,2,3] 3901; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] 3902; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] 3903; SSE-NEXT: packuswb %xmm0, %xmm0 3904; SSE-NEXT: pand %xmm4, %xmm0 3905; SSE-NEXT: por %xmm1, %xmm0 3906; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3907; SSE-NEXT: movdqa %xmm13, %xmm0 3908; SSE-NEXT: pandn %xmm8, %xmm0 3909; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3910; SSE-NEXT: movdqa %xmm15, %xmm1 3911; SSE-NEXT: pand %xmm13, %xmm1 3912; SSE-NEXT: por %xmm0, %xmm1 3913; SSE-NEXT: movdqa %xmm1, %xmm0 3914; SSE-NEXT: pxor %xmm2, %xmm2 3915; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] 3916; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 3917; SSE-NEXT: movdqa %xmm11, %xmm2 3918; SSE-NEXT: pand %xmm11, %xmm1 3919; SSE-NEXT: pandn %xmm0, %xmm2 3920; SSE-NEXT: por %xmm1, %xmm2 3921; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3922; SSE-NEXT: movdqa %xmm13, %xmm0 3923; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 3924; SSE-NEXT: pandn %xmm1, %xmm0 3925; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 3926; SSE-NEXT: movdqa %xmm5, %xmm9 3927; SSE-NEXT: pand %xmm13, %xmm9 3928; SSE-NEXT: por %xmm0, %xmm9 3929; SSE-NEXT: movdqa %xmm6, %xmm0 3930; SSE-NEXT: pand %xmm13, %xmm0 3931; SSE-NEXT: pandn %xmm10, %xmm13 3932; SSE-NEXT: por %xmm0, %xmm13 3933; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3934; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,0,65535,65535,65535] 3935; SSE-NEXT: movdqa %xmm2, %xmm0 3936; SSE-NEXT: pandn %xmm5, %xmm0 3937; SSE-NEXT: movdqa %xmm12, %xmm7 3938; SSE-NEXT: movdqa %xmm12, %xmm5 3939; SSE-NEXT: pandn %xmm1, %xmm5 3940; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3941; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,2,2,3] 3942; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3943; SSE-NEXT: pand %xmm2, %xmm1 3944; SSE-NEXT: por %xmm0, %xmm1 3945; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3946; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 3947; SSE-NEXT: pand %xmm2, %xmm13 3948; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 3949; SSE-NEXT: pand %xmm2, %xmm12 3950; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 3951; SSE-NEXT: pand %xmm2, %xmm14 3952; SSE-NEXT: pand %xmm2, %xmm8 3953; SSE-NEXT: movdqa %xmm8, (%rsp) # 16-byte Spill 3954; SSE-NEXT: movdqa %xmm7, %xmm1 3955; SSE-NEXT: pandn %xmm10, %xmm1 3956; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3957; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,2,2,3] 3958; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3959; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3960; SSE-NEXT: pand %xmm2, %xmm10 3961; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3962; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3963; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill 3964; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3965; SSE-NEXT: pandn %xmm6, %xmm2 3966; SSE-NEXT: por %xmm10, %xmm2 3967; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3968; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,65535,65535] 3969; SSE-NEXT: movdqa %xmm7, %xmm1 3970; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 3971; SSE-NEXT: pandn %xmm2, %xmm1 3972; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3973; SSE-NEXT: movdqa %xmm2, %xmm5 3974; SSE-NEXT: movdqa %xmm2, %xmm3 3975; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 3976; SSE-NEXT: movdqa %xmm6, %xmm8 3977; SSE-NEXT: pslld $16, %xmm8 3978; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 3979; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 3980; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] 3981; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 3982; SSE-NEXT: movdqa %xmm1, %xmm15 3983; SSE-NEXT: psrldq {{.*#+}} xmm15 = xmm15[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 3984; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3] 3985; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] 3986; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3987; SSE-NEXT: movdqa %xmm0, %xmm3 3988; SSE-NEXT: movdqa %xmm1, %xmm0 3989; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] 3990; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3991; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3992; SSE-NEXT: pxor %xmm10, %xmm10 3993; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] 3994; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,7] 3995; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,2,0] 3996; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,6,4,6,5] 3997; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,65535,0,65535] 3998; SSE-NEXT: pand %xmm4, %xmm0 3999; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4000; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4001; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4002; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] 4003; SSE-NEXT: pxor %xmm9, %xmm9 4004; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm0[0,1,2,3,7,5,6,7] 4005; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,2,0] 4006; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,4,6,5] 4007; SSE-NEXT: movdqa %xmm4, %xmm0 4008; SSE-NEXT: pand %xmm4, %xmm10 4009; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4010; SSE-NEXT: pandn %xmm3, %xmm4 4011; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4012; SSE-NEXT: pand %xmm0, %xmm2 4013; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4014; SSE-NEXT: movdqa %xmm1, %xmm4 4015; SSE-NEXT: pand %xmm0, %xmm4 4016; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4017; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4018; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill 4019; SSE-NEXT: movdqa %xmm6, %xmm4 4020; SSE-NEXT: pandn %xmm6, %xmm0 4021; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4022; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4023; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4024; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] 4025; SSE-NEXT: pand %xmm7, %xmm0 4026; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4027; SSE-NEXT: pand %xmm7, %xmm3 4028; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4029; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 4030; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4031; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] 4032; SSE-NEXT: pand %xmm7, %xmm6 4033; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4034; SSE-NEXT: pand %xmm7, %xmm4 4035; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4036; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4037; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4038; SSE-NEXT: pandn %xmm1, %xmm7 4039; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4040; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] 4041; SSE-NEXT: packuswb %xmm1, %xmm1 4042; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,65535,0] 4043; SSE-NEXT: movdqa %xmm0, %xmm10 4044; SSE-NEXT: pandn %xmm1, %xmm10 4045; SSE-NEXT: pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 4046; SSE-NEXT: # xmm1 = mem[0,3,2,3] 4047; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,3,2,4,5,6,7] 4048; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,6,7] 4049; SSE-NEXT: packuswb %xmm1, %xmm1 4050; SSE-NEXT: pand %xmm0, %xmm1 4051; SSE-NEXT: movdqa %xmm0, %xmm2 4052; SSE-NEXT: por %xmm1, %xmm10 4053; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] 4054; SSE-NEXT: movdqa %xmm0, %xmm3 4055; SSE-NEXT: pandn %xmm10, %xmm3 4056; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4057; SSE-NEXT: pand %xmm0, %xmm1 4058; SSE-NEXT: movdqa %xmm0, %xmm8 4059; SSE-NEXT: por %xmm1, %xmm3 4060; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4061; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,0,65535] 4062; SSE-NEXT: movdqa %xmm0, %xmm1 4063; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 4064; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 4065; SSE-NEXT: pand %xmm0, %xmm10 4066; SSE-NEXT: por %xmm1, %xmm10 4067; SSE-NEXT: movdqa %xmm10, %xmm1 4068; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] 4069; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,0,65535] 4070; SSE-NEXT: movdqa %xmm3, %xmm0 4071; SSE-NEXT: pandn %xmm1, %xmm0 4072; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] 4073; SSE-NEXT: pand %xmm3, %xmm10 4074; SSE-NEXT: por %xmm0, %xmm10 4075; SSE-NEXT: packuswb %xmm5, %xmm0 4076; 
SSE-NEXT: movdqa %xmm2, %xmm1 4077; SSE-NEXT: pandn %xmm0, %xmm1 4078; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,3,2,3] 4079; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] 4080; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] 4081; SSE-NEXT: packuswb %xmm0, %xmm0 4082; SSE-NEXT: pand %xmm2, %xmm0 4083; SSE-NEXT: por %xmm0, %xmm1 4084; SSE-NEXT: movdqa %xmm8, %xmm0 4085; SSE-NEXT: pandn %xmm1, %xmm0 4086; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4087; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 4088; SSE-NEXT: por %xmm1, %xmm13 4089; SSE-NEXT: movdqa %xmm13, %xmm1 4090; SSE-NEXT: pxor %xmm6, %xmm6 4091; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] 4092; SSE-NEXT: movdqa %xmm3, %xmm2 4093; SSE-NEXT: pandn %xmm1, %xmm2 4094; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3],xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7] 4095; SSE-NEXT: pand %xmm3, %xmm13 4096; SSE-NEXT: movdqa %xmm3, %xmm5 4097; SSE-NEXT: por %xmm2, %xmm13 4098; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 4099; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,3,2,3] 4100; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1] 4101; SSE-NEXT: movdqa %xmm11, %xmm2 4102; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] 4103; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,65535] 4104; SSE-NEXT: movdqa %xmm1, %xmm10 4105; SSE-NEXT: pandn %xmm2, %xmm10 4106; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3],xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7] 4107; SSE-NEXT: pand %xmm1, %xmm11 4108; SSE-NEXT: por %xmm10, %xmm11 4109; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,1,2,1] 4110; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] 4111; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] 4112; SSE-NEXT: packuswb %xmm2, %xmm3 4113; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255] 4114; SSE-NEXT: movdqa %xmm6, %xmm4 4115; SSE-NEXT: pandn %xmm3, %xmm4 4116; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,2,1,3] 4117; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] 4118; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,3,3] 4119; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7] 4120; SSE-NEXT: packuswb %xmm2, %xmm2 4121; SSE-NEXT: pand %xmm6, %xmm2 4122; SSE-NEXT: movdqa %xmm6, %xmm13 4123; SSE-NEXT: por %xmm2, %xmm4 4124; SSE-NEXT: pand %xmm8, %xmm4 4125; SSE-NEXT: por %xmm0, %xmm4 4126; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4127; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,65535,65535,0,65535] 4128; SSE-NEXT: movdqa %xmm10, %xmm0 4129; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4130; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 4131; SSE-NEXT: pand %xmm10, %xmm2 4132; SSE-NEXT: por %xmm0, %xmm2 4133; SSE-NEXT: movdqa %xmm2, %xmm0 4134; SSE-NEXT: pxor %xmm6, %xmm6 4135; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] 4136; SSE-NEXT: movdqa %xmm5, %xmm11 4137; SSE-NEXT: 
movdqa %xmm5, %xmm3 4138; SSE-NEXT: pandn %xmm0, %xmm3 4139; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] 4140; SSE-NEXT: pand %xmm5, %xmm2 4141; SSE-NEXT: por %xmm3, %xmm2 4142; SSE-NEXT: packuswb %xmm15, %xmm0 4143; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,65535,65535,0] 4144; SSE-NEXT: movdqa %xmm4, %xmm3 4145; SSE-NEXT: pandn %xmm0, %xmm3 4146; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,3,2,3] 4147; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] 4148; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] 4149; SSE-NEXT: packuswb %xmm0, %xmm0 4150; SSE-NEXT: pand %xmm4, %xmm0 4151; SSE-NEXT: por %xmm0, %xmm3 4152; SSE-NEXT: movdqa %xmm8, %xmm0 4153; SSE-NEXT: movdqa %xmm8, %xmm15 4154; SSE-NEXT: pandn %xmm3, %xmm0 4155; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 4156; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 4157; SSE-NEXT: pandn %xmm5, %xmm2 4158; SSE-NEXT: por %xmm2, %xmm12 4159; SSE-NEXT: movdqa %xmm12, %xmm2 4160; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] 4161; SSE-NEXT: movdqa %xmm11, %xmm3 4162; SSE-NEXT: pandn %xmm2, %xmm3 4163; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] 4164; SSE-NEXT: pand %xmm11, %xmm12 4165; SSE-NEXT: por %xmm3, %xmm12 4166; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 4167; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,3,2,3] 4168; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 4169; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 4170; SSE-NEXT: movdqa %xmm3, %xmm2 4171; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] 4172; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] 4173; SSE-NEXT: pand %xmm1, %xmm3 4174; SSE-NEXT: pandn %xmm2, %xmm1 4175; SSE-NEXT: por %xmm3, %xmm1 4176; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] 4177; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] 4178; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] 4179; SSE-NEXT: packuswb %xmm2, %xmm1 4180; SSE-NEXT: movdqa %xmm13, %xmm2 4181; SSE-NEXT: pandn %xmm1, %xmm2 4182; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,2,1,3] 4183; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] 4184; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,3,3] 4185; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] 4186; SSE-NEXT: packuswb %xmm1, %xmm1 4187; SSE-NEXT: pand %xmm13, %xmm1 4188; SSE-NEXT: por %xmm1, %xmm2 4189; SSE-NEXT: pand %xmm15, %xmm2 4190; SSE-NEXT: por %xmm0, %xmm2 4191; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4192; SSE-NEXT: movdqa %xmm10, %xmm0 4193; SSE-NEXT: pandn %xmm7, %xmm0 4194; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 4195; SSE-NEXT: pand %xmm10, %xmm2 4196; SSE-NEXT: por %xmm0, %xmm2 4197; SSE-NEXT: movdqa %xmm2, %xmm0 4198; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = 
xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] 4199; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] 4200; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] 4201; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,3,2,3,4,5,6,7] 4202; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] 4203; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] 4204; SSE-NEXT: psrlq $48, %xmm0 4205; SSE-NEXT: packuswb %xmm0, %xmm1 4206; SSE-NEXT: movdqa %xmm13, %xmm0 4207; SSE-NEXT: pandn %xmm1, %xmm0 4208; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,0,65535,65535] 4209; SSE-NEXT: movdqa %xmm3, %xmm1 4210; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 4211; SSE-NEXT: pandn %xmm9, %xmm1 4212; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 4213; SSE-NEXT: movdqa %xmm7, %xmm2 4214; SSE-NEXT: pand %xmm3, %xmm2 4215; SSE-NEXT: por %xmm1, %xmm2 4216; SSE-NEXT: movdqa %xmm2, %xmm1 4217; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] 4218; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,0,65535,65535,65535,65535] 4219; SSE-NEXT: movdqa %xmm4, %xmm3 4220; SSE-NEXT: pandn %xmm1, %xmm3 4221; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] 4222; SSE-NEXT: pand %xmm4, %xmm2 4223; SSE-NEXT: por %xmm3, %xmm2 4224; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[3,2,1,0,4,5,6,7] 4225; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] 4226; SSE-NEXT: packuswb %xmm1, %xmm1 4227; SSE-NEXT: pand %xmm13, %xmm1 4228; SSE-NEXT: por %xmm0, %xmm1 4229; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 4230; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4231; SSE-NEXT: pandn %xmm12, %xmm0 4232; SSE-NEXT: por %xmm0, %xmm14 4233; SSE-NEXT: movdqa %xmm14, %xmm0 4234; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] 4235; SSE-NEXT: movdqa %xmm11, %xmm2 4236; SSE-NEXT: pandn %xmm0, %xmm2 4237; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3],xmm14[4],xmm6[4],xmm14[5],xmm6[5],xmm14[6],xmm6[6],xmm14[7],xmm6[7] 4238; SSE-NEXT: pand %xmm11, %xmm14 4239; SSE-NEXT: por %xmm2, %xmm14 4240; SSE-NEXT: pshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4241; SSE-NEXT: # xmm0 = mem[2,1,2,3,4,5,6,7] 4242; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 4243; SSE-NEXT: packuswb %xmm0, %xmm0 4244; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,65535,65535,0] 4245; SSE-NEXT: movdqa %xmm10, %xmm2 4246; SSE-NEXT: pandn %xmm0, %xmm2 4247; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[0,2,1,0,4,5,6,7] 4248; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] 4249; SSE-NEXT: packuswb %xmm0, %xmm0 4250; SSE-NEXT: pand %xmm10, %xmm0 4251; SSE-NEXT: por %xmm0, %xmm2 4252; SSE-NEXT: movdqa %xmm15, %xmm0 4253; SSE-NEXT: pandn %xmm2, %xmm0 4254; SSE-NEXT: pand %xmm15, %xmm1 4255; SSE-NEXT: por %xmm1, %xmm0 4256; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
4257; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,65535,65535,0,65535] 4258; SSE-NEXT: movdqa %xmm11, %xmm0 4259; SSE-NEXT: pandn %xmm8, %xmm0 4260; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4261; SSE-NEXT: pand %xmm11, %xmm1 4262; SSE-NEXT: por %xmm0, %xmm1 4263; SSE-NEXT: movdqa %xmm1, %xmm0 4264; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] 4265; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] 4266; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 4267; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] 4268; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] 4269; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] 4270; SSE-NEXT: psrlq $48, %xmm0 4271; SSE-NEXT: packuswb %xmm0, %xmm1 4272; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,65535,65535,0,65535,65535] 4273; SSE-NEXT: movdqa %xmm8, %xmm0 4274; SSE-NEXT: movdqa %xmm5, %xmm11 4275; SSE-NEXT: pandn %xmm5, %xmm0 4276; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 4277; SSE-NEXT: movdqa %xmm3, %xmm2 4278; SSE-NEXT: pand %xmm8, %xmm2 4279; SSE-NEXT: por %xmm0, %xmm2 4280; SSE-NEXT: movdqa %xmm2, %xmm0 4281; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] 4282; SSE-NEXT: movdqa %xmm4, %xmm5 4283; SSE-NEXT: pandn %xmm0, %xmm5 4284; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] 4285; SSE-NEXT: pand %xmm4, %xmm2 4286; SSE-NEXT: por %xmm5, %xmm2 4287; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[3,2,1,0,4,5,6,7] 4288; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] 4289; SSE-NEXT: packuswb %xmm0, %xmm0 4290; SSE-NEXT: pand %xmm13, %xmm0 4291; SSE-NEXT: pandn %xmm1, %xmm13 4292; SSE-NEXT: por %xmm13, %xmm0 4293; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 4294; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4295; SSE-NEXT: pandn %xmm8, %xmm1 4296; SSE-NEXT: movdqa (%rsp), %xmm5 # 16-byte Reload 4297; SSE-NEXT: por %xmm1, %xmm5 4298; SSE-NEXT: movdqa %xmm5, %xmm1 4299; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] 4300; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] 4301; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,0,65535] 4302; SSE-NEXT: pand %xmm2, %xmm5 4303; SSE-NEXT: pandn %xmm1, %xmm2 4304; SSE-NEXT: por %xmm5, %xmm2 4305; SSE-NEXT: movdqa %xmm2, %xmm5 4306; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 4307; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[2,1,2,3,4,5,6,7] 4308; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 4309; SSE-NEXT: packuswb %xmm1, %xmm1 4310; SSE-NEXT: movdqa %xmm10, %xmm2 4311; SSE-NEXT: pandn %xmm1, %xmm2 4312; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,2,1,0,4,5,6,7] 4313; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,6,7] 4314; SSE-NEXT: packuswb %xmm1, %xmm1 4315; SSE-NEXT: pand 
%xmm10, %xmm1 4316; SSE-NEXT: por %xmm1, %xmm2 4317; SSE-NEXT: movdqa %xmm15, %xmm1 4318; SSE-NEXT: pandn %xmm2, %xmm1 4319; SSE-NEXT: pand %xmm15, %xmm0 4320; SSE-NEXT: movdqa %xmm15, %xmm14 4321; SSE-NEXT: por %xmm0, %xmm1 4322; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill 4323; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,0,65535,65535,0,65535,65535] 4324; SSE-NEXT: movdqa %xmm15, %xmm0 4325; SSE-NEXT: pandn %xmm9, %xmm0 4326; SSE-NEXT: movdqa %xmm7, %xmm2 4327; SSE-NEXT: pand %xmm15, %xmm2 4328; SSE-NEXT: por %xmm0, %xmm2 4329; SSE-NEXT: movdqa %xmm2, %xmm0 4330; SSE-NEXT: pxor %xmm1, %xmm1 4331; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 4332; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] 4333; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 4334; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] 4335; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] 4336; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] 4337; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4338; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 4339; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 4340; SSE-NEXT: pandn %xmm0, %xmm6 4341; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 4342; SSE-NEXT: por %xmm6, %xmm5 4343; SSE-NEXT: packuswb %xmm0, %xmm5 4344; SSE-NEXT: packuswb %xmm2, %xmm2 4345; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,1,3,3] 4346; SSE-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] 4347; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,0,65535,65535,65535,0,65535,65535] 4348; SSE-NEXT: movdqa %xmm9, %xmm2 4349; SSE-NEXT: pandn %xmm12, %xmm2 4350; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 4351; SSE-NEXT: movdqa %xmm7, %xmm5 4352; SSE-NEXT: pand %xmm9, %xmm5 4353; SSE-NEXT: por %xmm2, %xmm5 4354; SSE-NEXT: movdqa %xmm5, %xmm2 4355; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 4356; SSE-NEXT: movdqa %xmm4, %xmm6 4357; SSE-NEXT: pandn %xmm2, %xmm6 4358; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] 4359; SSE-NEXT: pand %xmm4, %xmm5 4360; SSE-NEXT: por %xmm6, %xmm5 4361; SSE-NEXT: pshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 4362; SSE-NEXT: # xmm2 = mem[0,1,2,1] 4363; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] 4364; SSE-NEXT: packuswb %xmm2, %xmm2 4365; SSE-NEXT: movdqa %xmm10, %xmm6 4366; SSE-NEXT: pandn %xmm2, %xmm6 4367; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,3] 4368; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] 4369; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] 4370; SSE-NEXT: packuswb %xmm2, %xmm2 4371; SSE-NEXT: pand %xmm10, %xmm2 4372; SSE-NEXT: por %xmm2, %xmm6 4373; SSE-NEXT: movdqa %xmm14, %xmm1 4374; SSE-NEXT: pandn %xmm6, %xmm14 4375; SSE-NEXT: andps %xmm1, %xmm0 4376; SSE-NEXT: movdqa %xmm1, %xmm6 4377; SSE-NEXT: por %xmm0, %xmm14 4378; SSE-NEXT: movdqa %xmm15, %xmm1 4379; 
SSE-NEXT: movdqa %xmm15, %xmm0 4380; SSE-NEXT: movdqa %xmm11, %xmm15 4381; SSE-NEXT: pandn %xmm11, %xmm0 4382; SSE-NEXT: pand %xmm1, %xmm3 4383; SSE-NEXT: por %xmm0, %xmm3 4384; SSE-NEXT: movdqa %xmm3, %xmm0 4385; SSE-NEXT: pxor %xmm1, %xmm1 4386; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 4387; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] 4388; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] 4389; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,1,2,3] 4390; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] 4391; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] 4392; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4393; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 4394; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 4395; SSE-NEXT: pandn %xmm0, %xmm3 4396; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 4397; SSE-NEXT: por %xmm3, %xmm2 4398; SSE-NEXT: packuswb %xmm0, %xmm2 4399; SSE-NEXT: packuswb %xmm5, %xmm5 4400; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,3,3] 4401; SSE-NEXT: movss {{.*#+}} xmm0 = xmm5[0],xmm0[1,2,3] 4402; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 4403; SSE-NEXT: movdqa %xmm2, %xmm5 4404; SSE-NEXT: movdqa %xmm2, %xmm3 4405; SSE-NEXT: movdqa %xmm9, %xmm2 4406; SSE-NEXT: pand %xmm9, %xmm5 4407; SSE-NEXT: pandn %xmm8, %xmm2 4408; SSE-NEXT: movdqa %xmm8, %xmm9 4409; SSE-NEXT: por %xmm5, %xmm2 4410; SSE-NEXT: movdqa %xmm2, %xmm5 4411; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] 4412; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 4413; SSE-NEXT: pand %xmm4, %xmm2 4414; SSE-NEXT: pandn %xmm5, %xmm4 4415; SSE-NEXT: por %xmm2, %xmm4 4416; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] 4417; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7] 4418; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] 4419; SSE-NEXT: packuswb %xmm4, %xmm4 4420; SSE-NEXT: pand %xmm10, %xmm4 4421; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[0,1,2,1] 4422; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] 4423; SSE-NEXT: packuswb %xmm5, %xmm5 4424; SSE-NEXT: pandn %xmm5, %xmm10 4425; SSE-NEXT: por %xmm4, %xmm10 4426; SSE-NEXT: movdqa %xmm6, %xmm4 4427; SSE-NEXT: pandn %xmm10, %xmm4 4428; SSE-NEXT: andps %xmm6, %xmm0 4429; SSE-NEXT: por %xmm0, %xmm4 4430; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4431; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,0,65535] 4432; SSE-NEXT: pand %xmm2, %xmm0 4433; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4434; SSE-NEXT: movdqa %xmm0, %xmm5 4435; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] 4436; SSE-NEXT: movdqa {{.*#+}} xmm10 = [0,65535,65535,65535,65535,65535,65535,0] 4437; SSE-NEXT: movdqa %xmm10, %xmm6 
4438; SSE-NEXT: pandn %xmm5, %xmm6 4439; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 4440; SSE-NEXT: pand %xmm10, %xmm0 4441; SSE-NEXT: por %xmm6, %xmm0 4442; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] 4443; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,5,4,7,6] 4444; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 4445; SSE-NEXT: packuswb %xmm5, %xmm8 4446; SSE-NEXT: movdqa %xmm2, %xmm11 4447; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4448; SSE-NEXT: pandn %xmm0, %xmm11 4449; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] 4450; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 4451; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[0,2,2,3] 4452; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] 4453; SSE-NEXT: movdqa %xmm5, %xmm0 4454; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 4455; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] 4456; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 4457; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] 4458; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] 4459; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] 4460; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] 4461; SSE-NEXT: packuswb %xmm5, %xmm5 4462; SSE-NEXT: movss {{.*#+}} xmm8 = xmm5[0],xmm8[1,2,3] 4463; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,65535,0,65535,65535] 4464; SSE-NEXT: movdqa %xmm6, %xmm0 4465; SSE-NEXT: pandn %xmm12, %xmm0 4466; SSE-NEXT: movdqa %xmm7, %xmm5 4467; SSE-NEXT: pand %xmm6, %xmm5 4468; SSE-NEXT: por %xmm0, %xmm5 4469; SSE-NEXT: movdqa %xmm5, %xmm0 4470; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 4471; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,0,65535,65535,65535] 4472; SSE-NEXT: movdqa %xmm6, %xmm7 4473; SSE-NEXT: pandn %xmm0, %xmm7 4474; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] 4475; SSE-NEXT: pand %xmm6, %xmm5 4476; SSE-NEXT: por %xmm7, %xmm5 4477; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4478; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4479; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] 4480; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,7,6] 4481; SSE-NEXT: packuswb %xmm0, %xmm0 4482; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] 4483; SSE-NEXT: movdqa %xmm7, %xmm12 4484; SSE-NEXT: pandn %xmm0, %xmm12 4485; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,1,0,3] 4486; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7] 4487; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] 4488; SSE-NEXT: packuswb %xmm0, %xmm0 4489; SSE-NEXT: pand %xmm7, %xmm0 4490; SSE-NEXT: por %xmm0, %xmm12 4491; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] 4492; SSE-NEXT: movdqa %xmm0, %xmm5 4493; SSE-NEXT: pandn %xmm12, %xmm5 4494; 
SSE-NEXT: andps %xmm0, %xmm8 4495; SSE-NEXT: por %xmm8, %xmm5 4496; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4497; SSE-NEXT: pand %xmm2, %xmm0 4498; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4499; SSE-NEXT: movdqa %xmm0, %xmm12 4500; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm1[8],xmm12[9],xmm1[9],xmm12[10],xmm1[10],xmm12[11],xmm1[11],xmm12[12],xmm1[12],xmm12[13],xmm1[13],xmm12[14],xmm1[14],xmm12[15],xmm1[15] 4501; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 4502; SSE-NEXT: pand %xmm10, %xmm0 4503; SSE-NEXT: pandn %xmm12, %xmm10 4504; SSE-NEXT: por %xmm0, %xmm10 4505; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,1,0,3] 4506; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,5,4,7,6] 4507; SSE-NEXT: psrldq {{.*#+}} xmm12 = xmm12[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 4508; SSE-NEXT: packuswb %xmm12, %xmm8 4509; SSE-NEXT: movdqa %xmm13, %xmm12 4510; SSE-NEXT: pand %xmm2, %xmm12 4511; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 4512; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,2,2,3] 4513; SSE-NEXT: pand %xmm2, %xmm10 4514; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4515; SSE-NEXT: pandn %xmm15, %xmm2 4516; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm15[1,3,2,3] 4517; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] 4518; SSE-NEXT: movdqa %xmm0, %xmm10 4519; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm1[8],xmm10[9],xmm1[9],xmm10[10],xmm1[10],xmm10[11],xmm1[11],xmm10[12],xmm1[12],xmm10[13],xmm1[13],xmm10[14],xmm1[14],xmm10[15],xmm1[15] 4520; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,1,2,3] 4521; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,2,2,3,4,5,6,7] 4522; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 4523; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] 4524; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] 4525; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] 4526; SSE-NEXT: packuswb %xmm0, %xmm0 4527; SSE-NEXT: movss {{.*#+}} xmm8 = xmm0[0],xmm8[1,2,3] 4528; SSE-NEXT: movdqa %xmm3, %xmm13 4529; SSE-NEXT: movdqa %xmm3, %xmm0 4530; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,0,65535,65535] 4531; SSE-NEXT: pand %xmm3, %xmm0 4532; SSE-NEXT: pandn %xmm9, %xmm3 4533; SSE-NEXT: movdqa %xmm9, %xmm15 4534; SSE-NEXT: por %xmm0, %xmm3 4535; SSE-NEXT: movdqa %xmm3, %xmm0 4536; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 4537; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] 4538; SSE-NEXT: pand %xmm6, %xmm3 4539; SSE-NEXT: pandn %xmm0, %xmm6 4540; SSE-NEXT: por %xmm3, %xmm6 4541; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4542; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4543; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] 4544; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,7,6] 4545; SSE-NEXT: packuswb %xmm0, %xmm0 4546; SSE-NEXT: movdqa %xmm7, %xmm9 4547; SSE-NEXT: pandn %xmm0, %xmm9 4548; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,1,0,3] 4549; SSE-NEXT: 
pshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7] 4550; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] 4551; SSE-NEXT: packuswb %xmm0, %xmm0 4552; SSE-NEXT: pand %xmm7, %xmm0 4553; SSE-NEXT: por %xmm0, %xmm9 4554; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] 4555; SSE-NEXT: movdqa %xmm3, %xmm6 4556; SSE-NEXT: pandn %xmm9, %xmm6 4557; SSE-NEXT: andps %xmm3, %xmm8 4558; SSE-NEXT: por %xmm8, %xmm6 4559; SSE-NEXT: movdqa %xmm12, %xmm1 4560; SSE-NEXT: por %xmm11, %xmm1 4561; SSE-NEXT: movdqa %xmm1, %xmm0 4562; SSE-NEXT: pxor %xmm9, %xmm9 4563; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] 4564; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] 4565; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] 4566; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,1,2,1] 4567; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7] 4568; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] 4569; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 4570; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] 4571; SSE-NEXT: pxor %xmm1, %xmm1 4572; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 4573; SSE-NEXT: pandn %xmm8, %xmm10 4574; SSE-NEXT: movdqa %xmm8, %xmm9 4575; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 4576; SSE-NEXT: por %xmm10, %xmm8 4577; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,1,3] 4578; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,7] 4579; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0] 4580; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm8[0,1,2,3,4,7,6,5] 4581; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[3,3,3,3] 4582; SSE-NEXT: packuswb %xmm8, %xmm10 4583; SSE-NEXT: packuswb %xmm0, %xmm0 4584; SSE-NEXT: movss {{.*#+}} xmm10 = xmm0[0],xmm10[1,2,3] 4585; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4586; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4587; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] 4588; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] 4589; SSE-NEXT: packuswb %xmm0, %xmm0 4590; SSE-NEXT: movdqa %xmm7, %xmm8 4591; SSE-NEXT: pandn %xmm0, %xmm8 4592; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 4593; SSE-NEXT: # xmm0 = mem[1,3,2,3] 4594; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload 4595; SSE-NEXT: # xmm11 = mem[0,2,2,3] 4596; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] 4597; SSE-NEXT: movdqa %xmm11, %xmm0 4598; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 4599; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,65535,0,65535,65535] 4600; SSE-NEXT: movdqa %xmm9, %xmm12 4601; SSE-NEXT: pandn %xmm0, %xmm12 4602; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm1[8],xmm11[9],xmm1[9],xmm11[10],xmm1[10],xmm11[11],xmm1[11],xmm11[12],xmm1[12],xmm11[13],xmm1[13],xmm11[14],xmm1[14],xmm11[15],xmm1[15] 4603; SSE-NEXT: pand %xmm9, %xmm11 4604; SSE-NEXT: por %xmm12, %xmm11 4605; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,1,1,1] 
4606; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] 4607; SSE-NEXT: packuswb %xmm0, %xmm0 4608; SSE-NEXT: pand %xmm7, %xmm0 4609; SSE-NEXT: por %xmm8, %xmm0 4610; SSE-NEXT: movaps %xmm3, %xmm1 4611; SSE-NEXT: movdqa %xmm3, %xmm8 4612; SSE-NEXT: pandn %xmm0, %xmm8 4613; SSE-NEXT: andps %xmm3, %xmm10 4614; SSE-NEXT: por %xmm10, %xmm8 4615; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 4616; SSE-NEXT: movdqa %xmm2, %xmm0 4617; SSE-NEXT: pxor %xmm11, %xmm11 4618; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] 4619; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] 4620; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm11[8],xmm2[9],xmm11[9],xmm2[10],xmm11[10],xmm2[11],xmm11[11],xmm2[12],xmm11[12],xmm2[13],xmm11[13],xmm2[14],xmm11[14],xmm2[15],xmm11[15] 4621; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[0,1,2,1] 4622; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,7,6,7] 4623; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] 4624; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 4625; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm11[8],xmm2[9],xmm11[9],xmm2[10],xmm11[10],xmm2[11],xmm11[11],xmm2[12],xmm11[12],xmm2[13],xmm11[13],xmm2[14],xmm11[14],xmm2[15],xmm11[15] 4626; SSE-NEXT: pxor %xmm12, %xmm12 4627; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 4628; SSE-NEXT: pandn %xmm2, %xmm10 4629; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 4630; SSE-NEXT: por %xmm10, %xmm3 4631; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,2,1,3] 4632; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,7,6,7] 4633; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,2,0] 4634; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,7,6,5] 4635; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[3,3,3,3] 4636; SSE-NEXT: packuswb %xmm11, %xmm10 4637; SSE-NEXT: packuswb %xmm0, %xmm0 4638; SSE-NEXT: movss {{.*#+}} xmm10 = xmm0[0],xmm10[1,2,3] 4639; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 4640; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 4641; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,3,2,3] 4642; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm13[0,2,2,3] 4643; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] 4644; SSE-NEXT: movdqa %xmm11, %xmm0 4645; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] 4646; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] 4647; SSE-NEXT: pand %xmm9, %xmm11 4648; SSE-NEXT: pandn %xmm0, %xmm9 4649; SSE-NEXT: por %xmm11, %xmm9 4650; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,1,1,1] 4651; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] 4652; SSE-NEXT: packuswb %xmm0, %xmm0 4653; SSE-NEXT: pand %xmm7, %xmm0 4654; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[0,1,0,3] 4655; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7] 4656; SSE-NEXT: packuswb %xmm9, %xmm9 4657; SSE-NEXT: pandn %xmm9, %xmm7 4658; SSE-NEXT: por %xmm7, %xmm0 4659; SSE-NEXT: andps %xmm1, %xmm10 4660; SSE-NEXT: andnps %xmm0, %xmm1 4661; SSE-NEXT: orps %xmm10, %xmm1 4662; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4663; SSE-NEXT: movaps 
%xmm0, (%rsi) 4664; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4665; SSE-NEXT: movaps %xmm0, 16(%rsi) 4666; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4667; SSE-NEXT: movaps %xmm0, (%rdx) 4668; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4669; SSE-NEXT: movaps %xmm0, 16(%rdx) 4670; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4671; SSE-NEXT: movaps %xmm0, (%rcx) 4672; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4673; SSE-NEXT: movaps %xmm0, 16(%rcx) 4674; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload 4675; SSE-NEXT: movaps %xmm0, (%r8) 4676; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4677; SSE-NEXT: movaps %xmm0, 16(%r8) 4678; SSE-NEXT: movdqa %xmm4, (%r9) 4679; SSE-NEXT: movdqa %xmm14, 16(%r9) 4680; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 4681; SSE-NEXT: movdqa %xmm6, (%rax) 4682; SSE-NEXT: movdqa %xmm5, 16(%rax) 4683; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 4684; SSE-NEXT: movaps %xmm1, (%rax) 4685; SSE-NEXT: movdqa %xmm8, 16(%rax) 4686; SSE-NEXT: addq $648, %rsp # imm = 0x288 4687; SSE-NEXT: retq 4688; 4689; AVX-LABEL: load_i8_stride7_vf32: 4690; AVX: # %bb.0: 4691; AVX-NEXT: subq $200, %rsp 4692; AVX-NEXT: vmovdqa 176(%rdi), %xmm7 4693; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm7[u,u,u,u,u,u,u],zero,zero,zero,xmm7[6,13,u,u,u,u] 4694; AVX-NEXT: vmovdqa 160(%rdi), %xmm6 4695; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm6[u,u,u,u] 4696; AVX-NEXT: vpor %xmm0, %xmm1, %xmm1 4697; AVX-NEXT: vmovdqa 144(%rdi), %xmm8 4698; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[u,u,u],zero,zero,xmm8[3,10,u,u,u,u,u,u,u,u,u] 4699; AVX-NEXT: vmovdqa 128(%rdi), %xmm9 4700; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[u,u,u,5,12],zero,zero,xmm9[u,u,u,u,u,u,u,u,u] 4701; AVX-NEXT: vpor %xmm0, %xmm2, %xmm2 4702; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,0,0,0,0,255,255,255,255,255,u,u,u,u] 4703; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 4704; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4705; AVX-NEXT: vmovdqa (%rdi), %xmm10 4706; AVX-NEXT: vmovdqa 16(%rdi), %xmm11 4707; AVX-NEXT: vmovdqa 32(%rdi), %xmm15 4708; AVX-NEXT: vmovdqa 48(%rdi), %xmm4 4709; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u,u,u,u,u] 4710; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u,u] 4711; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 4712; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm15[u,u,u,u,u,3,10],zero,zero,zero,xmm15[u,u,u,u,u,u] 4713; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,u,u],zero,zero,xmm4[1,8,15,u,u,u,u,u,u] 4714; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 4715; AVX-NEXT: vmovq {{.*#+}} xmm3 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] 4716; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1 4717; AVX-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4718; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm6[u,u,u,u] 4719; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[0,7,14,u,u,u,u] 4720; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 4721; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u],zero,zero,xmm8[4,11,u,u,u,u,u,u,u,u,u] 4722; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm9[u,u,u,6,13],zero,zero,xmm9[u,u,u,u,u,u,u,u,u] 4723; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 4724; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 4725; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4726; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u,u,u,u,u] 4727; 
AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[1,8,15],zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u,u] 4728; AVX-NEXT: vpor %xmm0, %xmm1, %xmm1 4729; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm4[u,u,u,u,u],zero,zero,xmm4[2,9,u,u,u,u,u,u,u] 4730; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm15[u,u,u,u,u,4,11],zero,zero,xmm15[u,u,u,u,u,u,u] 4731; AVX-NEXT: vpor %xmm0, %xmm2, %xmm2 4732; AVX-NEXT: vmovq {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] 4733; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 4734; AVX-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill 4735; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u],zero,zero,zero,xmm8[5,12,u,u,u,u,u,u,u,u,u] 4736; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[u,u,0,7,14],zero,zero,xmm9[u,u,u,u,u,u,u,u,u] 4737; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 4738; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm6[u,u,u,u] 4739; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[1,8,15,u,u,u,u] 4740; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 4741; AVX-NEXT: vmovq {{.*#+}} xmm3 = [0,0,255,255,255,255,255,0,0,0,0,0,0,0,0,0] 4742; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1 4743; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4744; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u,u] 4745; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4746; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm11[0,7,14,u,u,u,u,u,u,u,u,u,u,u] 4747; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 4748; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4749; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,u,u,u],zero,zero,xmm4[3,10,u,u,u,u,u,u,u] 4750; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[u,u,u,u,u,5,12],zero,zero,xmm15[u,u,u,u,u,u,u] 4751; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 4752; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 4753; AVX-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4754; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[3,10],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u,u] 4755; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm11[1,8,15,u,u,u,u,u,u,u,u,u,u,u] 4756; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 4757; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,u,u,u],zero,zero,xmm4[4,11,u,u,u,u,u,u,u] 4758; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[u,u,u,u,u,6,13],zero,zero,xmm15[u,u,u,u,u,u,u] 4759; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 4760; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 4761; AVX-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4762; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[u,u],zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u,u,u] 4763; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u,u,u] 4764; AVX-NEXT: vpor %xmm0, %xmm1, %xmm1 4765; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[2,9,u,u,u,u,u] 4766; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,4,11],zero,zero,xmm6[u,u,u,u,u] 4767; AVX-NEXT: vpor %xmm0, %xmm2, %xmm2 4768; AVX-NEXT: vmovq {{.*#+}} xmm0 = [0,0,255,255,255,255,255,0,0,0,0,0,0,0,0,0] 4769; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm2 4770; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u] 4771; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm8[u,u],zero,zero,xmm8[0,7,14,u,u,u,u,u,u,u,u,u] 4772; AVX-NEXT: vpor %xmm1, %xmm3, %xmm1 4773; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[3,10,u,u,u,u,u] 4774; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,5,12],zero,zero,xmm6[u,u,u,u,u] 4775; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 4776; AVX-NEXT: vpblendvb %xmm0, 
%xmm1, %xmm3, %xmm3 4777; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,3,10],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u] 4778; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u,u],zero,zero,xmm8[1,8,15,u,u,u,u,u,u,u,u,u] 4779; AVX-NEXT: vpor %xmm1, %xmm4, %xmm1 4780; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[4,11,u,u,u,u,u] 4781; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm6[u,u,u,u,u,u,u,6,13],zero,zero,xmm6[u,u,u,u,u] 4782; AVX-NEXT: vpor %xmm4, %xmm12, %xmm4 4783; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm4, %xmm4 4784; AVX-NEXT: vmovdqa 192(%rdi), %xmm5 4785; AVX-NEXT: vmovdqa 208(%rdi), %xmm1 4786; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[5,12] 4787; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero 4788; AVX-NEXT: vpor %xmm12, %xmm13, %xmm13 4789; AVX-NEXT: vpmovsxdq {{.*#+}} xmm12 = [18446744073709486080,16777215] 4790; AVX-NEXT: vpblendvb %xmm12, %xmm2, %xmm13, %xmm0 4791; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4792; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[6,13] 4793; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero 4794; AVX-NEXT: vpor %xmm2, %xmm14, %xmm2 4795; AVX-NEXT: vpblendvb %xmm12, %xmm3, %xmm2, %xmm0 4796; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4797; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero 4798; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm1[0,7,14] 4799; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 4800; AVX-NEXT: vpblendvb %xmm12, %xmm4, %xmm2, %xmm0 4801; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4802; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 4803; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm9[u,u,4,11,u,u,u,u,u,u,u,u,u,u,u,u] 4804; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 4805; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u],zero,zero,zero,xmm7[5,12,u,u,u,u,u] 4806; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,0,7,14],zero,zero,xmm6[u,u,u,u,u] 4807; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 4808; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3,4,5,6,7] 4809; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero 4810; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm1[1,8,15] 4811; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 4812; AVX-NEXT: vpblendvb %xmm12, %xmm2, %xmm3, %xmm0 4813; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4814; AVX-NEXT: vbroadcastss {{.*#+}} xmm10 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11] 4815; AVX-NEXT: vpshufb %xmm10, %xmm5, %xmm2 4816; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] 4817; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] 4818; AVX-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload 4819; AVX-NEXT: # xmm3 = mem[0,1,2,3,4,5],xmm2[6,7] 4820; AVX-NEXT: vbroadcastss {{.*#+}} xmm7 = [0,0,6,13,0,0,6,13,0,0,6,13,0,0,6,13] 4821; AVX-NEXT: vmovdqa 64(%rdi), %xmm6 4822; AVX-NEXT: vpshufb %xmm7, %xmm6, %xmm2 4823; AVX-NEXT: vmovdqa 80(%rdi), %xmm9 4824; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[u,u,u,u,4,11,u,u,u,u,u,u,u,u,u,u] 4825; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] 4826; AVX-NEXT: vmovdqa 96(%rdi), %xmm8 4827; AVX-NEXT: vpalignr {{.*#+}} xmm2 = 
xmm2[10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7,8,9] 4828; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u,u,0,1,2,3,8,15] 4829; AVX-NEXT: vmovdqa 112(%rdi), %xmm2 4830; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[0,7,14,u,u,u,u,u,u,u,u,u,u,u,u,u] 4831; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm12 4832; AVX-NEXT: vmovaps {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] 4833; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload 4834; AVX-NEXT: vandnps %ymm12, %ymm13, %ymm12 4835; AVX-NEXT: vorps %ymm0, %ymm12, %ymm0 4836; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 4837; AVX-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] 4838; AVX-NEXT: vandnps %ymm3, %ymm12, %ymm3 4839; AVX-NEXT: vandps %ymm0, %ymm12, %ymm0 4840; AVX-NEXT: vorps %ymm3, %ymm0, %ymm0 4841; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4842; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] 4843; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] 4844; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] 4845; AVX-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload 4846; AVX-NEXT: # xmm4 = mem[0,1,2,3,4,5],xmm0[6,7] 4847; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm9[u,u,u,u,5,12,u,u,u,u,u,u,u,u,u,u] 4848; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[u,u,u,u,u,0,7,14,u,u,u,u,u,u,u,u] 4849; AVX-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm3[0],xmm13[1],xmm3[1] 4850; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 4851; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,6],xmm3[7] 4852; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm8[3,10] 4853; AVX-NEXT: vpor %xmm14, %xmm13, %xmm13 4854; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[1,8,15,u,u,u,u,u,u,u,u,u,u,u,u,u] 4855; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13 4856; AVX-NEXT: vmovaps {{.*#+}} ymm14 = [0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 4857; AVX-NEXT: vandnps (%rsp), %ymm14, %ymm0 # 32-byte Folded Reload 4858; AVX-NEXT: vandps %ymm14, %ymm13, %ymm13 4859; AVX-NEXT: vorps %ymm0, %ymm13, %ymm0 4860; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 4861; AVX-NEXT: vandnps %ymm4, %ymm12, %ymm4 4862; AVX-NEXT: vandps %ymm0, %ymm12, %ymm0 4863; AVX-NEXT: vorps %ymm4, %ymm0, %ymm0 4864; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4865; AVX-NEXT: vpshufb %xmm10, %xmm1, %xmm0 4866; AVX-NEXT: vpshufb %xmm7, %xmm5, %xmm1 4867; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 4868; AVX-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 4869; AVX-NEXT: # xmm0 = mem[0,1,2,3,4,5],xmm0[6,7] 4870; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,u,u,u,u,u,u,u,u,u,u,6,13,u,u] 4871; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15] 4872; AVX-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] 4873; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm3[7] 4874; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm8[4,11] 4875; AVX-NEXT: vpor %xmm4, %xmm1, %xmm1 4876; AVX-NEXT: vmovd {{.*#+}} xmm12 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] 4877; AVX-NEXT: vpshufb %xmm12, %xmm2, %xmm4 4878; AVX-NEXT: vinsertf128 $1, %xmm4, 
%ymm1, %ymm4 4879; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 4880; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload 4881; AVX-NEXT: vandnps %ymm4, %ymm1, %ymm4 4882; AVX-NEXT: vorps %ymm4, %ymm5, %ymm4 4883; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm5 4884; AVX-NEXT: vmovaps {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] 4885; AVX-NEXT: vandnps %ymm5, %ymm13, %ymm5 4886; AVX-NEXT: vandps %ymm4, %ymm13, %ymm4 4887; AVX-NEXT: vorps %ymm5, %ymm4, %ymm0 4888; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4889; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm6[u,u] 4890; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u],zero,zero,xmm9[0,7,14,u,u] 4891; AVX-NEXT: vpor %xmm4, %xmm7, %xmm4 4892; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm3[7] 4893; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm8[5,12] 4894; AVX-NEXT: vpor %xmm7, %xmm4, %xmm7 4895; AVX-NEXT: vmovd {{.*#+}} xmm4 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] 4896; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm10 4897; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 4898; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload 4899; AVX-NEXT: vandnps %ymm7, %ymm1, %ymm7 4900; AVX-NEXT: vorps %ymm7, %ymm10, %ymm7 4901; AVX-NEXT: vandps %ymm7, %ymm13, %ymm7 4902; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 16-byte Folded Reload 4903; AVX-NEXT: vandnps %ymm10, %ymm13, %ymm10 4904; AVX-NEXT: vorps %ymm7, %ymm10, %ymm0 4905; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill 4906; AVX-NEXT: vpshufb %xmm12, %xmm11, %xmm7 4907; AVX-NEXT: vmovd {{.*#+}} xmm14 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] 4908; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 4909; AVX-NEXT: vpshufb %xmm14, %xmm5, %xmm10 4910; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] 4911; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 4912; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u],zero,zero,zero,xmm0[5,12,u,u,u,u,u,u,u] 4913; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm15[u,u,u,u,0,7,14],zero,zero,xmm15[u,u,u,u,u,u,u] 4914; AVX-NEXT: vpor %xmm10, %xmm12, %xmm10 4915; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3,4,5,6,7] 4916; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm6[u,u] 4917; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm9[u,u,u,u,u,u,u,u,u],zero,zero,xmm9[1,8,15,u,u] 4918; AVX-NEXT: vpor %xmm10, %xmm12, %xmm10 4919; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0,1,2,3,4,5,6],xmm3[7] 4920; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm8[6,13] 4921; AVX-NEXT: vpor %xmm3, %xmm10, %xmm3 4922; AVX-NEXT: vpshufb %xmm14, %xmm2, %xmm10 4923; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm3, %ymm3 4924; AVX-NEXT: vandps %ymm1, %ymm7, %ymm7 4925; AVX-NEXT: vandnps %ymm3, %ymm1, %ymm3 4926; AVX-NEXT: vorps %ymm3, %ymm7, %ymm3 4927; AVX-NEXT: vandps %ymm3, %ymm13, %ymm3 4928; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 16-byte Folded Reload 4929; AVX-NEXT: vandnps %ymm7, %ymm13, %ymm7 4930; AVX-NEXT: vorps %ymm7, %ymm3, %ymm3 4931; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 4932; AVX-NEXT: vpshufb %xmm4, %xmm11, %xmm4 4933; AVX-NEXT: vmovd {{.*#+}} xmm7 = 
[5,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] 4934; AVX-NEXT: vpshufb %xmm7, %xmm5, %xmm10 4935; AVX-NEXT: vmovdqa %xmm5, %xmm3 4936; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3] 4937; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u],zero,zero,zero,xmm0[6,13,u,u,u,u,u,u,u] 4938; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm15[u,u,u,u,1,8,15],zero,zero,xmm15[u,u,u,u,u,u,u] 4939; AVX-NEXT: vpor %xmm10, %xmm12, %xmm10 4940; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm10[2,3,4,5,6,7] 4941; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u],zero,zero,xmm9[2,9,u,u,u] 4942; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm6[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm6[u,u,u] 4943; AVX-NEXT: vpor %xmm10, %xmm12, %xmm10 4944; AVX-NEXT: vmovddup {{.*#+}} xmm12 = [0,9,10,11,12,128,128,128,0,9,10,11,12,128,128,128] 4945; AVX-NEXT: # xmm12 = mem[0,0] 4946; AVX-NEXT: vpshufb %xmm12, %xmm10, %xmm10 4947; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm8[0,7,14] 4948; AVX-NEXT: vpor %xmm5, %xmm10, %xmm5 4949; AVX-NEXT: vpshufb %xmm7, %xmm2, %xmm7 4950; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 4951; AVX-NEXT: vandps %ymm1, %ymm4, %ymm4 4952; AVX-NEXT: vandnps %ymm5, %ymm1, %ymm5 4953; AVX-NEXT: vorps %ymm5, %ymm4, %ymm4 4954; AVX-NEXT: vandps %ymm4, %ymm13, %ymm4 4955; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload 4956; AVX-NEXT: vandnps %ymm5, %ymm13, %ymm5 4957; AVX-NEXT: vorps %ymm5, %ymm4, %ymm4 4958; AVX-NEXT: vpshufb %xmm14, %xmm11, %xmm5 4959; AVX-NEXT: vmovd {{.*#+}} xmm7 = [6,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0] 4960; AVX-NEXT: vpshufb %xmm7, %xmm3, %xmm10 4961; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] 4962; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm15[u,u,u,u,2,9],zero,zero,zero,xmm15[u,u,u,u,u,u,u] 4963; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u],zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u] 4964; AVX-NEXT: vpor %xmm10, %xmm11, %xmm10 4965; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm10[2,3,4,5,6,7] 4966; AVX-NEXT: vpshufb %xmm7, %xmm2, %xmm2 4967; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u],zero,zero,xmm9[3,10,u,u,u] 4968; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm6[u,u,u] 4969; AVX-NEXT: vpor %xmm7, %xmm6, %xmm6 4970; AVX-NEXT: vpshufb %xmm12, %xmm6, %xmm6 4971; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm8[1,8,15] 4972; AVX-NEXT: vpor %xmm7, %xmm6, %xmm6 4973; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm2 4974; AVX-NEXT: vandps %ymm1, %ymm5, %ymm5 4975; AVX-NEXT: vandnps %ymm2, %ymm1, %ymm1 4976; AVX-NEXT: vorps %ymm1, %ymm5, %ymm1 4977; AVX-NEXT: vandps %ymm1, %ymm13, %ymm1 4978; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 16-byte Folded Reload 4979; AVX-NEXT: vandnps %ymm2, %ymm13, %ymm0 4980; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0 4981; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 4982; AVX-NEXT: vmovaps %ymm1, (%rsi) 4983; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 4984; AVX-NEXT: vmovaps %ymm1, (%rdx) 4985; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 4986; AVX-NEXT: vmovaps %ymm1, (%rcx) 4987; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload 4988; AVX-NEXT: vmovaps %ymm1, (%r8) 4989; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 4990; AVX-NEXT: vmovaps %ymm1, (%r9) 4991; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 4992; AVX-NEXT: 
vmovaps %ymm4, (%rax) 4993; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 4994; AVX-NEXT: vmovaps %ymm0, (%rax) 4995; AVX-NEXT: addq $200, %rsp 4996; AVX-NEXT: vzeroupper 4997; AVX-NEXT: retq 4998; 4999; AVX2-LABEL: load_i8_stride7_vf32: 5000; AVX2: # %bb.0: 5001; AVX2-NEXT: subq $72, %rsp 5002; AVX2-NEXT: vmovdqa 160(%rdi), %ymm10 5003; AVX2-NEXT: vmovdqa 128(%rdi), %ymm11 5004; AVX2-NEXT: vmovdqa (%rdi), %ymm6 5005; AVX2-NEXT: vmovdqa 32(%rdi), %ymm7 5006; AVX2-NEXT: vmovdqa 64(%rdi), %ymm13 5007; AVX2-NEXT: vmovdqa 96(%rdi), %ymm3 5008; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] 5009; AVX2-NEXT: vpblendvb %ymm15, %ymm6, %ymm7, %ymm0 5010; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 5011; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u] 5012; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u] 5013; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 5014; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] 5015; AVX2-NEXT: vpblendvb %ymm14, %ymm3, %ymm13, %ymm1 5016; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 5017; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] 5018; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] 5019; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] 5020; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 5021; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] 5022; AVX2-NEXT: vpblendvb %ymm1, %ymm10, %ymm11, %ymm1 5023; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 5024; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u] 5025; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] 5026; AVX2-NEXT: vpor %xmm4, %xmm1, %xmm1 5027; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] 5028; AVX2-NEXT: vpblendvb %ymm2, %ymm6, %ymm7, %ymm4 5029; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 5030; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u,u,u] 5031; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,8,15],zero,zero,xmm4[4,11],zero,zero,xmm4[u,u,u,u,u,u,u] 5032; AVX2-NEXT: vpor %xmm5, %xmm4, %xmm4 5033; AVX2-NEXT: vpblendvb %ymm15, %ymm13, %ymm3, %ymm5 5034; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm8 5035; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm8[2],ymm5[3,4,5],ymm8[6],ymm5[7,8,9],ymm8[10],ymm5[11,12,13],ymm8[14],ymm5[15] 5036; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] 5037; AVX2-NEXT: vpmovsxdq {{.*#+}} ymm8 = [0,18446744073709551360,16777215,0] 5038; AVX2-NEXT: vpblendvb %ymm8, %ymm5, %ymm4, %ymm8 5039; AVX2-NEXT: vmovdqa 192(%rdi), %xmm4 5040; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] 5041; AVX2-NEXT: vmovdqa 208(%rdi), %xmm5 5042; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] 5043; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3] 5044; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 5045; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 5046; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm9[7] 5047; AVX2-NEXT: vpmovsxdq {{.*#+}} ymm9 = 
[18446744073709551615,18446744073709551615,16777215,0] 5048; AVX2-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0 5049; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5050; AVX2-NEXT: vpblendvb %ymm14, %ymm10, %ymm11, %ymm0 5051; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u] 5052; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 5053; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u] 5054; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 5055; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] 5056; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] 5057; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] 5058; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 5059; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 5060; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] 5061; AVX2-NEXT: vpblendvb %ymm9, %ymm8, %ymm0, %ymm0 5062; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill 5063; AVX2-NEXT: vpblendvb %ymm2, %ymm11, %ymm10, %ymm0 5064; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 5065; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[2,9,u,u,u,u,u] 5066; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u] 5067; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 5068; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,12] 5069; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero 5070; AVX2-NEXT: vpor %xmm1, %xmm8, %xmm1 5071; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8 5072; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 5073; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] 5074; AVX2-NEXT: # ymm0 = mem[0,1,0,1] 5075; AVX2-NEXT: vpblendvb %ymm0, %ymm8, %ymm1, %ymm1 5076; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5077; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] 5078; AVX2-NEXT: vpblendvb %ymm9, %ymm11, %ymm10, %ymm1 5079; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u,u,u] 5080; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 5081; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u,u,u] 5082; AVX2-NEXT: vpor %xmm1, %xmm8, %xmm1 5083; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[6,13] 5084; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero 5085; AVX2-NEXT: vpor %xmm8, %xmm12, %xmm8 5086; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 5087; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 5088; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm8, %ymm1 5089; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5090; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535] 5091; AVX2-NEXT: vpblendvb %ymm8, %ymm11, %ymm10, %ymm1 5092; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,u,3,10],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[u,u,u,u,u] 5093; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 5094; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11,u,u,u,u,u] 5095; AVX2-NEXT: vpor %xmm1, %xmm12, %xmm1 5096; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = 
xmm4[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero 5097; AVX2-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,7,14] 5098; AVX2-NEXT: vpor %xmm12, %xmm14, %xmm12 5099; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 5100; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 5101; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm12, %ymm1 5102; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5103; AVX2-NEXT: vpblendvb %ymm15, %ymm11, %ymm10, %ymm14 5104; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] 5105; AVX2-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 5106; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm11 5107; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u] 5108; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u] 5109; AVX2-NEXT: vpor %xmm11, %xmm10, %xmm10 5110; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero 5111; AVX2-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,8,15] 5112; AVX2-NEXT: vpor %xmm11, %xmm15, %xmm11 5113; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 5114; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 5115; AVX2-NEXT: vpblendvb %ymm0, %ymm10, %ymm11, %ymm0 5116; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5117; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] 5118; AVX2-NEXT: vpblendvb %ymm0, %ymm7, %ymm6, %ymm10 5119; AVX2-NEXT: vpblendvb %ymm9, %ymm6, %ymm7, %ymm11 5120; AVX2-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm15 5121; AVX2-NEXT: vpblendvb %ymm12, %ymm7, %ymm6, %ymm1 5122; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] 5123; AVX2-NEXT: vpblendvb %ymm2, %ymm7, %ymm6, %ymm6 5124; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] 5125; AVX2-NEXT: vpblendvb %ymm0, %ymm13, %ymm3, %ymm0 5126; AVX2-NEXT: vpblendvb %ymm9, %ymm13, %ymm3, %ymm9 5127; AVX2-NEXT: vpblendvb %ymm8, %ymm13, %ymm3, %ymm8 5128; AVX2-NEXT: vpblendvb %ymm12, %ymm3, %ymm13, %ymm7 5129; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm13, %ymm2 5130; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[2,9],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[u,u,u,u,u,u,u] 5131; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm11 5132; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[0,7,14],zero,zero,xmm11[3,10,u,u,u,u,u,u,u] 5133; AVX2-NEXT: vpor %xmm3, %xmm11, %xmm3 5134; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm11 5135; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3],ymm0[4,5],ymm11[6],ymm0[7,8,9,10],ymm11[11],ymm0[12,13],ymm11[14],ymm0[15] 5136; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 5137; AVX2-NEXT: vpmovsxwq {{.*#+}} xmm11 = [18446744073709551615,255] 5138; AVX2-NEXT: vpblendvb %ymm11, %ymm3, %ymm0, %ymm0 5139; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[3,10],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[u,u,u,u,u,u,u] 5140; AVX2-NEXT: vextracti128 $1, %ymm15, %xmm13 5141; AVX2-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u,u,u] 5142; AVX2-NEXT: vpor %xmm3, %xmm13, %xmm3 5143; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm13 5144; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm13[0],ymm9[1,2],ymm13[3],ymm9[4,5,6],ymm13[7,8],ymm9[9,10],ymm13[11],ymm9[12,13,14],ymm13[15] 5145; AVX2-NEXT: vpshufb {{.*#+}} ymm9 = 
ymm9[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 5146; AVX2-NEXT: vpblendvb %ymm11, %ymm3, %ymm9, %ymm3 5147; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm9 5148; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u,u,u,u,u,u] 5149; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11],zero,zero,xmm1[0,7,14],zero,zero,xmm1[u,u,u,u,u,u,u] 5150; AVX2-NEXT: vpor %xmm1, %xmm9, %xmm1 5151; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm9 5152; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3],ymm9[4],ymm8[5,6],ymm9[7,8],ymm8[9,10,11],ymm9[12],ymm8[13,14],ymm9[15] 5153; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 5154; AVX2-NEXT: vpblendvb %ymm11, %ymm1, %ymm8, %ymm9 5155; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm1 5156; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[3,10],zero,zero,zero,xmm1[6,13,u,u,u,u,u,u,u] 5157; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[5,12],zero,zero,xmm6[1,8,15],zero,zero,xmm6[u,u,u,u,u,u,u] 5158; AVX2-NEXT: vpor %xmm1, %xmm6, %xmm1 5159; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm6 5160; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13,14,15] 5161; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 5162; AVX2-NEXT: vpblendvb %ymm11, %ymm1, %ymm6, %ymm1 5163; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[6,13],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u] 5164; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm7 5165; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[4,11],zero,zero,xmm7[0,7,14,u,u,u,u,u,u,u] 5166; AVX2-NEXT: vpor %xmm6, %xmm7, %xmm6 5167; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm7 5168; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm7[1],ymm2[2,3,4],ymm7[5],ymm2[6,7,8],ymm7[9],ymm2[10,11,12],ymm7[13],ymm2[14,15] 5169; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 5170; AVX2-NEXT: vpblendvb %ymm11, %ymm6, %ymm2, %ymm2 5171; AVX2-NEXT: vextracti128 $1, %ymm14, %xmm6 5172; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u] 5173; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u] 5174; AVX2-NEXT: vpor %xmm6, %xmm7, %xmm6 5175; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] 5176; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] 5177; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] 5178; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm5 5179; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 5180; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7] 5181; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm4[1,2,3,4,5,6,7],ymm0[8],ymm4[9,10,11,12,13,14,15] 5182; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] 5183; AVX2-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload 5184; AVX2-NEXT: # ymm4 = ymm3[0],mem[1,2,3,4,5,6,7],ymm3[8],mem[9,10,11,12,13,14,15] 5185; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] 5186; AVX2-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload 5187; AVX2-NEXT: # ymm4 = ymm9[0],mem[1,2,3,4,5,6,7],ymm9[8],mem[9,10,11,12,13,14,15] 5188; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] 5189; AVX2-NEXT: vpblendw $254, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload 5190; AVX2-NEXT: # ymm5 = ymm1[0],mem[1,2,3,4,5,6,7],ymm1[8],mem[9,10,11,12,13,14,15] 5191; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] 5192; AVX2-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload 5193; AVX2-NEXT: # ymm5 = ymm2[0],mem[1,2,3,4,5,6,7],ymm2[8],mem[9,10,11,12,13,14,15] 5194; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] 5195; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 5196; AVX2-NEXT: vmovaps %ymm5, (%rsi) 5197; AVX2-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload 5198; AVX2-NEXT: vmovaps %ymm5, (%rdx) 5199; AVX2-NEXT: vmovdqa %ymm0, (%rcx) 5200; AVX2-NEXT: vmovdqa %ymm3, (%r8) 5201; AVX2-NEXT: vmovdqa %ymm4, (%r9) 5202; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 5203; AVX2-NEXT: vmovdqa %ymm1, (%rax) 5204; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 5205; AVX2-NEXT: vmovdqa %ymm2, (%rax) 5206; AVX2-NEXT: addq $72, %rsp 5207; AVX2-NEXT: vzeroupper 5208; AVX2-NEXT: retq 5209; 5210; AVX2-FP-LABEL: load_i8_stride7_vf32: 5211; AVX2-FP: # %bb.0: 5212; AVX2-FP-NEXT: subq $72, %rsp 5213; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm10 5214; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm11 5215; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm6 5216; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm7 5217; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm13 5218; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm3 5219; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] 5220; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm6, %ymm7, %ymm0 5221; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1 5222; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u] 5223; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u] 5224; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0 5225; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] 5226; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm3, %ymm13, %ymm1 5227; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm4 5228; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] 5229; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] 5230; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] 5231; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 5232; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] 5233; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm10, %ymm11, %ymm1 5234; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm4 5235; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u] 5236; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] 5237; AVX2-FP-NEXT: vpor %xmm4, %xmm1, %xmm1 5238; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] 5239; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm6, %ymm7, %ymm4 5240; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm5 5241; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u,u,u] 5242; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,8,15],zero,zero,xmm4[4,11],zero,zero,xmm4[u,u,u,u,u,u,u] 5243; AVX2-FP-NEXT: vpor %xmm5, %xmm4, %xmm4 5244; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm13, %ymm3, %ymm5 5245; AVX2-FP-NEXT: vextracti128 $1, %ymm5, 
%xmm8 5246; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm8[2],ymm5[3,4,5],ymm8[6],ymm5[7,8,9],ymm8[10],ymm5[11,12,13],ymm8[14],ymm5[15] 5247; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] 5248; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} ymm8 = [0,18446744073709551360,16777215,0] 5249; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm5, %ymm4, %ymm8 5250; AVX2-FP-NEXT: vmovdqa 192(%rdi), %xmm4 5251; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] 5252; AVX2-FP-NEXT: vmovdqa 208(%rdi), %xmm5 5253; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] 5254; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3] 5255; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 5256; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 5257; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm9[7] 5258; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} ymm9 = [18446744073709551615,18446744073709551615,16777215,0] 5259; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0 5260; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5261; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm10, %ymm11, %ymm0 5262; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u] 5263; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm0 5264; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u] 5265; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0 5266; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] 5267; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] 5268; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] 5269; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 5270; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 5271; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] 5272; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm8, %ymm0, %ymm0 5273; AVX2-FP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill 5274; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm11, %ymm10, %ymm0 5275; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1 5276; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[2,9,u,u,u,u,u] 5277; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u] 5278; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0 5279; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,12] 5280; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero 5281; AVX2-FP-NEXT: vpor %xmm1, %xmm8, %xmm1 5282; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8 5283; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 5284; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] 5285; AVX2-FP-NEXT: # ymm0 = mem[0,1,0,1] 5286; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm8, %ymm1, %ymm1 5287; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5288; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] 5289; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm11, %ymm10, %ymm1 5290; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u,u,u] 5291; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm1 5292; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm1[u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u,u,u] 5293; AVX2-FP-NEXT: vpor %xmm1, %xmm8, %xmm1 5294; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[6,13] 5295; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero 5296; AVX2-FP-NEXT: vpor %xmm8, %xmm12, %xmm8 5297; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 5298; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 5299; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm8, %ymm1 5300; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5301; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535] 5302; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm11, %ymm10, %ymm1 5303; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,u,3,10],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[u,u,u,u,u] 5304; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm1 5305; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11,u,u,u,u,u] 5306; AVX2-FP-NEXT: vpor %xmm1, %xmm12, %xmm1 5307; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero 5308; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,7,14] 5309; AVX2-FP-NEXT: vpor %xmm12, %xmm14, %xmm12 5310; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 5311; AVX2-FP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 5312; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm12, %ymm1 5313; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5314; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm11, %ymm10, %ymm14 5315; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] 5316; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 5317; AVX2-FP-NEXT: vextracti128 $1, %ymm10, %xmm11 5318; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u] 5319; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u] 5320; AVX2-FP-NEXT: vpor %xmm11, %xmm10, %xmm10 5321; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero 5322; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,8,15] 5323; AVX2-FP-NEXT: vpor %xmm11, %xmm15, %xmm11 5324; AVX2-FP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 5325; AVX2-FP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 5326; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm10, %ymm11, %ymm0 5327; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5328; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] 5329; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm7, %ymm6, %ymm10 5330; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm6, %ymm7, %ymm11 5331; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm15 5332; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm7, %ymm6, %ymm1 5333; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] 5334; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm7, %ymm6, %ymm6 5335; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] 5336; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm13, %ymm3, %ymm0 5337; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm13, %ymm3, %ymm9 5338; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm13, %ymm3, %ymm8 5339; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm3, %ymm13, %ymm7 5340; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm3, %ymm13, %ymm2 5341; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = 
xmm11[2,9],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[u,u,u,u,u,u,u] 5342; AVX2-FP-NEXT: vextracti128 $1, %ymm11, %xmm11 5343; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[0,7,14],zero,zero,xmm11[3,10,u,u,u,u,u,u,u] 5344; AVX2-FP-NEXT: vpor %xmm3, %xmm11, %xmm3 5345; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm11 5346; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3],ymm0[4,5],ymm11[6],ymm0[7,8,9,10],ymm11[11],ymm0[12,13],ymm11[14],ymm0[15] 5347; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 5348; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} xmm11 = [18446744073709551615,255] 5349; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm3, %ymm0, %ymm0 5350; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[3,10],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[u,u,u,u,u,u,u] 5351; AVX2-FP-NEXT: vextracti128 $1, %ymm15, %xmm13 5352; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u,u,u] 5353; AVX2-FP-NEXT: vpor %xmm3, %xmm13, %xmm3 5354; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm13 5355; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm13[0],ymm9[1,2],ymm13[3],ymm9[4,5,6],ymm13[7,8],ymm9[9,10],ymm13[11],ymm9[12,13,14],ymm13[15] 5356; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 5357; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm3, %ymm9, %ymm3 5358; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm9 5359; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u,u,u,u,u,u] 5360; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11],zero,zero,xmm1[0,7,14],zero,zero,xmm1[u,u,u,u,u,u,u] 5361; AVX2-FP-NEXT: vpor %xmm1, %xmm9, %xmm1 5362; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm9 5363; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3],ymm9[4],ymm8[5,6],ymm9[7,8],ymm8[9,10,11],ymm9[12],ymm8[13,14],ymm9[15] 5364; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 5365; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm1, %ymm8, %ymm9 5366; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm1 5367; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[3,10],zero,zero,zero,xmm1[6,13,u,u,u,u,u,u,u] 5368; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[5,12],zero,zero,xmm6[1,8,15],zero,zero,xmm6[u,u,u,u,u,u,u] 5369; AVX2-FP-NEXT: vpor %xmm1, %xmm6, %xmm1 5370; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm6 5371; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13,14,15] 5372; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 5373; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm1, %ymm6, %ymm1 5374; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[6,13],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u] 5375; AVX2-FP-NEXT: vextracti128 $1, %ymm10, %xmm7 5376; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[4,11],zero,zero,xmm7[0,7,14,u,u,u,u,u,u,u] 5377; AVX2-FP-NEXT: vpor %xmm6, %xmm7, %xmm6 5378; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm7 5379; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm7[1],ymm2[2,3,4],ymm7[5],ymm2[6,7,8],ymm7[9],ymm2[10,11,12],ymm7[13],ymm2[14,15] 5380; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 5381; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm6, %ymm2, %ymm2 5382; AVX2-FP-NEXT: vextracti128 $1, %ymm14, %xmm6 5383; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = 
xmm6[u,u],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u] 5384; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u] 5385; AVX2-FP-NEXT: vpor %xmm6, %xmm7, %xmm6 5386; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] 5387; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] 5388; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] 5389; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm5 5390; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 5391; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7] 5392; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm4[1,2,3,4,5,6,7],ymm0[8],ymm4[9,10,11,12,13,14,15] 5393; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] 5394; AVX2-FP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload 5395; AVX2-FP-NEXT: # ymm4 = ymm3[0],mem[1,2,3,4,5,6,7],ymm3[8],mem[9,10,11,12,13,14,15] 5396; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] 5397; AVX2-FP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload 5398; AVX2-FP-NEXT: # ymm4 = ymm9[0],mem[1,2,3,4,5,6,7],ymm9[8],mem[9,10,11,12,13,14,15] 5399; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] 5400; AVX2-FP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload 5401; AVX2-FP-NEXT: # ymm5 = ymm1[0],mem[1,2,3,4,5,6,7],ymm1[8],mem[9,10,11,12,13,14,15] 5402; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] 5403; AVX2-FP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload 5404; AVX2-FP-NEXT: # ymm5 = ymm2[0],mem[1,2,3,4,5,6,7],ymm2[8],mem[9,10,11,12,13,14,15] 5405; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] 5406; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 5407; AVX2-FP-NEXT: vmovaps %ymm5, (%rsi) 5408; AVX2-FP-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload 5409; AVX2-FP-NEXT: vmovaps %ymm5, (%rdx) 5410; AVX2-FP-NEXT: vmovdqa %ymm0, (%rcx) 5411; AVX2-FP-NEXT: vmovdqa %ymm3, (%r8) 5412; AVX2-FP-NEXT: vmovdqa %ymm4, (%r9) 5413; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 5414; AVX2-FP-NEXT: vmovdqa %ymm1, (%rax) 5415; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 5416; AVX2-FP-NEXT: vmovdqa %ymm2, (%rax) 5417; AVX2-FP-NEXT: addq $72, %rsp 5418; AVX2-FP-NEXT: vzeroupper 5419; AVX2-FP-NEXT: retq 5420; 5421; AVX2-FCP-LABEL: load_i8_stride7_vf32: 5422; AVX2-FCP: # %bb.0: 5423; AVX2-FCP-NEXT: subq $40, %rsp 5424; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm10 5425; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm8 5426; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm9 5427; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm5 5428; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm6 5429; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm3 5430; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm4 5431; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] 5432; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm5, %ymm6, %ymm0 5433; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 5434; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u] 5435; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u] 5436; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 5437; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] 5438; AVX2-FCP-NEXT: 
vpblendvb %ymm11, %ymm4, %ymm3, %ymm1 5439; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 5440; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] 5441; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] 5442; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] 5443; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 5444; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] 5445; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm8, %ymm9, %ymm1 5446; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 5447; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u] 5448; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] 5449; AVX2-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 5450; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 5451; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,2,4,6] 5452; AVX2-FCP-NEXT: vpermd %ymm10, %ymm2, %ymm2 5453; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5454; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] 5455; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] 5456; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} ymm2 = [18446744073709551615,18446744073709551615,16777215,0] 5457; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 5458; AVX2-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill 5459; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] 5460; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm5, %ymm6, %ymm0 5461; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 5462; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[6,13],zero,zero,xmm1[2,9,u,u,u,u,u,u,u] 5463; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u,u,u] 5464; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 5465; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm3, %ymm4, %ymm1 5466; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm7 5467; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm7[2],ymm1[3,4,5],ymm7[6],ymm1[7,8,9],ymm7[10],ymm1[11,12,13],ymm7[14],ymm1[15] 5468; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] 5469; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} ymm7 = [0,18446744073709551360,16777215,0] 5470; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 5471; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm8, %ymm9, %ymm1 5472; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u] 5473; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 5474; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u] 5475; AVX2-FCP-NEXT: vpor %xmm7, %xmm1, %xmm1 5476; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 5477; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,1,3,4,6] 5478; AVX2-FCP-NEXT: vpermd %ymm10, %ymm7, %ymm7 5479; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] 5480; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm7[7] 5481; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 5482; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5483; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm9, %ymm8, 
%ymm0 5484; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 5485; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[2,9,u,u,u,u,u] 5486; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u] 5487; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm1 5488; AVX2-FCP-NEXT: vmovdqa 208(%rdi), %xmm2 5489; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,12] 5490; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %xmm0 5491; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero 5492; AVX2-FCP-NEXT: vpor %xmm7, %xmm10, %xmm7 5493; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm10 5494; AVX2-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 5495; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] 5496; AVX2-FCP-NEXT: # ymm1 = mem[0,1,0,1] 5497; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm10, %ymm7, %ymm7 5498; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5499; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] 5500; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm9, %ymm8, %ymm7 5501; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u] 5502; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 5503; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u] 5504; AVX2-FCP-NEXT: vpor %xmm7, %xmm10, %xmm7 5505; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[6,13] 5506; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero 5507; AVX2-FCP-NEXT: vpor %xmm10, %xmm14, %xmm10 5508; AVX2-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 5509; AVX2-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 5510; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm7, %ymm10, %ymm7 5511; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5512; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535] 5513; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm9, %ymm8, %ymm7 5514; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[u,u,u,u,u] 5515; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 5516; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11,u,u,u,u,u] 5517; AVX2-FCP-NEXT: vpor %xmm7, %xmm15, %xmm7 5518; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero 5519; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,7,14] 5520; AVX2-FCP-NEXT: vpor %xmm15, %xmm12, %xmm12 5521; AVX2-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 5522; AVX2-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 5523; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm7, %ymm12, %ymm15 5524; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] 5525; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm9, %ymm8, %ymm12 5526; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] 5527; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm8, %ymm9, %ymm8 5528; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 5529; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u,u,u,u] 5530; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = 
xmm8[u,u,4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u] 5531; AVX2-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8 5532; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero 5533; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,8,15] 5534; AVX2-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0 5535; AVX2-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm2 5536; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 5537; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm14 5538; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm6, %ymm5, %ymm11 5539; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm5, %ymm6, %ymm2 5540; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm5, %ymm6, %ymm8 5541; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm6, %ymm5, %ymm9 5542; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] 5543; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm6, %ymm5, %ymm5 5544; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] 5545; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm0 5546; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm3, %ymm4, %ymm13 5547; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm3, %ymm4, %ymm10 5548; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm6 5549; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm4, %ymm3, %ymm3 5550; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u] 5551; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 5552; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u,u,u] 5553; AVX2-FCP-NEXT: vpor %xmm4, %xmm2, %xmm2 5554; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4 5555; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7,8,9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15] 5556; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 5557; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} xmm7 = [18446744073709551615,255] 5558; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm2 5559; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u,u,u] 5560; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm4 5561; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11,u,u,u,u,u,u,u] 5562; AVX2-FCP-NEXT: vpor %xmm0, %xmm4, %xmm0 5563; AVX2-FCP-NEXT: vextracti128 $1, %ymm13, %xmm4 5564; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm13[1,2],ymm4[3],ymm13[4,5,6],ymm4[7,8],ymm13[9,10],ymm4[11],ymm13[12,13,14],ymm4[15] 5565; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 5566; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm0, %ymm4, %ymm4 5567; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm0 5568; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[2,9],zero,zero,zero,xmm0[5,12,u,u,u,u,u,u,u] 5569; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[4,11],zero,zero,xmm9[0,7,14],zero,zero,xmm9[u,u,u,u,u,u,u] 5570; AVX2-FCP-NEXT: vpor %xmm0, %xmm8, %xmm0 5571; AVX2-FCP-NEXT: vextracti128 $1, %ymm10, %xmm8 5572; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm10[1,2,3],ymm8[4],ymm10[5,6],ymm8[7,8],ymm10[9,10,11],ymm8[12],ymm10[13,14],ymm8[15] 5573; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 5574; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm0, %ymm8, %ymm0 5575; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm8 5576; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = 
zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u] 5577; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[5,12],zero,zero,xmm5[1,8,15],zero,zero,xmm5[u,u,u,u,u,u,u] 5578; AVX2-FCP-NEXT: vpor %xmm5, %xmm8, %xmm5 5579; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm8 5580; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5,6,7,8],ymm8[9],ymm6[10,11],ymm8[12],ymm6[13,14,15] 5581; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 5582; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 5583; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u,u,u,u] 5584; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm1 5585; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u] 5586; AVX2-FCP-NEXT: vpor %xmm6, %xmm1, %xmm1 5587; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm6 5588; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7,8],ymm6[9],ymm3[10,11,12],ymm6[13],ymm3[14,15] 5589; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 5590; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm1, %ymm3, %ymm1 5591; AVX2-FCP-NEXT: vextracti128 $1, %ymm12, %xmm3 5592; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u,u,u] 5593; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u] 5594; AVX2-FCP-NEXT: vpor %xmm3, %xmm6, %xmm3 5595; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,1,3,5,6] 5596; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload 5597; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 5598; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] 5599; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7] 5600; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0],ymm3[1,2,3,4,5,6,7],ymm2[8],ymm3[9,10,11,12,13,14,15] 5601; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] 5602; AVX2-FCP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload 5603; AVX2-FCP-NEXT: # ymm3 = ymm4[0],mem[1,2,3,4,5,6,7],ymm4[8],mem[9,10,11,12,13,14,15] 5604; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] 5605; AVX2-FCP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload 5606; AVX2-FCP-NEXT: # ymm4 = ymm0[0],mem[1,2,3,4,5,6,7],ymm0[8],mem[9,10,11,12,13,14,15] 5607; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] 5608; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm15[1,2,3,4,5,6,7],ymm5[8],ymm15[9,10,11,12,13,14,15] 5609; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] 5610; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0],ymm14[1,2,3,4,5,6,7],ymm1[8],ymm14[9,10,11,12,13,14,15] 5611; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] 5612; AVX2-FCP-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload 5613; AVX2-FCP-NEXT: vmovaps %ymm5, (%rsi) 5614; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 5615; AVX2-FCP-NEXT: vmovaps %ymm5, (%rdx) 5616; AVX2-FCP-NEXT: vmovdqa %ymm2, (%rcx) 5617; AVX2-FCP-NEXT: vmovdqa %ymm3, (%r8) 5618; AVX2-FCP-NEXT: vmovdqa %ymm0, (%r9) 5619; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 5620; AVX2-FCP-NEXT: vmovdqa %ymm4, (%rax) 5621; AVX2-FCP-NEXT: movq 
{{[0-9]+}}(%rsp), %rax 5622; AVX2-FCP-NEXT: vmovdqa %ymm1, (%rax) 5623; AVX2-FCP-NEXT: addq $40, %rsp 5624; AVX2-FCP-NEXT: vzeroupper 5625; AVX2-FCP-NEXT: retq 5626; 5627; AVX512-LABEL: load_i8_stride7_vf32: 5628; AVX512: # %bb.0: 5629; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 5630; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 5631; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] 5632; AVX512-NEXT: vmovdqa 128(%rdi), %ymm2 5633; AVX512-NEXT: vmovdqa 160(%rdi), %ymm3 5634; AVX512-NEXT: vmovdqa %ymm0, %ymm1 5635; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm2 ^ (ymm1 & (ymm3 ^ ymm2)) 5636; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm4 5637; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u] 5638; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] 5639; AVX512-NEXT: vpor %xmm4, %xmm1, %xmm1 5640; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 5641; AVX512-NEXT: vmovdqa 192(%rdi), %xmm4 5642; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11] 5643; AVX512-NEXT: vpshufb %xmm5, %xmm4, %xmm6 5644; AVX512-NEXT: vmovdqa64 %xmm5, %xmm20 5645; AVX512-NEXT: vmovdqa 208(%rdi), %xmm5 5646; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] 5647; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] 5648; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 5649; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1,2,3,4,5,6],ymm6[7] 5650; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] 5651; AVX512-NEXT: vmovdqa (%rdi), %ymm6 5652; AVX512-NEXT: vmovdqa 32(%rdi), %ymm7 5653; AVX512-NEXT: vmovdqa 64(%rdi), %ymm1 5654; AVX512-NEXT: vmovdqa %ymm14, %ymm9 5655; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm7 ^ (ymm9 & (ymm6 ^ ymm7)) 5656; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm10 5657; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[5,12],zero,zero,xmm10[1,8,15,u,u,u,u,u,u] 5658; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,7,14],zero,zero,xmm9[3,10],zero,zero,zero,xmm9[u,u,u,u,u,u] 5659; AVX512-NEXT: vpor %xmm10, %xmm9, %xmm13 5660; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] 5661; AVX512-NEXT: vmovdqa 96(%rdi), %ymm9 5662; AVX512-NEXT: vmovdqa %ymm11, %ymm15 5663; AVX512-NEXT: vpternlogq {{.*#+}} ymm15 = ymm1 ^ (ymm15 & (ymm9 ^ ymm1)) 5664; AVX512-NEXT: vmovdqa 80(%rdi), %xmm10 5665; AVX512-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7,8,9],ymm10[10],ymm15[11,12],ymm10[13],ymm15[14,15] 5666; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] 5667; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm13 & mem) 5668; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0] 5669; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm12 ^ (ymm16 & (ymm8 ^ ymm12)) 5670; AVX512-NEXT: vmovdqa64 %ymm8, %ymm18 5671; AVX512-NEXT: vmovdqa %ymm11, %ymm12 5672; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm2 ^ (ymm12 & (ymm3 ^ ymm2)) 5673; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,6,13],zero,zero,xmm12[2,9],zero,zero,zero,xmm12[u,u,u,u] 5674; AVX512-NEXT: vextracti128 $1, %ymm12, %xmm12 5675; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = 
xmm12[u,u,u],zero,zero,xmm12[4,11],zero,zero,xmm12[0,7,14,u,u,u,u] 5676; AVX512-NEXT: vpor %xmm13, %xmm12, %xmm12 5677; AVX512-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 5678; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] 5679; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] 5680; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3] 5681; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 5682; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5,6],ymm8[7] 5683; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] 5684; AVX512-NEXT: vmovdqa %ymm13, %ymm12 5685; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm7 ^ (ymm12 & (ymm6 ^ ymm7)) 5686; AVX512-NEXT: vextracti128 $1, %ymm12, %xmm15 5687; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u,u,u] 5688; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[1,8,15],zero,zero,xmm12[4,11],zero,zero,xmm12[u,u,u,u,u,u,u] 5689; AVX512-NEXT: vpor %xmm15, %xmm12, %xmm15 5690; AVX512-NEXT: vmovdqa %ymm14, %ymm12 5691; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm9 ^ (ymm12 & (ymm1 ^ ymm9)) 5692; AVX512-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm10[2],ymm12[3,4,5],ymm10[6],ymm12[7,8,9],ymm10[10],ymm12[11,12,13],ymm10[14],ymm12[15] 5693; AVX512-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] 5694; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 | (ymm15 & ~mem) 5695; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm8 ^ (ymm16 & (ymm12 ^ ymm8)) 5696; AVX512-NEXT: vmovdqa64 %ymm12, %ymm19 5697; AVX512-NEXT: vmovdqa %ymm0, %ymm8 5698; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm7 ^ (ymm8 & (ymm6 ^ ymm7)) 5699; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u,u,u] 5700; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm8 5701; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u,u,u,u,u] 5702; AVX512-NEXT: vpor %xmm15, %xmm8, %xmm8 5703; AVX512-NEXT: vmovdqa %ymm13, %ymm15 5704; AVX512-NEXT: vpternlogq {{.*#+}} ymm15 = ymm9 ^ (ymm15 & (ymm1 ^ ymm9)) 5705; AVX512-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0,1,2],ymm10[3],ymm15[4,5],ymm10[6],ymm15[7,8,9,10],ymm10[11],ymm15[12,13],ymm10[14],ymm15[15] 5706; AVX512-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 5707; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm17 = [18446744073709551615,255,18446744073709486080,18446744073709551615] 5708; AVX512-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm8 & ymm17) 5709; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = ymm3 ^ (ymm14 & (ymm2 ^ ymm3)) 5710; AVX512-NEXT: vextracti128 $1, %ymm14, %xmm8 5711; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[1,8,15,u,u,u,u] 5712; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u] 5713; AVX512-NEXT: vpor %xmm8, %xmm14, %xmm8 5714; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 5715; AVX512-NEXT: vmovdqa64 %xmm20, %xmm12 5716; AVX512-NEXT: vpshufb %xmm12, %xmm5, %xmm14 5717; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] 5718; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3] 
5719; AVX512-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 5720; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm12[7] 5721; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0],ymm8[1,2,3,4,5,6,7],ymm15[8],ymm8[9,10,11,12,13,14,15] 5722; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7] 5723; AVX512-NEXT: vmovdqa64 %ymm8, %ymm20 5724; AVX512-NEXT: vmovdqa %ymm13, %ymm8 5725; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm3 ^ (ymm8 & (ymm2 ^ ymm3)) 5726; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm12 5727; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u] 5728; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,1,8,15],zero,zero,xmm8[4,11],zero,zero,xmm8[u,u,u,u,u] 5729; AVX512-NEXT: vpor %xmm12, %xmm8, %xmm8 5730; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 5731; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,12] 5732; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero 5733; AVX512-NEXT: vpor %xmm12, %xmm14, %xmm12 5734; AVX512-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 5735; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215] 5736; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm16 & (ymm12 ^ ymm8)) 5737; AVX512-NEXT: vmovdqa %ymm11, %ymm8 5738; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm7 ^ (ymm8 & (ymm6 ^ ymm7)) 5739; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u,u,u] 5740; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm8 5741; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u,u,u] 5742; AVX512-NEXT: vpor %xmm14, %xmm8, %xmm8 5743; AVX512-NEXT: vmovdqa %ymm0, %ymm14 5744; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = ymm9 ^ (ymm14 & (ymm1 ^ ymm9)) 5745; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm10[0],ymm14[1,2],ymm10[3],ymm14[4,5,6],ymm10[7,8],ymm14[9,10],ymm10[11],ymm14[12,13,14],ymm10[15] 5746; AVX512-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 5747; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm8 & ymm17) 5748; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm14[0],ymm12[1,2,3,4,5,6,7],ymm14[8],ymm12[9,10,11,12,13,14,15] 5749; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] 5750; AVX512-NEXT: vmovdqa64 %ymm8, %ymm21 5751; AVX512-NEXT: vmovdqa %ymm0, %ymm8 5752; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm3 ^ (ymm8 & (ymm2 ^ ymm3)) 5753; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u] 5754; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm8 5755; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u,u,u] 5756; AVX512-NEXT: vpor %xmm12, %xmm8, %xmm8 5757; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 5758; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[6,13] 5759; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero 5760; AVX512-NEXT: vpor %xmm12, %xmm14, %xmm12 5761; AVX512-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 5762; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm16 & (ymm12 ^ ymm8)) 5763; AVX512-NEXT: vmovdqa %ymm13, %ymm8 5764; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm6 ^ (ymm8 & (ymm7 ^ ymm6)) 5765; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm14 5766; AVX512-NEXT: vpshufb 
{{.*#+}} xmm14 = zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12,u,u,u,u,u,u,u] 5767; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u] 5768; AVX512-NEXT: vpor %xmm14, %xmm8, %xmm8 5769; AVX512-NEXT: vmovdqa %ymm11, %ymm14 5770; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = ymm9 ^ (ymm14 & (ymm1 ^ ymm9)) 5771; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm10[0],ymm14[1,2,3],ymm10[4],ymm14[5,6],ymm10[7,8],ymm14[9,10,11],ymm10[12],ymm14[13,14],ymm10[15] 5772; AVX512-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 5773; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm8 & ymm17) 5774; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm14[0],ymm12[1,2,3,4,5,6,7],ymm14[8],ymm12[9,10,11,12,13,14,15] 5775; AVX512-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm8[4,5,6,7] 5776; AVX512-NEXT: vmovdqa %ymm11, %ymm8 5777; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm3 ^ (ymm8 & (ymm2 ^ ymm3)) 5778; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u] 5779; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm8 5780; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u] 5781; AVX512-NEXT: vpor %xmm12, %xmm8, %xmm8 5782; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 5783; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero 5784; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,7,14] 5785; AVX512-NEXT: vpor %xmm12, %xmm15, %xmm12 5786; AVX512-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 5787; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm16 & (ymm12 ^ ymm8)) 5788; AVX512-NEXT: vmovdqa %ymm0, %ymm8 5789; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm6 ^ (ymm8 & (ymm7 ^ ymm6)) 5790; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm15 5791; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u] 5792; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[5,12],zero,zero,xmm8[1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u] 5793; AVX512-NEXT: vpor %xmm15, %xmm8, %xmm8 5794; AVX512-NEXT: vmovdqa %ymm13, %ymm15 5795; AVX512-NEXT: vpternlogq {{.*#+}} ymm15 = ymm1 ^ (ymm15 & (ymm9 ^ ymm1)) 5796; AVX512-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0],ymm10[1],ymm15[2,3],ymm10[4],ymm15[5,6,7,8],ymm10[9],ymm15[10,11],ymm10[12],ymm15[13,14,15] 5797; AVX512-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 5798; AVX512-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm8 & ymm17) 5799; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0],ymm12[1,2,3,4,5,6,7],ymm15[8],ymm12[9,10,11,12,13,14,15] 5800; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7] 5801; AVX512-NEXT: vpternlogq {{.*#+}} ymm13 = ymm2 ^ (ymm13 & (ymm3 ^ ymm2)) 5802; AVX512-NEXT: vextracti128 $1, %ymm13, %xmm2 5803; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u] 5804; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,4,11],zero,zero,xmm13[0,7,14],zero,zero,xmm13[u,u,u,u,u] 5805; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2 5806; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 5807; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero 5808; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = 
xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,8,15] 5809; AVX512-NEXT: vpor %xmm3, %xmm4, %xmm3 5810; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 5811; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm16 & (ymm3 ^ ymm2)) 5812; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm6 ^ (ymm11 & (ymm7 ^ ymm6)) 5813; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm11[6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u,u,u,u] 5814; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm4 5815; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[4,11],zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u] 5816; AVX512-NEXT: vpor %xmm2, %xmm4, %xmm2 5817; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm1 ^ (ymm0 & (ymm9 ^ ymm1)) 5818; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7,8],ymm10[9],ymm0[10,11,12],ymm10[13],ymm0[14,15] 5819; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 5820; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm2 & ymm17) 5821; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm3[1,2,3,4,5,6,7],ymm0[8],ymm3[9,10,11,12,13,14,15] 5822; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 5823; AVX512-NEXT: vmovdqa64 %ymm18, (%rsi) 5824; AVX512-NEXT: vmovdqa64 %ymm19, (%rdx) 5825; AVX512-NEXT: vmovdqa64 %ymm20, (%rcx) 5826; AVX512-NEXT: vmovdqa64 %ymm21, (%r8) 5827; AVX512-NEXT: vmovdqa %ymm14, (%r9) 5828; AVX512-NEXT: vmovdqa %ymm8, (%r10) 5829; AVX512-NEXT: vmovdqa %ymm0, (%rax) 5830; AVX512-NEXT: vzeroupper 5831; AVX512-NEXT: retq 5832; 5833; AVX512-FCP-LABEL: load_i8_stride7_vf32: 5834; AVX512-FCP: # %bb.0: 5835; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 5836; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 5837; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] 5838; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 5839; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm3 5840; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm1 5841; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm2 ^ (ymm1 & (ymm3 ^ ymm2)) 5842; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4 5843; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u] 5844; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] 5845; AVX512-FCP-NEXT: vpor %xmm4, %xmm1, %xmm1 5846; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 5847; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,2,4,6] 5848; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm12 5849; AVX512-FCP-NEXT: vpermd %ymm12, %ymm4, %ymm4 5850; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] 5851; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5,6],ymm4[7] 5852; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] 5853; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm4 5854; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 5855; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm1 5856; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm7 5857; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm5 ^ (ymm7 & (ymm4 ^ ymm5)) 5858; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 5859; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[5,12],zero,zero,xmm8[1,8,15,u,u,u,u,u,u] 5860; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = 
xmm7[0,7,14],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[u,u,u,u,u,u] 5861; AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm10 5862; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] 5863; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm7 5864; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm11 5865; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm1 ^ (ymm11 & (ymm7 ^ ymm1)) 5866; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm8 5867; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm8[2],ymm11[3,4],ymm8[5],ymm11[6,7,8,9],ymm8[10],ymm11[11,12],ymm8[13],ymm11[14,15] 5868; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] 5869; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm10 & mem) 5870; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0] 5871; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm6 ^ (ymm16 & (ymm11 ^ ymm6)) 5872; AVX512-FCP-NEXT: vmovdqa64 %ymm11, %ymm18 5873; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm6 5874; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm2 ^ (ymm6 & (ymm3 ^ ymm2)) 5875; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[u,u,u,6,13],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[u,u,u,u] 5876; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 5877; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,xmm6[4,11],zero,zero,xmm6[0,7,14,u,u,u,u] 5878; AVX512-FCP-NEXT: vpor %xmm6, %xmm10, %xmm6 5879; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 5880; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,1,3,4,6] 5881; AVX512-FCP-NEXT: vpermd %ymm12, %ymm10, %ymm10 5882; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] 5883; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7] 5884; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] 5885; AVX512-FCP-NEXT: vmovdqa %ymm11, %ymm10 5886; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm5 ^ (ymm10 & (ymm4 ^ ymm5)) 5887; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm14 5888; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[6,13],zero,zero,xmm14[2,9,u,u,u,u,u,u,u] 5889; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[1,8,15],zero,zero,xmm10[4,11],zero,zero,xmm10[u,u,u,u,u,u,u] 5890; AVX512-FCP-NEXT: vpor %xmm14, %xmm10, %xmm14 5891; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm10 5892; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm7 ^ (ymm10 & (ymm1 ^ ymm7)) 5893; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm8[2],ymm10[3,4,5],ymm8[6],ymm10[7,8,9],ymm8[10],ymm10[11,12,13],ymm8[14],ymm10[15] 5894; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] 5895; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm10 | (ymm14 & ~mem) 5896; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm6 ^ (ymm16 & (ymm10 ^ ymm6)) 5897; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm6 5898; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm5 ^ (ymm6 & (ymm4 ^ ymm5)) 5899; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[2,9],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[u,u,u,u,u,u,u] 5900; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 5901; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10,u,u,u,u,u,u,u] 5902; AVX512-FCP-NEXT: vpor %xmm6, %xmm14, %xmm6 5903; AVX512-FCP-NEXT: vmovdqa 
%ymm11, %ymm14 5904; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm7 ^ (ymm14 & (ymm1 ^ ymm7)) 5905; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1,2],ymm8[3],ymm14[4,5],ymm8[6],ymm14[7,8,9,10],ymm8[11],ymm14[12,13],ymm8[14],ymm14[15] 5906; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 5907; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm17 = [18446744073709551615,255,18446744073709486080,18446744073709551615] 5908; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm6 & ymm17) 5909; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm3 ^ (ymm13 & (ymm2 ^ ymm3)) 5910; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm6 5911; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u] 5912; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,0,7,14],zero,zero,xmm13[3,10],zero,zero,zero,xmm13[u,u,u,u] 5913; AVX512-FCP-NEXT: vpor %xmm6, %xmm13, %xmm6 5914; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 5915; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,0,0,0,1,3,5,6] 5916; AVX512-FCP-NEXT: vpermd %ymm12, %ymm13, %ymm12 5917; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] 5918; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm12[7] 5919; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm14[0],ymm6[1,2,3,4,5,6,7],ymm14[8],ymm6[9,10,11,12,13,14,15] 5920; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm6[4,5,6,7] 5921; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm19 5922; AVX512-FCP-NEXT: vmovdqa %ymm11, %ymm6 5923; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm3 ^ (ymm6 & (ymm2 ^ ymm3)) 5924; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm12 5925; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u] 5926; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u] 5927; AVX512-FCP-NEXT: vpor %xmm6, %xmm12, %xmm6 5928; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 5929; AVX512-FCP-NEXT: vmovdqa 208(%rdi), %xmm14 5930; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm14[5,12] 5931; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %xmm12 5932; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero 5933; AVX512-FCP-NEXT: vpor %xmm13, %xmm15, %xmm13 5934; AVX512-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 5935; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215] 5936; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm16 & (ymm13 ^ ymm6)) 5937; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm6 5938; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm5 ^ (ymm6 & (ymm4 ^ ymm5)) 5939; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[3,10],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[u,u,u,u,u,u,u] 5940; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 5941; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,11,u,u,u,u,u,u,u] 5942; AVX512-FCP-NEXT: vpor %xmm6, %xmm15, %xmm6 5943; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm15 5944; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm7 ^ (ymm15 & (ymm1 ^ ymm7)) 5945; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm8[0],ymm15[1,2],ymm8[3],ymm15[4,5,6],ymm8[7,8],ymm15[9,10],ymm8[11],ymm15[12,13,14],ymm8[15] 5946; AVX512-FCP-NEXT: vpshufb {{.*#+}} 
ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 5947; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm6 & ymm17) 5948; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm15[0],ymm13[1,2,3,4,5,6,7],ymm15[8],ymm13[9,10,11,12,13,14,15] 5949; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm6[4,5,6,7] 5950; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm20 5951; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm6 5952; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm3 ^ (ymm6 & (ymm2 ^ ymm3)) 5953; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[u,u,2,9],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[u,u,u,u,u] 5954; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 5955; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10,u,u,u,u,u] 5956; AVX512-FCP-NEXT: vpor %xmm6, %xmm13, %xmm6 5957; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 5958; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm14[6,13] 5959; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero 5960; AVX512-FCP-NEXT: vpor %xmm13, %xmm15, %xmm13 5961; AVX512-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 5962; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm16 & (ymm13 ^ ymm6)) 5963; AVX512-FCP-NEXT: vmovdqa %ymm11, %ymm6 5964; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm4 ^ (ymm6 & (ymm5 ^ ymm4)) 5965; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm15 5966; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u,u,u,u,u,u] 5967; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u,u,u] 5968; AVX512-FCP-NEXT: vpor %xmm6, %xmm15, %xmm6 5969; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm15 5970; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm7 ^ (ymm15 & (ymm1 ^ ymm7)) 5971; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm8[0],ymm15[1,2,3],ymm8[4],ymm15[5,6],ymm8[7,8],ymm15[9,10,11],ymm8[12],ymm15[13,14],ymm8[15] 5972; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 5973; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm6 & ymm17) 5974; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm15[0],ymm13[1,2,3,4,5,6,7],ymm15[8],ymm13[9,10,11,12,13,14,15] 5975; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm6[4,5,6,7] 5976; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm21 5977; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm6 5978; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm3 ^ (ymm6 & (ymm2 ^ ymm3)) 5979; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[u,u,3,10],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[u,u,u,u,u] 5980; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 5981; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,11,u,u,u,u,u] 5982; AVX512-FCP-NEXT: vpor %xmm6, %xmm15, %xmm6 5983; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 5984; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero 5985; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm14[0,7,14] 5986; AVX512-FCP-NEXT: vpor %xmm15, %xmm13, %xmm13 5987; AVX512-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 5988; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm16 & (ymm13 ^ ymm6)) 5989; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm6 5990; 
AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm4 ^ (ymm6 & (ymm5 ^ ymm4)) 5991; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm15 5992; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u] 5993; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[5,12],zero,zero,xmm6[1,8,15],zero,zero,xmm6[u,u,u,u,u,u,u] 5994; AVX512-FCP-NEXT: vpor %xmm6, %xmm15, %xmm6 5995; AVX512-FCP-NEXT: vmovdqa %ymm11, %ymm15 5996; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm1 ^ (ymm15 & (ymm7 ^ ymm1)) 5997; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0],ymm8[1],ymm15[2,3],ymm8[4],ymm15[5,6,7,8],ymm8[9],ymm15[10,11],ymm8[12],ymm15[13,14,15] 5998; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 5999; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm6 & ymm17) 6000; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm15[0],ymm13[1,2,3,4,5,6,7],ymm15[8],ymm13[9,10,11,12,13,14,15] 6001; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm6[4,5,6,7] 6002; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm2 ^ (ymm11 & (ymm3 ^ ymm2)) 6003; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm2 6004; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u] 6005; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[u,u,4,11],zero,zero,xmm11[0,7,14],zero,zero,xmm11[u,u,u,u,u] 6006; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 6007; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 6008; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero 6009; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm14[1,8,15] 6010; AVX512-FCP-NEXT: vpor %xmm3, %xmm11, %xmm3 6011; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 6012; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm16 & (ymm3 ^ ymm2)) 6013; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm4 ^ (ymm9 & (ymm5 ^ ymm4)) 6014; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[6,13],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u] 6015; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm4 6016; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[4,11],zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u] 6017; AVX512-FCP-NEXT: vpor %xmm2, %xmm4, %xmm2 6018; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm1 ^ (ymm0 & (ymm7 ^ ymm1)) 6019; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7,8],ymm8[9],ymm0[10,11,12],ymm8[13],ymm0[14,15] 6020; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 6021; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm2 & ymm17) 6022; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm3[1,2,3,4,5,6,7],ymm0[8],ymm3[9,10,11,12,13,14,15] 6023; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6024; AVX512-FCP-NEXT: vmovdqa64 %ymm18, (%rsi) 6025; AVX512-FCP-NEXT: vmovdqa %ymm10, (%rdx) 6026; AVX512-FCP-NEXT: vmovdqa64 %ymm19, (%rcx) 6027; AVX512-FCP-NEXT: vmovdqa64 %ymm20, (%r8) 6028; AVX512-FCP-NEXT: vmovdqa64 %ymm21, (%r9) 6029; AVX512-FCP-NEXT: vmovdqa %ymm6, (%r10) 6030; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rax) 6031; AVX512-FCP-NEXT: vzeroupper 6032; AVX512-FCP-NEXT: retq 6033; 6034; AVX512DQ-LABEL: load_i8_stride7_vf32: 6035; AVX512DQ: # %bb.0: 6036; AVX512DQ-NEXT: movq 
{{[0-9]+}}(%rsp), %rax 6037; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 6038; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] 6039; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm2 6040; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm3 6041; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1 6042; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm2 ^ (ymm1 & (ymm3 ^ ymm2)) 6043; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm4 6044; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u] 6045; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] 6046; AVX512DQ-NEXT: vpor %xmm4, %xmm1, %xmm1 6047; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 6048; AVX512DQ-NEXT: vmovdqa 192(%rdi), %xmm4 6049; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11] 6050; AVX512DQ-NEXT: vpshufb %xmm5, %xmm4, %xmm6 6051; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm20 6052; AVX512DQ-NEXT: vmovdqa 208(%rdi), %xmm5 6053; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] 6054; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] 6055; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 6056; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1,2,3,4,5,6],ymm6[7] 6057; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] 6058; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm6 6059; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm7 6060; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm1 6061; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm9 6062; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm7 ^ (ymm9 & (ymm6 ^ ymm7)) 6063; AVX512DQ-NEXT: vextracti128 $1, %ymm9, %xmm10 6064; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[5,12],zero,zero,xmm10[1,8,15,u,u,u,u,u,u] 6065; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,7,14],zero,zero,xmm9[3,10],zero,zero,zero,xmm9[u,u,u,u,u,u] 6066; AVX512DQ-NEXT: vpor %xmm10, %xmm9, %xmm13 6067; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] 6068; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm9 6069; AVX512DQ-NEXT: vmovdqa %ymm11, %ymm15 6070; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm15 = ymm1 ^ (ymm15 & (ymm9 ^ ymm1)) 6071; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm10 6072; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7,8,9],ymm10[10],ymm15[11,12],ymm10[13],ymm15[14,15] 6073; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] 6074; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm13 & mem) 6075; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0] 6076; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm12 ^ (ymm16 & (ymm8 ^ ymm12)) 6077; AVX512DQ-NEXT: vmovdqa64 %ymm8, %ymm18 6078; AVX512DQ-NEXT: vmovdqa %ymm11, %ymm12 6079; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm2 ^ (ymm12 & (ymm3 ^ ymm2)) 6080; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,6,13],zero,zero,xmm12[2,9],zero,zero,zero,xmm12[u,u,u,u] 6081; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm12 6082; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,xmm12[4,11],zero,zero,xmm12[0,7,14,u,u,u,u] 6083; AVX512DQ-NEXT: vpor %xmm13, %xmm12, %xmm12 6084; AVX512DQ-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 6085; 
AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] 6086; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] 6087; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3] 6088; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 6089; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5,6],ymm8[7] 6090; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] 6091; AVX512DQ-NEXT: vmovdqa %ymm13, %ymm12 6092; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm7 ^ (ymm12 & (ymm6 ^ ymm7)) 6093; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm15 6094; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u,u,u] 6095; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[1,8,15],zero,zero,xmm12[4,11],zero,zero,xmm12[u,u,u,u,u,u,u] 6096; AVX512DQ-NEXT: vpor %xmm15, %xmm12, %xmm15 6097; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm12 6098; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm9 ^ (ymm12 & (ymm1 ^ ymm9)) 6099; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm10[2],ymm12[3,4,5],ymm10[6],ymm12[7,8,9],ymm10[10],ymm12[11,12,13],ymm10[14],ymm12[15] 6100; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] 6101; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 | (ymm15 & ~mem) 6102; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm8 ^ (ymm16 & (ymm12 ^ ymm8)) 6103; AVX512DQ-NEXT: vmovdqa64 %ymm12, %ymm19 6104; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm8 6105; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm7 ^ (ymm8 & (ymm6 ^ ymm7)) 6106; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u,u,u] 6107; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm8 6108; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u,u,u,u,u] 6109; AVX512DQ-NEXT: vpor %xmm15, %xmm8, %xmm8 6110; AVX512DQ-NEXT: vmovdqa %ymm13, %ymm15 6111; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm15 = ymm9 ^ (ymm15 & (ymm1 ^ ymm9)) 6112; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0,1,2],ymm10[3],ymm15[4,5],ymm10[6],ymm15[7,8,9,10],ymm10[11],ymm15[12,13],ymm10[14],ymm15[15] 6113; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 6114; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm17 = [18446744073709551615,255,18446744073709486080,18446744073709551615] 6115; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm8 & ymm17) 6116; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = ymm3 ^ (ymm14 & (ymm2 ^ ymm3)) 6117; AVX512DQ-NEXT: vextracti128 $1, %ymm14, %xmm8 6118; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[1,8,15,u,u,u,u] 6119; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u] 6120; AVX512DQ-NEXT: vpor %xmm8, %xmm14, %xmm8 6121; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 6122; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm12 6123; AVX512DQ-NEXT: vpshufb %xmm12, %xmm5, %xmm14 6124; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] 6125; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3] 6126; AVX512DQ-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 6127; AVX512DQ-NEXT: vpblendd 
{{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm12[7] 6128; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0],ymm8[1,2,3,4,5,6,7],ymm15[8],ymm8[9,10,11,12,13,14,15] 6129; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7] 6130; AVX512DQ-NEXT: vmovdqa64 %ymm8, %ymm20 6131; AVX512DQ-NEXT: vmovdqa %ymm13, %ymm8 6132; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm3 ^ (ymm8 & (ymm2 ^ ymm3)) 6133; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm12 6134; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u] 6135; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,1,8,15],zero,zero,xmm8[4,11],zero,zero,xmm8[u,u,u,u,u] 6136; AVX512DQ-NEXT: vpor %xmm12, %xmm8, %xmm8 6137; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 6138; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,12] 6139; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero 6140; AVX512DQ-NEXT: vpor %xmm12, %xmm14, %xmm12 6141; AVX512DQ-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 6142; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215] 6143; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm16 & (ymm12 ^ ymm8)) 6144; AVX512DQ-NEXT: vmovdqa %ymm11, %ymm8 6145; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm7 ^ (ymm8 & (ymm6 ^ ymm7)) 6146; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u,u,u] 6147; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm8 6148; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u,u,u] 6149; AVX512DQ-NEXT: vpor %xmm14, %xmm8, %xmm8 6150; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm14 6151; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = ymm9 ^ (ymm14 & (ymm1 ^ ymm9)) 6152; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm10[0],ymm14[1,2],ymm10[3],ymm14[4,5,6],ymm10[7,8],ymm14[9,10],ymm10[11],ymm14[12,13,14],ymm10[15] 6153; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 6154; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm8 & ymm17) 6155; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm14[0],ymm12[1,2,3,4,5,6,7],ymm14[8],ymm12[9,10,11,12,13,14,15] 6156; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] 6157; AVX512DQ-NEXT: vmovdqa64 %ymm8, %ymm21 6158; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm8 6159; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm3 ^ (ymm8 & (ymm2 ^ ymm3)) 6160; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u] 6161; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm8 6162; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u,u,u] 6163; AVX512DQ-NEXT: vpor %xmm12, %xmm8, %xmm8 6164; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 6165; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[6,13] 6166; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero 6167; AVX512DQ-NEXT: vpor %xmm12, %xmm14, %xmm12 6168; AVX512DQ-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 6169; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm16 & (ymm12 ^ ymm8)) 6170; AVX512DQ-NEXT: vmovdqa %ymm13, %ymm8 6171; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm6 ^ (ymm8 & (ymm7 ^ ymm6)) 6172; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm14 6173; AVX512DQ-NEXT: 
vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12,u,u,u,u,u,u,u] 6174; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u] 6175; AVX512DQ-NEXT: vpor %xmm14, %xmm8, %xmm8 6176; AVX512DQ-NEXT: vmovdqa %ymm11, %ymm14 6177; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = ymm9 ^ (ymm14 & (ymm1 ^ ymm9)) 6178; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm10[0],ymm14[1,2,3],ymm10[4],ymm14[5,6],ymm10[7,8],ymm14[9,10,11],ymm10[12],ymm14[13,14],ymm10[15] 6179; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 6180; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm8 & ymm17) 6181; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm14[0],ymm12[1,2,3,4,5,6,7],ymm14[8],ymm12[9,10,11,12,13,14,15] 6182; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm8[4,5,6,7] 6183; AVX512DQ-NEXT: vmovdqa %ymm11, %ymm8 6184; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm3 ^ (ymm8 & (ymm2 ^ ymm3)) 6185; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u] 6186; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm8 6187; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u] 6188; AVX512DQ-NEXT: vpor %xmm12, %xmm8, %xmm8 6189; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 6190; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero 6191; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,7,14] 6192; AVX512DQ-NEXT: vpor %xmm12, %xmm15, %xmm12 6193; AVX512DQ-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 6194; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm16 & (ymm12 ^ ymm8)) 6195; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm8 6196; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm6 ^ (ymm8 & (ymm7 ^ ymm6)) 6197; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm15 6198; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u] 6199; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[5,12],zero,zero,xmm8[1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u] 6200; AVX512DQ-NEXT: vpor %xmm15, %xmm8, %xmm8 6201; AVX512DQ-NEXT: vmovdqa %ymm13, %ymm15 6202; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm15 = ymm1 ^ (ymm15 & (ymm9 ^ ymm1)) 6203; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0],ymm10[1],ymm15[2,3],ymm10[4],ymm15[5,6,7,8],ymm10[9],ymm15[10,11],ymm10[12],ymm15[13,14,15] 6204; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 6205; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm8 & ymm17) 6206; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0],ymm12[1,2,3,4,5,6,7],ymm15[8],ymm12[9,10,11,12,13,14,15] 6207; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7] 6208; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm13 = ymm2 ^ (ymm13 & (ymm3 ^ ymm2)) 6209; AVX512DQ-NEXT: vextracti128 $1, %ymm13, %xmm2 6210; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u] 6211; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,4,11],zero,zero,xmm13[0,7,14],zero,zero,xmm13[u,u,u,u,u] 6212; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2 6213; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 6214; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero 
6215; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,8,15] 6216; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3 6217; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 6218; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm16 & (ymm3 ^ ymm2)) 6219; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm6 ^ (ymm11 & (ymm7 ^ ymm6)) 6220; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm11[6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u,u,u,u] 6221; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm4 6222; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[4,11],zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u] 6223; AVX512DQ-NEXT: vpor %xmm2, %xmm4, %xmm2 6224; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm1 ^ (ymm0 & (ymm9 ^ ymm1)) 6225; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7,8],ymm10[9],ymm0[10,11,12],ymm10[13],ymm0[14,15] 6226; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 6227; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm2 & ymm17) 6228; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm3[1,2,3,4,5,6,7],ymm0[8],ymm3[9,10,11,12,13,14,15] 6229; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6230; AVX512DQ-NEXT: vmovdqa64 %ymm18, (%rsi) 6231; AVX512DQ-NEXT: vmovdqa64 %ymm19, (%rdx) 6232; AVX512DQ-NEXT: vmovdqa64 %ymm20, (%rcx) 6233; AVX512DQ-NEXT: vmovdqa64 %ymm21, (%r8) 6234; AVX512DQ-NEXT: vmovdqa %ymm14, (%r9) 6235; AVX512DQ-NEXT: vmovdqa %ymm8, (%r10) 6236; AVX512DQ-NEXT: vmovdqa %ymm0, (%rax) 6237; AVX512DQ-NEXT: vzeroupper 6238; AVX512DQ-NEXT: retq 6239; 6240; AVX512DQ-FCP-LABEL: load_i8_stride7_vf32: 6241; AVX512DQ-FCP: # %bb.0: 6242; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 6243; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 6244; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] 6245; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 6246; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm3 6247; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm1 6248; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm2 ^ (ymm1 & (ymm3 ^ ymm2)) 6249; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4 6250; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u] 6251; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] 6252; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm1, %xmm1 6253; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 6254; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,2,4,6] 6255; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm12 6256; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm4, %ymm4 6257; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] 6258; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5,6],ymm4[7] 6259; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] 6260; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm4 6261; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 6262; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm1 6263; AVX512DQ-FCP-NEXT: vmovdqa %ymm13, %ymm7 6264; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm5 ^ (ymm7 & (ymm4 ^ ymm5)) 6265; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 6266; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = 
zero,zero,zero,xmm8[5,12],zero,zero,xmm8[1,8,15,u,u,u,u,u,u] 6267; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,7,14],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[u,u,u,u,u,u] 6268; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm10 6269; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] 6270; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm7 6271; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm11 6272; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm1 ^ (ymm11 & (ymm7 ^ ymm1)) 6273; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm8 6274; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm8[2],ymm11[3,4],ymm8[5],ymm11[6,7,8,9],ymm8[10],ymm11[11,12],ymm8[13],ymm11[14,15] 6275; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] 6276; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm10 & mem) 6277; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0] 6278; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm6 ^ (ymm16 & (ymm11 ^ ymm6)) 6279; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm11, %ymm18 6280; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm6 6281; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm2 ^ (ymm6 & (ymm3 ^ ymm2)) 6282; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[u,u,u,6,13],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[u,u,u,u] 6283; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 6284; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,xmm6[4,11],zero,zero,xmm6[0,7,14,u,u,u,u] 6285; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm10, %xmm6 6286; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 6287; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,1,3,4,6] 6288; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm10, %ymm10 6289; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] 6290; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7] 6291; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] 6292; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm10 6293; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm5 ^ (ymm10 & (ymm4 ^ ymm5)) 6294; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm14 6295; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[6,13],zero,zero,xmm14[2,9,u,u,u,u,u,u,u] 6296; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[1,8,15],zero,zero,xmm10[4,11],zero,zero,xmm10[u,u,u,u,u,u,u] 6297; AVX512DQ-FCP-NEXT: vpor %xmm14, %xmm10, %xmm14 6298; AVX512DQ-FCP-NEXT: vmovdqa %ymm13, %ymm10 6299; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm7 ^ (ymm10 & (ymm1 ^ ymm7)) 6300; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm8[2],ymm10[3,4,5],ymm8[6],ymm10[7,8,9],ymm8[10],ymm10[11,12,13],ymm8[14],ymm10[15] 6301; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] 6302; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm10 | (ymm14 & ~mem) 6303; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm6 ^ (ymm16 & (ymm10 ^ ymm6)) 6304; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm6 6305; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm5 ^ (ymm6 & (ymm4 ^ ymm5)) 6306; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[2,9],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[u,u,u,u,u,u,u] 6307; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, 
%xmm6 6308; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10,u,u,u,u,u,u,u] 6309; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm14, %xmm6 6310; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm14 6311; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm7 ^ (ymm14 & (ymm1 ^ ymm7)) 6312; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1,2],ymm8[3],ymm14[4,5],ymm8[6],ymm14[7,8,9,10],ymm8[11],ymm14[12,13],ymm8[14],ymm14[15] 6313; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 6314; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm17 = [18446744073709551615,255,18446744073709486080,18446744073709551615] 6315; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm6 & ymm17) 6316; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm3 ^ (ymm13 & (ymm2 ^ ymm3)) 6317; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm6 6318; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u] 6319; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,0,7,14],zero,zero,xmm13[3,10],zero,zero,zero,xmm13[u,u,u,u] 6320; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm13, %xmm6 6321; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 6322; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,0,0,0,1,3,5,6] 6323; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm13, %ymm12 6324; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] 6325; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm12[7] 6326; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm14[0],ymm6[1,2,3,4,5,6,7],ymm14[8],ymm6[9,10,11,12,13,14,15] 6327; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm6[4,5,6,7] 6328; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm19 6329; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm6 6330; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm3 ^ (ymm6 & (ymm2 ^ ymm3)) 6331; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm12 6332; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u] 6333; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u] 6334; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm12, %xmm6 6335; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 6336; AVX512DQ-FCP-NEXT: vmovdqa 208(%rdi), %xmm14 6337; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm14[5,12] 6338; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %xmm12 6339; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero 6340; AVX512DQ-FCP-NEXT: vpor %xmm13, %xmm15, %xmm13 6341; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 6342; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215] 6343; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm16 & (ymm13 ^ ymm6)) 6344; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm6 6345; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm5 ^ (ymm6 & (ymm4 ^ ymm5)) 6346; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[3,10],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[u,u,u,u,u,u,u] 6347; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 6348; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,11,u,u,u,u,u,u,u] 6349; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm15, %xmm6 6350; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, 
%ymm15 6351; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm7 ^ (ymm15 & (ymm1 ^ ymm7)) 6352; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm8[0],ymm15[1,2],ymm8[3],ymm15[4,5,6],ymm8[7,8],ymm15[9,10],ymm8[11],ymm15[12,13,14],ymm8[15] 6353; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 6354; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm6 & ymm17) 6355; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm15[0],ymm13[1,2,3,4,5,6,7],ymm15[8],ymm13[9,10,11,12,13,14,15] 6356; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm6[4,5,6,7] 6357; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm20 6358; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm6 6359; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm3 ^ (ymm6 & (ymm2 ^ ymm3)) 6360; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[u,u,2,9],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[u,u,u,u,u] 6361; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 6362; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10,u,u,u,u,u] 6363; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm13, %xmm6 6364; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 6365; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm14[6,13] 6366; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero 6367; AVX512DQ-FCP-NEXT: vpor %xmm13, %xmm15, %xmm13 6368; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 6369; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm16 & (ymm13 ^ ymm6)) 6370; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm6 6371; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm4 ^ (ymm6 & (ymm5 ^ ymm4)) 6372; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm15 6373; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u,u,u,u,u,u] 6374; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u,u,u] 6375; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm15, %xmm6 6376; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm15 6377; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm7 ^ (ymm15 & (ymm1 ^ ymm7)) 6378; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm8[0],ymm15[1,2,3],ymm8[4],ymm15[5,6],ymm8[7,8],ymm15[9,10,11],ymm8[12],ymm15[13,14],ymm8[15] 6379; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 6380; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm6 & ymm17) 6381; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm15[0],ymm13[1,2,3,4,5,6,7],ymm15[8],ymm13[9,10,11,12,13,14,15] 6382; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm6[4,5,6,7] 6383; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm21 6384; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm6 6385; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm3 ^ (ymm6 & (ymm2 ^ ymm3)) 6386; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[u,u,3,10],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[u,u,u,u,u] 6387; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 6388; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,11,u,u,u,u,u] 6389; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm15, %xmm6 6390; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 6391; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = 
xmm12[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero 6392; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm14[0,7,14] 6393; AVX512DQ-FCP-NEXT: vpor %xmm15, %xmm13, %xmm13 6394; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 6395; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm16 & (ymm13 ^ ymm6)) 6396; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm6 6397; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm4 ^ (ymm6 & (ymm5 ^ ymm4)) 6398; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm15 6399; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u] 6400; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[5,12],zero,zero,xmm6[1,8,15],zero,zero,xmm6[u,u,u,u,u,u,u] 6401; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm15, %xmm6 6402; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm15 6403; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm1 ^ (ymm15 & (ymm7 ^ ymm1)) 6404; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0],ymm8[1],ymm15[2,3],ymm8[4],ymm15[5,6,7,8],ymm8[9],ymm15[10,11],ymm8[12],ymm15[13,14,15] 6405; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 6406; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm6 & ymm17) 6407; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm15[0],ymm13[1,2,3,4,5,6,7],ymm15[8],ymm13[9,10,11,12,13,14,15] 6408; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm6[4,5,6,7] 6409; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm2 ^ (ymm11 & (ymm3 ^ ymm2)) 6410; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm2 6411; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u] 6412; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[u,u,4,11],zero,zero,xmm11[0,7,14],zero,zero,xmm11[u,u,u,u,u] 6413; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 6414; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 6415; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero 6416; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm14[1,8,15] 6417; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm11, %xmm3 6418; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 6419; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm16 & (ymm3 ^ ymm2)) 6420; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm4 ^ (ymm9 & (ymm5 ^ ymm4)) 6421; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[6,13],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u] 6422; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm4 6423; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[4,11],zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u] 6424; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm4, %xmm2 6425; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm1 ^ (ymm0 & (ymm7 ^ ymm1)) 6426; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7,8],ymm8[9],ymm0[10,11,12],ymm8[13],ymm0[14,15] 6427; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 6428; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm2 & ymm17) 6429; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm3[1,2,3,4,5,6,7],ymm0[8],ymm3[9,10,11,12,13,14,15] 6430; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6431; AVX512DQ-FCP-NEXT: 
vmovdqa64 %ymm18, (%rsi) 6432; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, (%rdx) 6433; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, (%rcx) 6434; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, (%r8) 6435; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, (%r9) 6436; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%r10) 6437; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rax) 6438; AVX512DQ-FCP-NEXT: vzeroupper 6439; AVX512DQ-FCP-NEXT: retq 6440; 6441; AVX512BW-LABEL: load_i8_stride7_vf32: 6442; AVX512BW: # %bb.0: 6443; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 6444; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 6445; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [16,9,2,19,20,13,6,23,24,0,26,27,28,0,30,31] 6446; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 6447; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 6448; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,9,2,19,12,5,22,23,24,0,26,27,0,29,30,31] 6449; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm4 6450; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [8,1,2,19,12,5,22,15,0,9,26,11,0,29,14,0] 6451; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm5 6452; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [8,1,18,11,4,5,22,15,0,25,10,0,12,29,14,0] 6453; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm11 6454; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,1,18,11,4,21,14,7,8,25,10,0,28,13,0,15] 6455; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm12 6456; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,17,10,3,4,21,14,7,24,9,0,11,28,13,0,31] 6457; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm10 6458; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,17,10,3,20,13,6,23,24,25,0,27,28,0,30,31] 6459; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm6 6460; AVX512BW-NEXT: vmovdqa (%rdi), %ymm3 6461; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2 6462; AVX512BW-NEXT: movw $-28382, %r11w # imm = 0x9122 6463; AVX512BW-NEXT: kmovd %r11d, %k5 6464; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm1 {%k5} 6465; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm7 6466; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u,u,u] 6467; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u] 6468; AVX512BW-NEXT: vpor %xmm7, %xmm1, %xmm1 6469; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] 6470; AVX512BW-NEXT: movw $992, %r11w # imm = 0x3E0 6471; AVX512BW-NEXT: kmovd %r11d, %k1 6472; AVX512BW-NEXT: vmovdqu16 %ymm6, %ymm1 {%k1} 6473; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm7 6474; AVX512BW-NEXT: vmovdqa 160(%rdi), %ymm6 6475; AVX512BW-NEXT: movw $8772, %r11w # imm = 0x2244 6476; AVX512BW-NEXT: kmovd %r11d, %k1 6477; AVX512BW-NEXT: vpblendmw %ymm7, %ymm6, %ymm8 {%k1} 6478; AVX512BW-NEXT: vextracti128 $1, %ymm8, %xmm9 6479; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u],zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u,u,u] 6480; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,5,12],zero,zero,xmm8[1,8,15],zero,zero,xmm8[u,u,u,u] 6481; AVX512BW-NEXT: vpor %xmm9, %xmm8, %xmm8 6482; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm13 6483; AVX512BW-NEXT: vmovdqa 192(%rdi), %xmm8 6484; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] 6485; AVX512BW-NEXT: vpshufb %xmm14, %xmm8, %xmm15 6486; AVX512BW-NEXT: vmovdqa 208(%rdi), %xmm9 6487; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm9[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] 6488; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm16[0],xmm15[1],xmm16[1],xmm15[2],xmm16[2],xmm15[3],xmm16[3] 6489; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 6490; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = 
ymm13[0,1,2,3,4,5,6],ymm15[7] 6491; AVX512BW-NEXT: movl $-524288, %edi # imm = 0xFFF80000 6492; AVX512BW-NEXT: kmovd %edi, %k4 6493; AVX512BW-NEXT: vmovdqu8 %ymm13, %ymm1 {%k4} 6494; AVX512BW-NEXT: movw $4644, %di # imm = 0x1224 6495; AVX512BW-NEXT: kmovd %edi, %k2 6496; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k2} 6497; AVX512BW-NEXT: vextracti128 $1, %ymm13, %xmm15 6498; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u,u,u] 6499; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u,u,u] 6500; AVX512BW-NEXT: vpor %xmm15, %xmm13, %xmm13 6501; AVX512BW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] 6502; AVX512BW-NEXT: movl $511, %edi # imm = 0x1FF 6503; AVX512BW-NEXT: kmovd %edi, %k3 6504; AVX512BW-NEXT: vmovdqu8 %ymm13, %ymm10 {%k3} 6505; AVX512BW-NEXT: movw $9288, %di # imm = 0x2448 6506; AVX512BW-NEXT: kmovd %edi, %k3 6507; AVX512BW-NEXT: vpblendmw %ymm7, %ymm6, %ymm13 {%k3} 6508; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[u,u,u,6,13],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[u,u,u,u] 6509; AVX512BW-NEXT: vextracti128 $1, %ymm13, %xmm13 6510; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u],zero,zero,xmm13[4,11],zero,zero,xmm13[0,7,14,u,u,u,u] 6511; AVX512BW-NEXT: vpor %xmm15, %xmm13, %xmm13 6512; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 6513; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm9[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] 6514; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm8[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] 6515; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3] 6516; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 6517; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7] 6518; AVX512BW-NEXT: vmovdqu8 %ymm13, %ymm10 {%k4} 6519; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k1} 6520; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u,u,u] 6521; AVX512BW-NEXT: vextracti128 $1, %ymm13, %xmm13 6522; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u,u,u] 6523; AVX512BW-NEXT: vpor %xmm15, %xmm13, %xmm13 6524; AVX512BW-NEXT: movl $261632, %edi # imm = 0x3FE00 6525; AVX512BW-NEXT: kmovd %edi, %k4 6526; AVX512BW-NEXT: vpshufb {{.*#+}} ymm13 {%k4} = ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 6527; AVX512BW-NEXT: vpblendmw %ymm6, %ymm7, %ymm12 {%k5} 6528; AVX512BW-NEXT: vextracti128 $1, %ymm12, %xmm15 6529; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[5,12],zero,zero,xmm15[1,8,15,u,u,u,u] 6530; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u] 6531; AVX512BW-NEXT: vpor %xmm15, %xmm12, %xmm12 6532; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 6533; AVX512BW-NEXT: vpshufb %xmm14, %xmm9, %xmm14 6534; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] 6535; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] 6536; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 6537; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm14[7] 6538; AVX512BW-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4,5,6,7],ymm13[8],ymm12[9,10,11,12,13,14,15] 6539; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = 
ymm13[0,1,2,3],ymm12[4,5,6,7] 6540; AVX512BW-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k2} 6541; AVX512BW-NEXT: vextracti128 $1, %ymm13, %xmm14 6542; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[2,9,u,u,u,u,u] 6543; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u] 6544; AVX512BW-NEXT: vpor %xmm14, %xmm13, %xmm13 6545; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 6546; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[5,12] 6547; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero 6548; AVX512BW-NEXT: vpor %xmm14, %xmm15, %xmm14 6549; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 6550; AVX512BW-NEXT: movl $-134217728, %edi # imm = 0xF8000000 6551; AVX512BW-NEXT: kmovd %edi, %k5 6552; AVX512BW-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5} 6553; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm14 {%k3} 6554; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u] 6555; AVX512BW-NEXT: vextracti128 $1, %ymm14, %xmm14 6556; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u] 6557; AVX512BW-NEXT: vpor %xmm15, %xmm14, %xmm14 6558; AVX512BW-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm11[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 6559; AVX512BW-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] 6560; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7] 6561; AVX512BW-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k1} 6562; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u] 6563; AVX512BW-NEXT: vextracti128 $1, %ymm13, %xmm13 6564; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u] 6565; AVX512BW-NEXT: vpor %xmm14, %xmm13, %xmm13 6566; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 6567; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[6,13] 6568; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero 6569; AVX512BW-NEXT: vpor %xmm14, %xmm15, %xmm14 6570; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 6571; AVX512BW-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5} 6572; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm14 {%k2} 6573; AVX512BW-NEXT: vextracti128 $1, %ymm14, %xmm15 6574; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u,u,u,u,u,u] 6575; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u,u,u] 6576; AVX512BW-NEXT: vpor %xmm15, %xmm14, %xmm14 6577; AVX512BW-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm5[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 6578; AVX512BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] 6579; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7] 6580; AVX512BW-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k3} 6581; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u] 6582; AVX512BW-NEXT: vextracti128 $1, %ymm13, %xmm13 6583; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u] 6584; AVX512BW-NEXT: vpor %xmm14, %xmm13, %xmm13 6585; AVX512BW-NEXT: vinserti128 $1, %xmm13, 
%ymm0, %ymm13 6586; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero 6587; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[0,7,14] 6588; AVX512BW-NEXT: vpor %xmm14, %xmm15, %xmm14 6589; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 6590; AVX512BW-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5} 6591; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm14 {%k1} 6592; AVX512BW-NEXT: vextracti128 $1, %ymm14, %xmm15 6593; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u] 6594; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u,u,u,u,u,u] 6595; AVX512BW-NEXT: vpor %xmm15, %xmm14, %xmm14 6596; AVX512BW-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm4[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 6597; AVX512BW-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] 6598; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm4[4,5,6,7] 6599; AVX512BW-NEXT: vmovdqu16 %ymm7, %ymm6 {%k2} 6600; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm7 6601; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u,u,u,u] 6602; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u] 6603; AVX512BW-NEXT: vpor %xmm7, %xmm6, %xmm6 6604; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 6605; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero 6606; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[1,8,15] 6607; AVX512BW-NEXT: vpor %xmm7, %xmm8, %xmm7 6608; AVX512BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 6609; AVX512BW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k5} 6610; AVX512BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k3} 6611; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u] 6612; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2 6613; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u] 6614; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 6615; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 {%k4} = ymm0[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 6616; AVX512BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm6[1,2,3,4,5,6,7],ymm2[8],ymm6[9,10,11,12,13,14,15] 6617; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] 6618; AVX512BW-NEXT: vmovdqa %ymm1, (%rsi) 6619; AVX512BW-NEXT: vmovdqa %ymm10, (%rdx) 6620; AVX512BW-NEXT: vmovdqa %ymm12, (%rcx) 6621; AVX512BW-NEXT: vmovdqa %ymm11, (%r8) 6622; AVX512BW-NEXT: vmovdqa %ymm5, (%r9) 6623; AVX512BW-NEXT: vmovdqa %ymm4, (%r10) 6624; AVX512BW-NEXT: vmovdqa %ymm0, (%rax) 6625; AVX512BW-NEXT: vzeroupper 6626; AVX512BW-NEXT: retq 6627; 6628; AVX512BW-FCP-LABEL: load_i8_stride7_vf32: 6629; AVX512BW-FCP: # %bb.0: 6630; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 6631; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 6632; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] 6633; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 6634; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 6635; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] 6636; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm4 6637; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] 6638; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm5 6639; 
AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7] 6640; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm9 6641; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15] 6642; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm10 6643; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31] 6644; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm8 6645; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31] 6646; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm6 6647; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm3 6648; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 6649; AVX512BW-FCP-NEXT: movw $-28382, %r11w # imm = 0x9122 6650; AVX512BW-FCP-NEXT: kmovd %r11d, %k5 6651; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm1 {%k5} 6652; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm7 6653; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u,u,u] 6654; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u] 6655; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm1, %xmm1 6656; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] 6657; AVX512BW-FCP-NEXT: movw $992, %r11w # imm = 0x3E0 6658; AVX512BW-FCP-NEXT: kmovd %r11d, %k1 6659; AVX512BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm1 {%k1} 6660; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm7 6661; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm6 6662; AVX512BW-FCP-NEXT: movw $8772, %r11w # imm = 0x2244 6663; AVX512BW-FCP-NEXT: kmovd %r11d, %k1 6664; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm6, %ymm11 {%k1} 6665; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 6666; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u] 6667; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,5,12],zero,zero,xmm11[1,8,15],zero,zero,xmm11[u,u,u,u] 6668; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 6669; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 6670; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,1,2,4,6] 6671; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm13 6672; AVX512BW-FCP-NEXT: vpermd %ymm13, %ymm12, %ymm12 6673; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] 6674; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7] 6675; AVX512BW-FCP-NEXT: movl $-524288, %r11d # imm = 0xFFF80000 6676; AVX512BW-FCP-NEXT: kmovd %r11d, %k4 6677; AVX512BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm1 {%k4} 6678; AVX512BW-FCP-NEXT: movw $4644, %r11w # imm = 0x1224 6679; AVX512BW-FCP-NEXT: kmovd %r11d, %k2 6680; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm11 {%k2} 6681; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 6682; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u,u,u] 6683; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[1,8,15],zero,zero,xmm11[4,11],zero,zero,xmm11[u,u,u,u,u,u,u] 6684; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 6685; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] 6686; AVX512BW-FCP-NEXT: movl $511, %r11d # imm = 0x1FF 6687; AVX512BW-FCP-NEXT: kmovd %r11d, %k3 6688; AVX512BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm8 {%k3} 6689; AVX512BW-FCP-NEXT: movw $9288, %r11w # imm = 0x2448 6690; AVX512BW-FCP-NEXT: kmovd %r11d, %k3 6691; 
AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm6, %ymm11 {%k3} 6692; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u] 6693; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11 6694; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u],zero,zero,xmm11[4,11],zero,zero,xmm11[0,7,14,u,u,u,u] 6695; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 6696; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 6697; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,1,3,4,6] 6698; AVX512BW-FCP-NEXT: vpermd %ymm13, %ymm12, %ymm12 6699; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] 6700; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7] 6701; AVX512BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm8 {%k4} 6702; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm11 {%k1} 6703; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[2,9],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[u,u,u,u,u,u,u] 6704; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11 6705; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[0,7,14],zero,zero,xmm11[3,10,u,u,u,u,u,u,u] 6706; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 6707; AVX512BW-FCP-NEXT: movl $261632, %r11d # imm = 0x3FE00 6708; AVX512BW-FCP-NEXT: kmovd %r11d, %k4 6709; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm11 {%k4} = ymm10[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 6710; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm7, %ymm10 {%k5} 6711; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm12 6712; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[1,8,15,u,u,u,u] 6713; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,0,7,14],zero,zero,xmm10[3,10],zero,zero,zero,xmm10[u,u,u,u] 6714; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm10, %xmm10 6715; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 6716; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,1,3,5,6] 6717; AVX512BW-FCP-NEXT: vpermd %ymm13, %ymm12, %ymm12 6718; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] 6719; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm12[7] 6720; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1,2,3,4,5,6,7],ymm11[8],ymm10[9,10,11,12,13,14,15] 6721; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] 6722; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm7, %ymm11 {%k2} 6723; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 6724; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u] 6725; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,1,8,15],zero,zero,xmm11[4,11],zero,zero,xmm11[u,u,u,u,u] 6726; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 6727; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm13 6728; AVX512BW-FCP-NEXT: vmovdqa 208(%rdi), %xmm11 6729; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[5,12] 6730; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm12 6731; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero 6732; AVX512BW-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14 6733; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 6734; AVX512BW-FCP-NEXT: movl $-134217728, %edi # imm = 0xF8000000 6735; AVX512BW-FCP-NEXT: kmovd %edi, %k5 6736; AVX512BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5} 6737; 
AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm14 {%k3} 6738; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u] 6739; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 6740; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u] 6741; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 6742; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 6743; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] 6744; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7] 6745; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k1} 6746; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u] 6747; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13 6748; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u] 6749; AVX512BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm13 6750; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 6751; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[6,13] 6752; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero 6753; AVX512BW-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14 6754; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 6755; AVX512BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5} 6756; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm14 {%k2} 6757; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 6758; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u,u,u,u,u,u] 6759; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u,u,u] 6760; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 6761; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm5[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 6762; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] 6763; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7] 6764; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k3} 6765; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u] 6766; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13 6767; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u] 6768; AVX512BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm13 6769; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 6770; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero 6771; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm11[0,7,14] 6772; AVX512BW-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14 6773; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 6774; AVX512BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5} 6775; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm14 {%k1} 6776; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 6777; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u] 6778; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u,u,u,u,u,u] 6779; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 6780; 
AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm4[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 6781; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] 6782; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm4[4,5,6,7] 6783; AVX512BW-FCP-NEXT: vmovdqu16 %ymm7, %ymm6 {%k2} 6784; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 6785; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u,u,u,u] 6786; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u] 6787; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 6788; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 6789; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm12[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero 6790; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm11[1,8,15] 6791; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm11, %xmm7 6792; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 6793; AVX512BW-FCP-NEXT: vmovdqu8 %ymm7, %ymm6 {%k5} 6794; AVX512BW-FCP-NEXT: vmovdqu16 %ymm3, %ymm2 {%k3} 6795; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u] 6796; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 6797; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u] 6798; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 6799; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k4} = ymm0[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 6800; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm6[1,2,3,4,5,6,7],ymm2[8],ymm6[9,10,11,12,13,14,15] 6801; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] 6802; AVX512BW-FCP-NEXT: vmovdqa %ymm1, (%rsi) 6803; AVX512BW-FCP-NEXT: vmovdqa %ymm8, (%rdx) 6804; AVX512BW-FCP-NEXT: vmovdqa %ymm10, (%rcx) 6805; AVX512BW-FCP-NEXT: vmovdqa %ymm9, (%r8) 6806; AVX512BW-FCP-NEXT: vmovdqa %ymm5, (%r9) 6807; AVX512BW-FCP-NEXT: vmovdqa %ymm4, (%r10) 6808; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rax) 6809; AVX512BW-FCP-NEXT: vzeroupper 6810; AVX512BW-FCP-NEXT: retq 6811; 6812; AVX512DQ-BW-LABEL: load_i8_stride7_vf32: 6813; AVX512DQ-BW: # %bb.0: 6814; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 6815; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 6816; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [16,9,2,19,20,13,6,23,24,0,26,27,28,0,30,31] 6817; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 6818; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 6819; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,9,2,19,12,5,22,23,24,0,26,27,0,29,30,31] 6820; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm4 6821; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [8,1,2,19,12,5,22,15,0,9,26,11,0,29,14,0] 6822; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm5 6823; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [8,1,18,11,4,5,22,15,0,25,10,0,12,29,14,0] 6824; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm11 6825; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,1,18,11,4,21,14,7,8,25,10,0,28,13,0,15] 6826; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm12 6827; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,17,10,3,4,21,14,7,24,9,0,11,28,13,0,31] 6828; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm10 6829; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,17,10,3,20,13,6,23,24,25,0,27,28,0,30,31] 6830; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm6 6831; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm3 6832; AVX512DQ-BW-NEXT: 
vmovdqa 32(%rdi), %ymm2 6833; AVX512DQ-BW-NEXT: movw $-28382, %r11w # imm = 0x9122 6834; AVX512DQ-BW-NEXT: kmovd %r11d, %k5 6835; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm1 {%k5} 6836; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm1, %xmm7 6837; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u,u,u] 6838; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u] 6839; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm1, %xmm1 6840; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] 6841; AVX512DQ-BW-NEXT: movw $992, %r11w # imm = 0x3E0 6842; AVX512DQ-BW-NEXT: kmovd %r11d, %k1 6843; AVX512DQ-BW-NEXT: vmovdqu16 %ymm6, %ymm1 {%k1} 6844; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm7 6845; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %ymm6 6846; AVX512DQ-BW-NEXT: movw $8772, %r11w # imm = 0x2244 6847; AVX512DQ-BW-NEXT: kmovd %r11d, %k1 6848; AVX512DQ-BW-NEXT: vpblendmw %ymm7, %ymm6, %ymm8 {%k1} 6849; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm8, %xmm9 6850; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u],zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u,u,u] 6851; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,5,12],zero,zero,xmm8[1,8,15],zero,zero,xmm8[u,u,u,u] 6852; AVX512DQ-BW-NEXT: vpor %xmm9, %xmm8, %xmm8 6853; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm13 6854; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %xmm8 6855; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] 6856; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm8, %xmm15 6857; AVX512DQ-BW-NEXT: vmovdqa 208(%rdi), %xmm9 6858; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm9[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] 6859; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm16[0],xmm15[1],xmm16[1],xmm15[2],xmm16[2],xmm15[3],xmm16[3] 6860; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 6861; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7] 6862; AVX512DQ-BW-NEXT: movl $-524288, %edi # imm = 0xFFF80000 6863; AVX512DQ-BW-NEXT: kmovd %edi, %k4 6864; AVX512DQ-BW-NEXT: vmovdqu8 %ymm13, %ymm1 {%k4} 6865; AVX512DQ-BW-NEXT: movw $4644, %di # imm = 0x1224 6866; AVX512DQ-BW-NEXT: kmovd %edi, %k2 6867; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k2} 6868; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm13, %xmm15 6869; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u,u,u] 6870; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u,u,u] 6871; AVX512DQ-BW-NEXT: vpor %xmm15, %xmm13, %xmm13 6872; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] 6873; AVX512DQ-BW-NEXT: movl $511, %edi # imm = 0x1FF 6874; AVX512DQ-BW-NEXT: kmovd %edi, %k3 6875; AVX512DQ-BW-NEXT: vmovdqu8 %ymm13, %ymm10 {%k3} 6876; AVX512DQ-BW-NEXT: movw $9288, %di # imm = 0x2448 6877; AVX512DQ-BW-NEXT: kmovd %edi, %k3 6878; AVX512DQ-BW-NEXT: vpblendmw %ymm7, %ymm6, %ymm13 {%k3} 6879; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[u,u,u,6,13],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[u,u,u,u] 6880; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm13, %xmm13 6881; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u],zero,zero,xmm13[4,11],zero,zero,xmm13[0,7,14,u,u,u,u] 6882; AVX512DQ-BW-NEXT: vpor %xmm15, %xmm13, %xmm13 6883; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 6884; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} 
xmm15 = xmm9[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] 6885; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm8[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] 6886; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3] 6887; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 6888; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7] 6889; AVX512DQ-BW-NEXT: vmovdqu8 %ymm13, %ymm10 {%k4} 6890; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k1} 6891; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u,u,u] 6892; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm13, %xmm13 6893; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u,u,u] 6894; AVX512DQ-BW-NEXT: vpor %xmm15, %xmm13, %xmm13 6895; AVX512DQ-BW-NEXT: movl $261632, %edi # imm = 0x3FE00 6896; AVX512DQ-BW-NEXT: kmovd %edi, %k4 6897; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm13 {%k4} = ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 6898; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm7, %ymm12 {%k5} 6899; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm12, %xmm15 6900; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[5,12],zero,zero,xmm15[1,8,15,u,u,u,u] 6901; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u] 6902; AVX512DQ-BW-NEXT: vpor %xmm15, %xmm12, %xmm12 6903; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 6904; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm9, %xmm14 6905; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] 6906; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] 6907; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 6908; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm14[7] 6909; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4,5,6,7],ymm13[8],ymm12[9,10,11,12,13,14,15] 6910; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] 6911; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k2} 6912; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm13, %xmm14 6913; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[2,9,u,u,u,u,u] 6914; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u] 6915; AVX512DQ-BW-NEXT: vpor %xmm14, %xmm13, %xmm13 6916; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 6917; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[5,12] 6918; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero 6919; AVX512DQ-BW-NEXT: vpor %xmm14, %xmm15, %xmm14 6920; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 6921; AVX512DQ-BW-NEXT: movl $-134217728, %edi # imm = 0xF8000000 6922; AVX512DQ-BW-NEXT: kmovd %edi, %k5 6923; AVX512DQ-BW-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5} 6924; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm14 {%k3} 6925; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u] 6926; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm14, %xmm14 6927; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u] 6928; AVX512DQ-BW-NEXT: vpor %xmm15, %xmm14, %xmm14 6929; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = 
ymm11[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 6930; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] 6931; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7] 6932; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k1} 6933; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u] 6934; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm13, %xmm13 6935; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u] 6936; AVX512DQ-BW-NEXT: vpor %xmm14, %xmm13, %xmm13 6937; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 6938; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[6,13] 6939; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero 6940; AVX512DQ-BW-NEXT: vpor %xmm14, %xmm15, %xmm14 6941; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 6942; AVX512DQ-BW-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5} 6943; AVX512DQ-BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm14 {%k2} 6944; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm14, %xmm15 6945; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u,u,u,u,u,u] 6946; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u,u,u] 6947; AVX512DQ-BW-NEXT: vpor %xmm15, %xmm14, %xmm14 6948; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm5[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 6949; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] 6950; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7] 6951; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k3} 6952; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u] 6953; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm13, %xmm13 6954; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u] 6955; AVX512DQ-BW-NEXT: vpor %xmm14, %xmm13, %xmm13 6956; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 6957; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero 6958; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[0,7,14] 6959; AVX512DQ-BW-NEXT: vpor %xmm14, %xmm15, %xmm14 6960; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 6961; AVX512DQ-BW-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5} 6962; AVX512DQ-BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm14 {%k1} 6963; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm14, %xmm15 6964; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u] 6965; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u,u,u,u,u,u] 6966; AVX512DQ-BW-NEXT: vpor %xmm15, %xmm14, %xmm14 6967; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm4[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 6968; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] 6969; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm4[4,5,6,7] 6970; AVX512DQ-BW-NEXT: vmovdqu16 %ymm7, %ymm6 {%k2} 6971; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm6, %xmm7 6972; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = 
xmm7[u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u,u,u,u] 6973; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u] 6974; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm6, %xmm6 6975; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 6976; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero 6977; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[1,8,15] 6978; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm8, %xmm7 6979; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 6980; AVX512DQ-BW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k5} 6981; AVX512DQ-BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k3} 6982; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u] 6983; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm2 6984; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u] 6985; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 6986; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 {%k4} = ymm0[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 6987; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm6[1,2,3,4,5,6,7],ymm2[8],ymm6[9,10,11,12,13,14,15] 6988; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] 6989; AVX512DQ-BW-NEXT: vmovdqa %ymm1, (%rsi) 6990; AVX512DQ-BW-NEXT: vmovdqa %ymm10, (%rdx) 6991; AVX512DQ-BW-NEXT: vmovdqa %ymm12, (%rcx) 6992; AVX512DQ-BW-NEXT: vmovdqa %ymm11, (%r8) 6993; AVX512DQ-BW-NEXT: vmovdqa %ymm5, (%r9) 6994; AVX512DQ-BW-NEXT: vmovdqa %ymm4, (%r10) 6995; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rax) 6996; AVX512DQ-BW-NEXT: vzeroupper 6997; AVX512DQ-BW-NEXT: retq 6998; 6999; AVX512DQ-BW-FCP-LABEL: load_i8_stride7_vf32: 7000; AVX512DQ-BW-FCP: # %bb.0: 7001; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 7002; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 7003; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] 7004; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 7005; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 7006; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] 7007; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm4 7008; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] 7009; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm5 7010; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7] 7011; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm9 7012; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15] 7013; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm10 7014; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31] 7015; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm8 7016; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31] 7017; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm6 7018; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm3 7019; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 7020; AVX512DQ-BW-FCP-NEXT: movw $-28382, %r11w # imm = 0x9122 7021; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k5 7022; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm1 {%k5} 7023; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm7 7024; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u,u,u] 7025; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u] 7026; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm1, %xmm1 7027; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] 7028; AVX512DQ-BW-FCP-NEXT: movw $992, %r11w # imm = 0x3E0 7029; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k1 7030; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm1 {%k1} 7031; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm7 7032; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm6 7033; AVX512DQ-BW-FCP-NEXT: movw $8772, %r11w # imm = 0x2244 7034; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k1 7035; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm6, %ymm11 {%k1} 7036; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 7037; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u] 7038; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,5,12],zero,zero,xmm11[1,8,15],zero,zero,xmm11[u,u,u,u] 7039; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 7040; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 7041; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,1,2,4,6] 7042; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm13 7043; AVX512DQ-BW-FCP-NEXT: vpermd %ymm13, %ymm12, %ymm12 7044; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] 7045; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7] 7046; AVX512DQ-BW-FCP-NEXT: movl $-524288, %r11d # imm = 0xFFF80000 7047; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k4 7048; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm1 {%k4} 7049; AVX512DQ-BW-FCP-NEXT: movw $4644, %r11w # imm = 0x1224 7050; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k2 7051; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm11 {%k2} 7052; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 7053; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u,u,u] 7054; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[1,8,15],zero,zero,xmm11[4,11],zero,zero,xmm11[u,u,u,u,u,u,u] 7055; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 7056; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] 7057; AVX512DQ-BW-FCP-NEXT: movl $511, %r11d # imm = 0x1FF 7058; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k3 7059; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm8 {%k3} 7060; AVX512DQ-BW-FCP-NEXT: movw $9288, %r11w # imm = 0x2448 7061; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k3 7062; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm6, %ymm11 {%k3} 7063; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u] 7064; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11 7065; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u],zero,zero,xmm11[4,11],zero,zero,xmm11[0,7,14,u,u,u,u] 7066; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 7067; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 7068; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,1,3,4,6] 7069; AVX512DQ-BW-FCP-NEXT: vpermd %ymm13, %ymm12, %ymm12 7070; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] 7071; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7] 7072; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm8 {%k4} 7073; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm11 {%k1} 7074; 
AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[2,9],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[u,u,u,u,u,u,u] 7075; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11 7076; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[0,7,14],zero,zero,xmm11[3,10,u,u,u,u,u,u,u] 7077; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 7078; AVX512DQ-BW-FCP-NEXT: movl $261632, %r11d # imm = 0x3FE00 7079; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k4 7080; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm11 {%k4} = ymm10[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 7081; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm7, %ymm10 {%k5} 7082; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm12 7083; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[1,8,15,u,u,u,u] 7084; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,0,7,14],zero,zero,xmm10[3,10],zero,zero,zero,xmm10[u,u,u,u] 7085; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm10, %xmm10 7086; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 7087; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,1,3,5,6] 7088; AVX512DQ-BW-FCP-NEXT: vpermd %ymm13, %ymm12, %ymm12 7089; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] 7090; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm12[7] 7091; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1,2,3,4,5,6,7],ymm11[8],ymm10[9,10,11,12,13,14,15] 7092; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] 7093; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm7, %ymm11 {%k2} 7094; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 7095; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u] 7096; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,1,8,15],zero,zero,xmm11[4,11],zero,zero,xmm11[u,u,u,u,u] 7097; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 7098; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm13 7099; AVX512DQ-BW-FCP-NEXT: vmovdqa 208(%rdi), %xmm11 7100; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[5,12] 7101; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm12 7102; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero 7103; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14 7104; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 7105; AVX512DQ-BW-FCP-NEXT: movl $-134217728, %edi # imm = 0xF8000000 7106; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k5 7107; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5} 7108; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm14 {%k3} 7109; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u] 7110; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 7111; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u] 7112; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 7113; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 7114; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] 7115; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7] 7116; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k1} 7117; 
AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u] 7118; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13 7119; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u] 7120; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm13 7121; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 7122; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[6,13] 7123; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero 7124; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14 7125; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 7126; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5} 7127; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm14 {%k2} 7128; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 7129; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u,u,u,u,u,u] 7130; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u,u,u] 7131; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 7132; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm5[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 7133; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] 7134; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7] 7135; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k3} 7136; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u] 7137; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13 7138; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u] 7139; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm13 7140; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 7141; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero 7142; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm11[0,7,14] 7143; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14 7144; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 7145; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5} 7146; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm14 {%k1} 7147; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 7148; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u] 7149; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u,u,u,u,u,u] 7150; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 7151; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm4[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 7152; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] 7153; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm4[4,5,6,7] 7154; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm7, %ymm6 {%k2} 7155; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 7156; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u,u,u,u] 7157; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u] 7158; 
; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm12[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm11[1,8,15]
; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm11, %xmm7
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm7, %ymm6 {%k5}
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm3, %ymm2 {%k3}
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k4} = ymm0[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm6[1,2,3,4,5,6,7],ymm2[8],ymm6[9,10,11,12,13,14,15]
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm1, (%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm8, (%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm10, (%rcx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm9, (%r8)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm5, (%r9)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, (%r10)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
  %wide.vec = load <224 x i8>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <224 x i8> %wide.vec, <224 x i8> poison, <32 x i32> <i32 0, i32 7, i32 14, i32 21, i32 28, i32 35, i32 42, i32 49, i32 56, i32 63, i32 70, i32 77, i32 84, i32 91, i32 98, i32 105, i32 112, i32 119, i32 126, i32 133, i32 140, i32 147, i32 154, i32 161, i32 168, i32 175, i32 182, i32 189, i32 196, i32 203, i32 210, i32 217>
  %strided.vec1 = shufflevector <224 x i8> %wide.vec, <224 x i8> poison, <32 x i32> <i32 1, i32 8, i32 15, i32 22, i32 29, i32 36, i32 43, i32 50, i32 57, i32 64, i32 71, i32 78, i32 85, i32 92, i32 99, i32 106, i32 113, i32 120, i32 127, i32 134, i32 141, i32 148, i32 155, i32 162, i32 169, i32 176, i32 183, i32 190, i32 197, i32 204, i32 211, i32 218>
  %strided.vec2 = shufflevector <224 x i8> %wide.vec, <224 x i8> poison, <32 x i32> <i32 2, i32 9, i32 16, i32 23, i32 30, i32 37, i32 44, i32 51, i32 58, i32 65, i32 72, i32 79, i32 86, i32 93, i32 100, i32 107, i32 114, i32 121, i32 128, i32 135, i32 142, i32 149, i32 156, i32 163, i32 170, i32 177, i32 184, i32 191, i32 198, i32 205, i32 212, i32 219>
  %strided.vec3 = shufflevector <224 x i8> %wide.vec, <224 x i8> poison, <32 x i32> <i32 3, i32 10, i32 17, i32 24, i32 31, i32 38, i32 45, i32 52, i32 59, i32 66, i32 73, i32 80, i32 87, i32 94, i32 101, i32 108, i32 115, i32 122, i32 129, i32 136, i32 143, i32 150, i32 157, i32 164, i32 171, i32 178, i32 185, i32 192, i32 199, i32 206, i32 213, i32 220>
  %strided.vec4 = shufflevector <224 x i8> %wide.vec, <224 x i8> poison, <32 x i32> <i32 4, i32 11, i32 18, i32 25, i32 32, i32 39, i32 46, i32 53, i32 60, i32 67, i32 74, i32 81, i32 88, i32 95, i32 102, i32 109, i32 116, i32 123, i32 130, i32 137, i32 144, i32 151, i32 158, i32 165, i32 172, i32 179, i32 186, i32 193, i32 200, i32 207, i32 214, i32 221>
  %strided.vec5 = shufflevector <224 x i8> %wide.vec, <224 x i8> poison, <32 x i32> <i32 5, i32 12, i32 19, i32 26, i32 33, i32 40, i32 47, i32 54, i32 61, i32 68, i32 75, i32 82, i32 89, i32 96, i32 103, i32 110, i32 117, i32 124, i32 131, i32 138, i32 145, i32 152, i32 159, i32 166, i32 173, i32 180, i32 187, i32 194, i32 201, i32 208, i32 215, i32 222>
  %strided.vec6 = shufflevector <224 x i8> %wide.vec, <224 x i8> poison, <32 x i32> <i32 6, i32 13, i32 20, i32 27, i32 34, i32 41, i32 48, i32 55, i32 62, i32 69, i32 76, i32 83, i32 90, i32 97, i32 104, i32 111, i32 118, i32 125, i32 132, i32 139, i32 146, i32 153, i32 160, i32 167, i32 174, i32 181, i32 188, i32 195, i32 202, i32 209, i32 216, i32 223>
  store <32 x i8> %strided.vec0, ptr %out.vec0, align 64
  store <32 x i8> %strided.vec1, ptr %out.vec1, align 64
  store <32 x i8> %strided.vec2, ptr %out.vec2, align 64
  store <32 x i8> %strided.vec3, ptr %out.vec3, align 64
  store <32 x i8> %strided.vec4, ptr %out.vec4, align 64
  store <32 x i8> %strided.vec5, ptr %out.vec5, align 64
  store <32 x i8> %strided.vec6, ptr %out.vec6, align 64
  ret void
}

define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
; SSE-LABEL: load_i8_stride7_vf64:
; SSE: # %bb.0:
; SSE-NEXT: subq $1528, %rsp # imm = 0x5F8
; SSE-NEXT: movdqa 208(%rdi), %xmm12
; SSE-NEXT: movdqa 192(%rdi), %xmm5
; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 176(%rdi), %xmm8
; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 112(%rdi), %xmm4
; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 128(%rdi), %xmm3
; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 160(%rdi), %xmm6
; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 144(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,0,65535,65535]
; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: pandn %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm6, %xmm1
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: movdqa %xmm2, %xmm7
; SSE-NEXT: por %xmm0, %xmm1
; SSE-NEXT: pxor %xmm6, %xmm6
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15]
; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,5,6]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
; SSE-NEXT: packuswb %xmm0, %xmm2
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255]
; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,0,65535,65535,65535,0,65535]
; SSE-NEXT: movdqa %xmm11, %xmm1
; SSE-NEXT: pandn %xmm3, %xmm1
; SSE-NEXT: movdqa %xmm4, %xmm3
; SSE-NEXT: pand %xmm11,
%xmm3 7241; SSE-NEXT: por %xmm1, %xmm3 7242; SSE-NEXT: movdqa %xmm3, %xmm1 7243; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] 7244; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,65535,0,65535,0,65535] 7245; SSE-NEXT: movdqa %xmm10, %xmm4 7246; SSE-NEXT: pandn %xmm1, %xmm4 7247; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] 7248; SSE-NEXT: pand %xmm10, %xmm3 7249; SSE-NEXT: por %xmm4, %xmm3 7250; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,1,3] 7251; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] 7252; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,1,1] 7253; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] 7254; SSE-NEXT: packuswb %xmm1, %xmm1 7255; SSE-NEXT: pand %xmm0, %xmm1 7256; SSE-NEXT: movdqa %xmm0, %xmm3 7257; SSE-NEXT: pandn %xmm2, %xmm3 7258; SSE-NEXT: por %xmm3, %xmm1 7259; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,65535,65535,0,65535] 7260; SSE-NEXT: movdqa %xmm9, %xmm2 7261; SSE-NEXT: pandn %xmm8, %xmm2 7262; SSE-NEXT: movdqa %xmm5, %xmm3 7263; SSE-NEXT: pand %xmm9, %xmm3 7264; SSE-NEXT: por %xmm2, %xmm3 7265; SSE-NEXT: movdqa %xmm3, %xmm2 7266; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] 7267; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7] 7268; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] 7269; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] 7270; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] 7271; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] 7272; SSE-NEXT: movdqa %xmm12, %xmm3 7273; SSE-NEXT: movdqa %xmm12, %xmm4 7274; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] 7275; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7276; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] 7277; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7278; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] 7279; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] 7280; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] 7281; SSE-NEXT: packuswb %xmm3, %xmm3 7282; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,65535,65535,0] 7283; SSE-NEXT: movdqa %xmm12, %xmm4 7284; SSE-NEXT: pandn %xmm3, %xmm4 7285; SSE-NEXT: packuswb %xmm2, %xmm2 7286; SSE-NEXT: pand %xmm12, %xmm2 7287; SSE-NEXT: por %xmm2, %xmm4 7288; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,0,0,0] 7289; SSE-NEXT: movdqa %xmm8, %xmm2 7290; SSE-NEXT: pandn %xmm4, %xmm2 7291; SSE-NEXT: pand %xmm8, %xmm1 7292; SSE-NEXT: por %xmm1, %xmm2 7293; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7294; SSE-NEXT: movdqa 256(%rdi), %xmm2 7295; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7296; SSE-NEXT: movdqa %xmm7, %xmm1 7297; SSE-NEXT: pandn %xmm2, %xmm1 7298; 
SSE-NEXT: movdqa 272(%rdi), %xmm2 7299; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill 7300; SSE-NEXT: pand %xmm7, %xmm2 7301; SSE-NEXT: por %xmm1, %xmm2 7302; SSE-NEXT: movdqa %xmm2, %xmm1 7303; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] 7304; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] 7305; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 7306; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] 7307; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] 7308; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6] 7309; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] 7310; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] 7311; SSE-NEXT: packuswb %xmm1, %xmm2 7312; SSE-NEXT: movdqa %xmm0, %xmm3 7313; SSE-NEXT: pandn %xmm2, %xmm3 7314; SSE-NEXT: movdqa 240(%rdi), %xmm2 7315; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7316; SSE-NEXT: movdqa %xmm11, %xmm1 7317; SSE-NEXT: pandn %xmm2, %xmm1 7318; SSE-NEXT: movdqa 224(%rdi), %xmm2 7319; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7320; SSE-NEXT: pand %xmm11, %xmm2 7321; SSE-NEXT: por %xmm1, %xmm2 7322; SSE-NEXT: movdqa %xmm2, %xmm1 7323; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] 7324; SSE-NEXT: movdqa %xmm10, %xmm4 7325; SSE-NEXT: pandn %xmm1, %xmm4 7326; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] 7327; SSE-NEXT: pand %xmm10, %xmm2 7328; SSE-NEXT: por %xmm4, %xmm2 7329; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,1,3] 7330; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] 7331; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,1,1] 7332; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] 7333; SSE-NEXT: packuswb %xmm1, %xmm1 7334; SSE-NEXT: pand %xmm0, %xmm1 7335; SSE-NEXT: por %xmm3, %xmm1 7336; SSE-NEXT: movdqa 288(%rdi), %xmm3 7337; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7338; SSE-NEXT: movdqa %xmm9, %xmm2 7339; SSE-NEXT: pandn %xmm3, %xmm2 7340; SSE-NEXT: movdqa 304(%rdi), %xmm3 7341; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7342; SSE-NEXT: pand %xmm9, %xmm3 7343; SSE-NEXT: por %xmm2, %xmm3 7344; SSE-NEXT: movdqa %xmm3, %xmm2 7345; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] 7346; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7] 7347; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] 7348; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] 7349; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] 7350; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] 7351; SSE-NEXT: movdqa 320(%rdi), %xmm3 7352; SSE-NEXT: movdqa %xmm3, %xmm4 7353; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = 
xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] 7354; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7355; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] 7356; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7357; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] 7358; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] 7359; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] 7360; SSE-NEXT: packuswb %xmm3, %xmm3 7361; SSE-NEXT: movdqa %xmm12, %xmm4 7362; SSE-NEXT: pandn %xmm3, %xmm4 7363; SSE-NEXT: packuswb %xmm2, %xmm2 7364; SSE-NEXT: pand %xmm12, %xmm2 7365; SSE-NEXT: por %xmm2, %xmm4 7366; SSE-NEXT: movdqa %xmm8, %xmm2 7367; SSE-NEXT: pandn %xmm4, %xmm2 7368; SSE-NEXT: pand %xmm8, %xmm1 7369; SSE-NEXT: por %xmm1, %xmm2 7370; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7371; SSE-NEXT: movdqa 368(%rdi), %xmm2 7372; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7373; SSE-NEXT: movdqa %xmm7, %xmm1 7374; SSE-NEXT: pandn %xmm2, %xmm1 7375; SSE-NEXT: movdqa 384(%rdi), %xmm2 7376; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7377; SSE-NEXT: pand %xmm7, %xmm2 7378; SSE-NEXT: por %xmm1, %xmm2 7379; SSE-NEXT: movdqa %xmm2, %xmm1 7380; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] 7381; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] 7382; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 7383; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] 7384; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] 7385; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6] 7386; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] 7387; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] 7388; SSE-NEXT: packuswb %xmm1, %xmm2 7389; SSE-NEXT: movdqa %xmm0, %xmm3 7390; SSE-NEXT: pandn %xmm2, %xmm3 7391; SSE-NEXT: movdqa 352(%rdi), %xmm2 7392; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7393; SSE-NEXT: movdqa %xmm11, %xmm1 7394; SSE-NEXT: pandn %xmm2, %xmm1 7395; SSE-NEXT: movdqa 336(%rdi), %xmm2 7396; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7397; SSE-NEXT: pand %xmm11, %xmm2 7398; SSE-NEXT: por %xmm1, %xmm2 7399; SSE-NEXT: movdqa %xmm2, %xmm1 7400; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] 7401; SSE-NEXT: movdqa %xmm10, %xmm4 7402; SSE-NEXT: pandn %xmm1, %xmm4 7403; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] 7404; SSE-NEXT: pand %xmm10, %xmm2 7405; SSE-NEXT: por %xmm4, %xmm2 7406; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,1,3] 7407; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] 7408; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,1,1] 7409; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] 7410; SSE-NEXT: packuswb %xmm1, %xmm1 7411; SSE-NEXT: pand %xmm0, %xmm1 7412; 
SSE-NEXT: por %xmm3, %xmm1 7413; SSE-NEXT: movdqa 400(%rdi), %xmm3 7414; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7415; SSE-NEXT: movdqa %xmm9, %xmm2 7416; SSE-NEXT: pandn %xmm3, %xmm2 7417; SSE-NEXT: movdqa 416(%rdi), %xmm14 7418; SSE-NEXT: movdqa %xmm14, %xmm3 7419; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7420; SSE-NEXT: pand %xmm9, %xmm3 7421; SSE-NEXT: por %xmm2, %xmm3 7422; SSE-NEXT: movdqa %xmm3, %xmm2 7423; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] 7424; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7] 7425; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] 7426; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] 7427; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] 7428; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] 7429; SSE-NEXT: movdqa 432(%rdi), %xmm3 7430; SSE-NEXT: movdqa %xmm3, %xmm4 7431; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] 7432; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7433; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] 7434; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7435; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] 7436; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] 7437; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] 7438; SSE-NEXT: packuswb %xmm3, %xmm3 7439; SSE-NEXT: movdqa %xmm12, %xmm4 7440; SSE-NEXT: pandn %xmm3, %xmm4 7441; SSE-NEXT: packuswb %xmm2, %xmm2 7442; SSE-NEXT: pand %xmm12, %xmm2 7443; SSE-NEXT: por %xmm2, %xmm4 7444; SSE-NEXT: movdqa %xmm8, %xmm2 7445; SSE-NEXT: pandn %xmm4, %xmm2 7446; SSE-NEXT: pand %xmm8, %xmm1 7447; SSE-NEXT: por %xmm1, %xmm2 7448; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7449; SSE-NEXT: movdqa 32(%rdi), %xmm15 7450; SSE-NEXT: movdqa %xmm7, %xmm1 7451; SSE-NEXT: pandn %xmm15, %xmm1 7452; SSE-NEXT: movdqa 48(%rdi), %xmm2 7453; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7454; SSE-NEXT: pand %xmm7, %xmm2 7455; SSE-NEXT: por %xmm1, %xmm2 7456; SSE-NEXT: movdqa %xmm2, %xmm1 7457; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] 7458; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] 7459; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 7460; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] 7461; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] 7462; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6] 7463; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] 7464; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] 7465; SSE-NEXT: packuswb %xmm1, %xmm2 7466; SSE-NEXT: movdqa 16(%rdi), %xmm3 7467; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7468; SSE-NEXT: 
movdqa %xmm11, %xmm1 7469; SSE-NEXT: pandn %xmm3, %xmm1 7470; SSE-NEXT: movdqa (%rdi), %xmm4 7471; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7472; SSE-NEXT: pand %xmm11, %xmm4 7473; SSE-NEXT: por %xmm1, %xmm4 7474; SSE-NEXT: movdqa %xmm4, %xmm1 7475; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] 7476; SSE-NEXT: movdqa %xmm10, %xmm5 7477; SSE-NEXT: pandn %xmm1, %xmm5 7478; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] 7479; SSE-NEXT: pand %xmm10, %xmm4 7480; SSE-NEXT: por %xmm5, %xmm4 7481; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,1,3] 7482; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] 7483; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,1,1] 7484; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] 7485; SSE-NEXT: packuswb %xmm1, %xmm1 7486; SSE-NEXT: pand %xmm0, %xmm1 7487; SSE-NEXT: pandn %xmm2, %xmm0 7488; SSE-NEXT: por %xmm0, %xmm1 7489; SSE-NEXT: movdqa 64(%rdi), %xmm2 7490; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7491; SSE-NEXT: movdqa %xmm9, %xmm0 7492; SSE-NEXT: pandn %xmm2, %xmm0 7493; SSE-NEXT: movdqa 80(%rdi), %xmm2 7494; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7495; SSE-NEXT: pand %xmm9, %xmm2 7496; SSE-NEXT: por %xmm0, %xmm2 7497; SSE-NEXT: movdqa %xmm2, %xmm0 7498; SSE-NEXT: pxor %xmm5, %xmm5 7499; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] 7500; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] 7501; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] 7502; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] 7503; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] 7504; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 7505; SSE-NEXT: movdqa 96(%rdi), %xmm2 7506; SSE-NEXT: movdqa %xmm2, %xmm3 7507; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] 7508; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7509; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] 7510; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7511; SSE-NEXT: pxor %xmm7, %xmm7 7512; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] 7513; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] 7514; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] 7515; SSE-NEXT: packuswb %xmm2, %xmm2 7516; SSE-NEXT: movdqa %xmm12, %xmm4 7517; SSE-NEXT: pandn %xmm2, %xmm4 7518; SSE-NEXT: packuswb %xmm0, %xmm0 7519; SSE-NEXT: pand %xmm12, %xmm0 7520; SSE-NEXT: por %xmm0, %xmm4 7521; SSE-NEXT: pand %xmm8, %xmm1 7522; SSE-NEXT: pandn %xmm4, %xmm8 7523; SSE-NEXT: por %xmm1, %xmm8 7524; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7525; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,0,65535,65535] 7526; SSE-NEXT: movdqa %xmm2, %xmm0 7527; SSE-NEXT: pandn 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7528; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 7529; SSE-NEXT: pand %xmm2, %xmm1 7530; SSE-NEXT: movdqa %xmm2, %xmm13 7531; SSE-NEXT: por %xmm0, %xmm1 7532; SSE-NEXT: movdqa %xmm1, %xmm2 7533; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] 7534; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,65535,65535] 7535; SSE-NEXT: movdqa %xmm0, %xmm4 7536; SSE-NEXT: pandn %xmm2, %xmm4 7537; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] 7538; SSE-NEXT: pand %xmm0, %xmm1 7539; SSE-NEXT: por %xmm4, %xmm1 7540; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] 7541; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] 7542; SSE-NEXT: psrld $16, %xmm2 7543; SSE-NEXT: packuswb %xmm2, %xmm1 7544; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255] 7545; SSE-NEXT: movdqa %xmm4, %xmm2 7546; SSE-NEXT: movdqa %xmm4, %xmm8 7547; SSE-NEXT: pandn %xmm1, %xmm2 7548; SSE-NEXT: movdqa %xmm9, %xmm1 7549; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7550; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 7551; SSE-NEXT: pand %xmm9, %xmm4 7552; SSE-NEXT: por %xmm1, %xmm4 7553; SSE-NEXT: movdqa %xmm4, %xmm1 7554; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] 7555; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,65535,65535,0,65535] 7556; SSE-NEXT: movdqa %xmm6, %xmm5 7557; SSE-NEXT: pandn %xmm1, %xmm5 7558; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] 7559; SSE-NEXT: pand %xmm6, %xmm4 7560; SSE-NEXT: por %xmm5, %xmm4 7561; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,3,2,3] 7562; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] 7563; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] 7564; SSE-NEXT: packuswb %xmm1, %xmm1 7565; SSE-NEXT: pand %xmm8, %xmm1 7566; SSE-NEXT: por %xmm2, %xmm1 7567; SSE-NEXT: movdqa %xmm11, %xmm2 7568; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 7569; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 7570; SSE-NEXT: pand %xmm11, %xmm4 7571; SSE-NEXT: por %xmm2, %xmm4 7572; SSE-NEXT: movdqa %xmm4, %xmm2 7573; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] 7574; SSE-NEXT: movdqa %xmm10, %xmm5 7575; SSE-NEXT: pandn %xmm2, %xmm5 7576; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] 7577; SSE-NEXT: pand %xmm10, %xmm4 7578; SSE-NEXT: por %xmm5, %xmm4 7579; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 7580; SSE-NEXT: pslld $16, %xmm2 7581; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 7582; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] 7583; SSE-NEXT: packuswb %xmm5, %xmm2 7584; SSE-NEXT: movdqa %xmm12, %xmm5 7585; SSE-NEXT: pandn %xmm2, %xmm5 
7586; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3] 7587; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,3,2,4,5,6,7] 7588; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] 7589; SSE-NEXT: packuswb %xmm2, %xmm2 7590; SSE-NEXT: pand %xmm12, %xmm2 7591; SSE-NEXT: por %xmm2, %xmm5 7592; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] 7593; SSE-NEXT: movdqa %xmm4, %xmm2 7594; SSE-NEXT: pandn %xmm5, %xmm2 7595; SSE-NEXT: pand %xmm4, %xmm1 7596; SSE-NEXT: movdqa %xmm4, %xmm3 7597; SSE-NEXT: por %xmm1, %xmm2 7598; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7599; SSE-NEXT: movdqa %xmm13, %xmm1 7600; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7601; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload 7602; SSE-NEXT: pand %xmm13, %xmm2 7603; SSE-NEXT: por %xmm1, %xmm2 7604; SSE-NEXT: movdqa %xmm2, %xmm1 7605; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] 7606; SSE-NEXT: movdqa %xmm0, %xmm4 7607; SSE-NEXT: pandn %xmm1, %xmm4 7608; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] 7609; SSE-NEXT: pand %xmm0, %xmm2 7610; SSE-NEXT: por %xmm4, %xmm2 7611; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] 7612; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] 7613; SSE-NEXT: psrld $16, %xmm1 7614; SSE-NEXT: packuswb %xmm1, %xmm2 7615; SSE-NEXT: movdqa %xmm8, %xmm4 7616; SSE-NEXT: pandn %xmm2, %xmm4 7617; SSE-NEXT: movdqa %xmm9, %xmm1 7618; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7619; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 7620; SSE-NEXT: pand %xmm9, %xmm2 7621; SSE-NEXT: por %xmm1, %xmm2 7622; SSE-NEXT: movdqa %xmm2, %xmm1 7623; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] 7624; SSE-NEXT: movdqa %xmm6, %xmm5 7625; SSE-NEXT: pandn %xmm1, %xmm5 7626; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] 7627; SSE-NEXT: pand %xmm6, %xmm2 7628; SSE-NEXT: por %xmm5, %xmm2 7629; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,3,2,3] 7630; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] 7631; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] 7632; SSE-NEXT: packuswb %xmm1, %xmm1 7633; SSE-NEXT: pand %xmm8, %xmm1 7634; SSE-NEXT: por %xmm4, %xmm1 7635; SSE-NEXT: movdqa %xmm11, %xmm2 7636; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 7637; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 7638; SSE-NEXT: pand %xmm11, %xmm4 7639; SSE-NEXT: por %xmm2, %xmm4 7640; SSE-NEXT: movdqa %xmm4, %xmm2 7641; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] 7642; SSE-NEXT: movdqa %xmm10, %xmm5 7643; SSE-NEXT: pandn %xmm2, %xmm5 7644; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] 7645; SSE-NEXT: pand %xmm10, %xmm4 7646; SSE-NEXT: por %xmm5, %xmm4 7647; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 
7648; SSE-NEXT: pslld $16, %xmm2 7649; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 7650; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] 7651; SSE-NEXT: packuswb %xmm5, %xmm2 7652; SSE-NEXT: movdqa %xmm12, %xmm5 7653; SSE-NEXT: pandn %xmm2, %xmm5 7654; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3] 7655; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,3,2,4,5,6,7] 7656; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] 7657; SSE-NEXT: packuswb %xmm2, %xmm2 7658; SSE-NEXT: pand %xmm12, %xmm2 7659; SSE-NEXT: por %xmm2, %xmm5 7660; SSE-NEXT: movdqa %xmm3, %xmm2 7661; SSE-NEXT: pandn %xmm5, %xmm2 7662; SSE-NEXT: pand %xmm3, %xmm1 7663; SSE-NEXT: por %xmm1, %xmm2 7664; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7665; SSE-NEXT: movdqa %xmm13, %xmm1 7666; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7667; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 7668; SSE-NEXT: pand %xmm13, %xmm2 7669; SSE-NEXT: por %xmm1, %xmm2 7670; SSE-NEXT: movdqa %xmm2, %xmm1 7671; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] 7672; SSE-NEXT: movdqa %xmm0, %xmm4 7673; SSE-NEXT: pandn %xmm1, %xmm4 7674; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] 7675; SSE-NEXT: pand %xmm0, %xmm2 7676; SSE-NEXT: por %xmm4, %xmm2 7677; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] 7678; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] 7679; SSE-NEXT: psrld $16, %xmm1 7680; SSE-NEXT: packuswb %xmm1, %xmm2 7681; SSE-NEXT: movdqa %xmm8, %xmm4 7682; SSE-NEXT: pandn %xmm2, %xmm4 7683; SSE-NEXT: movdqa %xmm9, %xmm1 7684; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 7685; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 7686; SSE-NEXT: pand %xmm9, %xmm2 7687; SSE-NEXT: por %xmm1, %xmm2 7688; SSE-NEXT: movdqa %xmm2, %xmm1 7689; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] 7690; SSE-NEXT: movdqa %xmm6, %xmm5 7691; SSE-NEXT: pandn %xmm1, %xmm5 7692; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] 7693; SSE-NEXT: pand %xmm6, %xmm2 7694; SSE-NEXT: por %xmm5, %xmm2 7695; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,3,2,3] 7696; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] 7697; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] 7698; SSE-NEXT: packuswb %xmm1, %xmm1 7699; SSE-NEXT: pand %xmm8, %xmm1 7700; SSE-NEXT: por %xmm4, %xmm1 7701; SSE-NEXT: movdqa %xmm11, %xmm2 7702; SSE-NEXT: pandn %xmm14, %xmm2 7703; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 7704; SSE-NEXT: pand %xmm11, %xmm4 7705; SSE-NEXT: por %xmm2, %xmm4 7706; SSE-NEXT: movdqa %xmm4, %xmm2 7707; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] 7708; SSE-NEXT: movdqa %xmm10, %xmm5 7709; SSE-NEXT: pandn %xmm2, %xmm5 7710; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = 
xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] 7711; SSE-NEXT: pand %xmm10, %xmm4 7712; SSE-NEXT: por %xmm5, %xmm4 7713; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 7714; SSE-NEXT: pslld $16, %xmm2 7715; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 7716; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] 7717; SSE-NEXT: packuswb %xmm5, %xmm2 7718; SSE-NEXT: movdqa %xmm12, %xmm5 7719; SSE-NEXT: pandn %xmm2, %xmm5 7720; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3] 7721; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,3,2,4,5,6,7] 7722; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] 7723; SSE-NEXT: packuswb %xmm2, %xmm2 7724; SSE-NEXT: pand %xmm12, %xmm2 7725; SSE-NEXT: por %xmm2, %xmm5 7726; SSE-NEXT: movdqa %xmm3, %xmm2 7727; SSE-NEXT: pandn %xmm5, %xmm2 7728; SSE-NEXT: pand %xmm3, %xmm1 7729; SSE-NEXT: por %xmm1, %xmm2 7730; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7731; SSE-NEXT: movdqa %xmm13, %xmm1 7732; SSE-NEXT: pandn %xmm15, %xmm1 7733; SSE-NEXT: movdqa %xmm15, %xmm3 7734; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 7735; SSE-NEXT: movdqa %xmm15, %xmm2 7736; SSE-NEXT: pand %xmm13, %xmm2 7737; SSE-NEXT: por %xmm1, %xmm2 7738; SSE-NEXT: movdqa %xmm2, %xmm1 7739; SSE-NEXT: pxor %xmm4, %xmm4 7740; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] 7741; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] 7742; SSE-NEXT: pxor %xmm5, %xmm5 7743; SSE-NEXT: pand %xmm0, %xmm2 7744; SSE-NEXT: pandn %xmm1, %xmm0 7745; SSE-NEXT: por %xmm2, %xmm0 7746; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] 7747; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,7,6] 7748; SSE-NEXT: psrld $16, %xmm1 7749; SSE-NEXT: packuswb %xmm1, %xmm0 7750; SSE-NEXT: movdqa %xmm8, %xmm1 7751; SSE-NEXT: pandn %xmm0, %xmm1 7752; SSE-NEXT: movdqa %xmm9, %xmm0 7753; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 7754; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 7755; SSE-NEXT: pand %xmm9, %xmm2 7756; SSE-NEXT: movdqa %xmm9, %xmm12 7757; SSE-NEXT: por %xmm0, %xmm2 7758; SSE-NEXT: movdqa %xmm2, %xmm0 7759; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] 7760; SSE-NEXT: movdqa %xmm6, %xmm4 7761; SSE-NEXT: pandn %xmm0, %xmm4 7762; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] 7763; SSE-NEXT: pand %xmm6, %xmm2 7764; SSE-NEXT: por %xmm4, %xmm2 7765; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,3,2,3] 7766; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] 7767; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] 7768; SSE-NEXT: packuswb %xmm0, %xmm0 7769; SSE-NEXT: pand %xmm8, %xmm0 7770; SSE-NEXT: por %xmm1, %xmm0 7771; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7772; SSE-NEXT: movdqa %xmm11, %xmm0 7773; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 7774; SSE-NEXT: pandn %xmm14, %xmm0 7775; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 
# 16-byte Reload 7776; SSE-NEXT: pand %xmm11, %xmm1 7777; SSE-NEXT: por %xmm0, %xmm1 7778; SSE-NEXT: movdqa %xmm1, %xmm0 7779; SSE-NEXT: pxor %xmm2, %xmm2 7780; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] 7781; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 7782; SSE-NEXT: pand %xmm10, %xmm1 7783; SSE-NEXT: pandn %xmm0, %xmm10 7784; SSE-NEXT: por %xmm1, %xmm10 7785; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7786; SSE-NEXT: movdqa %xmm11, %xmm0 7787; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 7788; SSE-NEXT: pandn %xmm7, %xmm0 7789; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 7790; SSE-NEXT: movdqa %xmm2, %xmm1 7791; SSE-NEXT: pand %xmm11, %xmm1 7792; SSE-NEXT: por %xmm0, %xmm1 7793; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7794; SSE-NEXT: movdqa %xmm11, %xmm0 7795; SSE-NEXT: movdqa (%rsp), %xmm8 # 16-byte Reload 7796; SSE-NEXT: pandn %xmm8, %xmm0 7797; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 7798; SSE-NEXT: movdqa %xmm5, %xmm1 7799; SSE-NEXT: pand %xmm11, %xmm1 7800; SSE-NEXT: por %xmm0, %xmm1 7801; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7802; SSE-NEXT: movdqa %xmm11, %xmm0 7803; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 7804; SSE-NEXT: pandn %xmm9, %xmm0 7805; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 7806; SSE-NEXT: movdqa %xmm1, %xmm4 7807; SSE-NEXT: pand %xmm11, %xmm4 7808; SSE-NEXT: por %xmm0, %xmm4 7809; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7810; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7811; SSE-NEXT: movdqa %xmm3, %xmm0 7812; SSE-NEXT: pand %xmm11, %xmm0 7813; SSE-NEXT: movdqa %xmm15, %xmm6 7814; SSE-NEXT: pandn %xmm15, %xmm11 7815; SSE-NEXT: por %xmm0, %xmm11 7816; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7817; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,0,65535,65535,0,65535,65535,65535] 7818; SSE-NEXT: movdqa %xmm15, %xmm0 7819; SSE-NEXT: pandn %xmm2, %xmm0 7820; SSE-NEXT: movdqa %xmm12, %xmm2 7821; SSE-NEXT: movdqa %xmm7, %xmm4 7822; SSE-NEXT: pandn %xmm7, %xmm2 7823; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7824; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3] 7825; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7826; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7827; SSE-NEXT: pand %xmm15, %xmm4 7828; SSE-NEXT: por %xmm0, %xmm4 7829; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7830; SSE-NEXT: movdqa %xmm15, %xmm0 7831; SSE-NEXT: pandn %xmm5, %xmm0 7832; SSE-NEXT: movdqa %xmm12, %xmm2 7833; SSE-NEXT: movdqa %xmm8, %xmm4 7834; SSE-NEXT: pandn %xmm8, %xmm2 7835; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7836; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,2,2,3] 7837; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7838; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7839; SSE-NEXT: pand %xmm15, %xmm4 7840; SSE-NEXT: por %xmm0, %xmm4 7841; SSE-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill 7842; SSE-NEXT: movdqa %xmm15, %xmm0 7843; SSE-NEXT: pandn %xmm1, %xmm0 7844; SSE-NEXT: movdqa %xmm12, %xmm2 7845; SSE-NEXT: movdqa %xmm12, %xmm1 7846; 
SSE-NEXT: movdqa %xmm9, %xmm4 7847; SSE-NEXT: pandn %xmm9, %xmm1 7848; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7849; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,2,2,3] 7850; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7851; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7852; SSE-NEXT: pand %xmm15, %xmm4 7853; SSE-NEXT: por %xmm0, %xmm4 7854; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7855; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 7856; SSE-NEXT: pand %xmm15, %xmm9 7857; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 7858; SSE-NEXT: pand %xmm15, %xmm12 7859; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7860; SSE-NEXT: pand %xmm15, %xmm0 7861; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7862; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7863; SSE-NEXT: pand %xmm15, %xmm0 7864; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7865; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7866; SSE-NEXT: pand %xmm15, %xmm0 7867; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7868; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7869; SSE-NEXT: pand %xmm15, %xmm0 7870; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7871; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7872; SSE-NEXT: pand %xmm15, %xmm0 7873; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7874; SSE-NEXT: movdqa %xmm14, %xmm0 7875; SSE-NEXT: pand %xmm15, %xmm0 7876; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7877; SSE-NEXT: movdqa %xmm2, %xmm4 7878; SSE-NEXT: movdqa %xmm6, %xmm0 7879; SSE-NEXT: pandn %xmm6, %xmm4 7880; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7881; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,2,2,3] 7882; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7883; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7884; SSE-NEXT: pand %xmm15, %xmm0 7885; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7886; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7887; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7888; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7889; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7890; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7891; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7892; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7893; SSE-NEXT: pandn %xmm3, %xmm15 7894; SSE-NEXT: por %xmm0, %xmm15 7895; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7896; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535] 7897; SSE-NEXT: movdqa %xmm1, %xmm2 7898; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 7899; SSE-NEXT: pandn %xmm7, %xmm2 7900; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7901; SSE-NEXT: movdqa %xmm7, %xmm10 7902; SSE-NEXT: movdqa %xmm7, %xmm4 7903; SSE-NEXT: movdqa %xmm1, %xmm2 7904; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 7905; SSE-NEXT: pandn %xmm6, %xmm2 7906; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7907; SSE-NEXT: movdqa %xmm6, %xmm8 7908; SSE-NEXT: movdqa %xmm1, %xmm2 7909; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm5 # 16-byte Reload 7910; SSE-NEXT: pandn %xmm5, %xmm2 7911; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7912; SSE-NEXT: movdqa %xmm5, %xmm1 7913; SSE-NEXT: movdqa %xmm5, %xmm11 7914; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 7915; SSE-NEXT: movdqa %xmm3, %xmm13 7916; SSE-NEXT: pslld $16, %xmm13 7917; SSE-NEXT: psrldq {{.*#+}} xmm10 = xmm10[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 7918; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 7919; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3] 7920; SSE-NEXT: movdqa %xmm6, %xmm0 7921; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 7922; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 7923; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] 7924; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7925; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 7926; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7927; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 7928; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7929; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 7930; SSE-NEXT: movdqa %xmm1, %xmm2 7931; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero 7932; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] 7933; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7934; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7] 7935; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7936; SSE-NEXT: movdqa %xmm14, %xmm4 7937; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7] 7938; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7939; SSE-NEXT: movdqa %xmm15, %xmm8 7940; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] 7941; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7942; SSE-NEXT: movdqa %xmm0, %xmm11 7943; SSE-NEXT: movdqa %xmm1, %xmm0 7944; SSE-NEXT: movdqa %xmm1, %xmm2 7945; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] 7946; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7947; SSE-NEXT: movdqa %xmm3, %xmm1 7948; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 7949; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7950; SSE-NEXT: pxor %xmm0, %xmm0 7951; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] 7952; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,7,5,6,7] 7953; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,1,2,0] 7954; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm14[0,1,2,3,6,4,6,5] 7955; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,65535,0,65535] 7956; SSE-NEXT: pand %xmm14, %xmm3 7957; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7958; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 7959; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7960; SSE-NEXT: punpcklbw {{.*#+}} 
xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] 7961; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm3[0,1,2,3,7,5,6,7] 7962; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,2,0] 7963; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,6,4,6,5] 7964; SSE-NEXT: pand %xmm14, %xmm3 7965; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7966; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 7967; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7968; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] 7969; SSE-NEXT: pxor %xmm3, %xmm3 7970; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,5,6,7] 7971; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,2,0] 7972; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,6,4,6,5] 7973; SSE-NEXT: pand %xmm14, %xmm0 7974; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7975; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 7976; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7977; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] 7978; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm0[0,1,2,3,7,5,6,7] 7979; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,2,0] 7980; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,6,4,6,5] 7981; SSE-NEXT: movdqa %xmm14, %xmm0 7982; SSE-NEXT: pand %xmm14, %xmm3 7983; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7984; SSE-NEXT: movdqa %xmm14, %xmm3 7985; SSE-NEXT: pandn %xmm4, %xmm3 7986; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7987; SSE-NEXT: pand %xmm14, %xmm7 7988; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7989; SSE-NEXT: movdqa %xmm14, %xmm3 7990; SSE-NEXT: pandn %xmm8, %xmm3 7991; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7992; SSE-NEXT: pand %xmm14, %xmm6 7993; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7994; SSE-NEXT: movdqa %xmm14, %xmm3 7995; SSE-NEXT: movdqa %xmm11, %xmm6 7996; SSE-NEXT: pandn %xmm11, %xmm3 7997; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7998; SSE-NEXT: pand %xmm14, %xmm5 7999; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8000; SSE-NEXT: movdqa %xmm2, %xmm3 8001; SSE-NEXT: pand %xmm14, %xmm3 8002; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8003; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8004; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8005; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8006; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8007; SSE-NEXT: pandn %xmm1, %xmm0 8008; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8009; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 8010; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8011; SSE-NEXT: pxor %xmm0, %xmm0 8012; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] 8013; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,65535,65535,65535,65535] 8014; SSE-NEXT: pand %xmm11, %xmm3 8015; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill 8016; SSE-NEXT: pand %xmm11, %xmm4 8017; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8018; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload 8019; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8020; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] 8021; SSE-NEXT: pand %xmm11, %xmm3 8022; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill 8023; SSE-NEXT: pand %xmm11, %xmm8 8024; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8025; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 8026; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8027; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] 8028; SSE-NEXT: pxor %xmm8, %xmm8 8029; SSE-NEXT: pand %xmm11, %xmm3 8030; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8031; SSE-NEXT: pand %xmm11, %xmm6 8032; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8033; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 8034; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8035; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] 8036; SSE-NEXT: pand %xmm11, %xmm0 8037; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8038; SSE-NEXT: pand %xmm11, %xmm1 8039; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8040; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8041; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8042; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8043; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8044; SSE-NEXT: pandn %xmm2, %xmm11 8045; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8046; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3] 8047; SSE-NEXT: packuswb %xmm2, %xmm3 8048; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,65535,0] 8049; SSE-NEXT: movdqa %xmm0, %xmm15 8050; SSE-NEXT: pandn %xmm3, %xmm15 8051; SSE-NEXT: pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 8052; SSE-NEXT: # xmm3 = mem[0,3,2,3] 8053; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,3,2,4,5,6,7] 8054; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,6,7] 8055; SSE-NEXT: packuswb %xmm3, %xmm3 8056; SSE-NEXT: pand %xmm0, %xmm3 8057; SSE-NEXT: movdqa %xmm0, %xmm4 8058; SSE-NEXT: por %xmm3, %xmm15 8059; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] 8060; SSE-NEXT: movdqa %xmm0, %xmm3 8061; SSE-NEXT: pandn %xmm15, %xmm3 8062; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 8063; SSE-NEXT: pand %xmm0, %xmm2 8064; SSE-NEXT: movdqa %xmm0, %xmm13 8065; SSE-NEXT: por %xmm2, %xmm3 8066; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8067; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,0,65535] 8068; SSE-NEXT: movdqa %xmm0, %xmm3 8069; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 8070; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 8071; SSE-NEXT: pand %xmm0, %xmm15 8072; SSE-NEXT: movdqa %xmm0, %xmm5 8073; SSE-NEXT: por 
%xmm3, %xmm15 8074; SSE-NEXT: movdqa %xmm15, %xmm3 8075; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] 8076; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,0,65535,65535,65535,65535,0,65535] 8077; SSE-NEXT: movdqa %xmm14, %xmm0 8078; SSE-NEXT: pandn %xmm3, %xmm0 8079; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm8[8],xmm15[9],xmm8[9],xmm15[10],xmm8[10],xmm15[11],xmm8[11],xmm15[12],xmm8[12],xmm15[13],xmm8[13],xmm15[14],xmm8[14],xmm15[15],xmm8[15] 8080; SSE-NEXT: pand %xmm14, %xmm15 8081; SSE-NEXT: por %xmm0, %xmm15 8082; SSE-NEXT: packuswb %xmm10, %xmm0 8083; SSE-NEXT: movdqa %xmm4, %xmm2 8084; SSE-NEXT: pandn %xmm0, %xmm2 8085; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,3,2,3] 8086; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] 8087; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] 8088; SSE-NEXT: packuswb %xmm0, %xmm0 8089; SSE-NEXT: pand %xmm4, %xmm0 8090; SSE-NEXT: por %xmm0, %xmm2 8091; SSE-NEXT: movdqa %xmm13, %xmm3 8092; SSE-NEXT: pandn %xmm2, %xmm3 8093; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 8094; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 8095; SSE-NEXT: por %xmm0, %xmm9 8096; SSE-NEXT: movdqa %xmm9, %xmm0 8097; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] 8098; SSE-NEXT: movdqa %xmm14, %xmm2 8099; SSE-NEXT: pandn %xmm0, %xmm2 8100; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] 8101; SSE-NEXT: pand %xmm14, %xmm9 8102; SSE-NEXT: por %xmm2, %xmm9 8103; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 8104; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,3,2,3] 8105; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 8106; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 8107; SSE-NEXT: movdqa %xmm1, %xmm2 8108; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] 8109; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535] 8110; SSE-NEXT: movdqa %xmm0, %xmm15 8111; SSE-NEXT: pandn %xmm2, %xmm15 8112; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] 8113; SSE-NEXT: pand %xmm0, %xmm1 8114; SSE-NEXT: por %xmm15, %xmm1 8115; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,1,2,1] 8116; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,4,7] 8117; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] 8118; SSE-NEXT: packuswb %xmm2, %xmm11 8119; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255] 8120; SSE-NEXT: movdqa %xmm6, %xmm2 8121; SSE-NEXT: pandn %xmm11, %xmm2 8122; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,2,1,3] 8123; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] 8124; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,3,3] 8125; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] 8126; SSE-NEXT: packuswb %xmm1, %xmm1 8127; SSE-NEXT: pand %xmm6, %xmm1 8128; SSE-NEXT: por %xmm1, %xmm2 8129; SSE-NEXT: pand %xmm13, %xmm2 8130; SSE-NEXT: por %xmm3, %xmm2 8131; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
8132; SSE-NEXT: movdqa %xmm5, %xmm15 8133; SSE-NEXT: movdqa %xmm5, %xmm1 8134; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 8135; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 8136; SSE-NEXT: pand %xmm5, %xmm2 8137; SSE-NEXT: por %xmm1, %xmm2 8138; SSE-NEXT: movdqa %xmm2, %xmm1 8139; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] 8140; SSE-NEXT: movdqa %xmm14, %xmm3 8141; SSE-NEXT: pandn %xmm1, %xmm3 8142; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] 8143; SSE-NEXT: pand %xmm14, %xmm2 8144; SSE-NEXT: por %xmm3, %xmm2 8145; SSE-NEXT: packuswb {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 8146; SSE-NEXT: movdqa %xmm4, %xmm3 8147; SSE-NEXT: pandn %xmm1, %xmm3 8148; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,3,2,3] 8149; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] 8150; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] 8151; SSE-NEXT: packuswb %xmm1, %xmm1 8152; SSE-NEXT: pand %xmm4, %xmm1 8153; SSE-NEXT: movdqa %xmm4, %xmm10 8154; SSE-NEXT: por %xmm1, %xmm3 8155; SSE-NEXT: movdqa %xmm13, %xmm1 8156; SSE-NEXT: pandn %xmm3, %xmm1 8157; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 8158; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 8159; SSE-NEXT: por %xmm2, %xmm12 8160; SSE-NEXT: movdqa %xmm12, %xmm2 8161; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] 8162; SSE-NEXT: movdqa %xmm14, %xmm3 8163; SSE-NEXT: pandn %xmm2, %xmm3 8164; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7] 8165; SSE-NEXT: pand %xmm14, %xmm12 8166; SSE-NEXT: por %xmm3, %xmm12 8167; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 8168; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,3,2,3] 8169; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 8170; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] 8171; SSE-NEXT: movdqa %xmm4, %xmm2 8172; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] 8173; SSE-NEXT: movdqa %xmm0, %xmm3 8174; SSE-NEXT: pandn %xmm2, %xmm3 8175; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] 8176; SSE-NEXT: pand %xmm0, %xmm4 8177; SSE-NEXT: por %xmm3, %xmm4 8178; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,1,2,1] 8179; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] 8180; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] 8181; SSE-NEXT: packuswb %xmm2, %xmm3 8182; SSE-NEXT: movdqa %xmm6, %xmm4 8183; SSE-NEXT: pandn %xmm3, %xmm4 8184; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,2,1,3] 8185; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] 8186; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,3,3] 8187; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7] 8188; SSE-NEXT: packuswb %xmm2, %xmm2 8189; SSE-NEXT: pand %xmm6, %xmm2 8190; SSE-NEXT: por %xmm2, %xmm4 8191; SSE-NEXT: pand %xmm13, %xmm4 8192; SSE-NEXT: por 
%xmm1, %xmm4 8193; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8194; SSE-NEXT: movdqa %xmm5, %xmm1 8195; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 8196; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 8197; SSE-NEXT: pand %xmm5, %xmm2 8198; SSE-NEXT: por %xmm1, %xmm2 8199; SSE-NEXT: movdqa %xmm2, %xmm1 8200; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] 8201; SSE-NEXT: movdqa %xmm14, %xmm3 8202; SSE-NEXT: pandn %xmm1, %xmm3 8203; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] 8204; SSE-NEXT: pand %xmm14, %xmm2 8205; SSE-NEXT: por %xmm3, %xmm2 8206; SSE-NEXT: packuswb {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 8207; SSE-NEXT: movdqa %xmm10, %xmm3 8208; SSE-NEXT: pandn %xmm1, %xmm3 8209; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,3,2,3] 8210; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] 8211; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] 8212; SSE-NEXT: packuswb %xmm1, %xmm1 8213; SSE-NEXT: pand %xmm10, %xmm1 8214; SSE-NEXT: por %xmm1, %xmm3 8215; SSE-NEXT: movdqa %xmm13, %xmm1 8216; SSE-NEXT: pandn %xmm3, %xmm1 8217; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 8218; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 8219; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 8220; SSE-NEXT: por %xmm2, %xmm4 8221; SSE-NEXT: movdqa %xmm4, %xmm2 8222; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] 8223; SSE-NEXT: movdqa %xmm14, %xmm3 8224; SSE-NEXT: pandn %xmm2, %xmm3 8225; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] 8226; SSE-NEXT: pand %xmm14, %xmm4 8227; SSE-NEXT: por %xmm3, %xmm4 8228; SSE-NEXT: movdqa %xmm4, %xmm5 8229; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 8230; SSE-NEXT: # xmm2 = mem[1,3,2,3] 8231; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 8232; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] 8233; SSE-NEXT: movdqa %xmm4, %xmm2 8234; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] 8235; SSE-NEXT: movdqa %xmm0, %xmm3 8236; SSE-NEXT: pandn %xmm2, %xmm3 8237; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] 8238; SSE-NEXT: pand %xmm0, %xmm4 8239; SSE-NEXT: por %xmm3, %xmm4 8240; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,1,2,1] 8241; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] 8242; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] 8243; SSE-NEXT: packuswb %xmm2, %xmm3 8244; SSE-NEXT: movdqa %xmm6, %xmm4 8245; SSE-NEXT: pandn %xmm3, %xmm4 8246; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,2,1,3] 8247; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] 8248; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,3,3] 8249; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7] 8250; SSE-NEXT: packuswb %xmm2, %xmm2 8251; SSE-NEXT: pand 
%xmm6, %xmm2 8252; SSE-NEXT: por %xmm2, %xmm4 8253; SSE-NEXT: pand %xmm13, %xmm4 8254; SSE-NEXT: por %xmm1, %xmm4 8255; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8256; SSE-NEXT: movdqa %xmm15, %xmm1 8257; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 8258; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 8259; SSE-NEXT: pand %xmm15, %xmm2 8260; SSE-NEXT: por %xmm1, %xmm2 8261; SSE-NEXT: movdqa %xmm2, %xmm1 8262; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] 8263; SSE-NEXT: movdqa %xmm14, %xmm3 8264; SSE-NEXT: pandn %xmm1, %xmm3 8265; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] 8266; SSE-NEXT: pand %xmm14, %xmm2 8267; SSE-NEXT: por %xmm3, %xmm2 8268; SSE-NEXT: packuswb {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 8269; SSE-NEXT: movdqa %xmm10, %xmm3 8270; SSE-NEXT: pandn %xmm1, %xmm3 8271; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,3,2,3] 8272; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] 8273; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] 8274; SSE-NEXT: packuswb %xmm1, %xmm1 8275; SSE-NEXT: pand %xmm10, %xmm1 8276; SSE-NEXT: por %xmm1, %xmm3 8277; SSE-NEXT: movdqa %xmm13, %xmm1 8278; SSE-NEXT: pandn %xmm3, %xmm1 8279; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 8280; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 8281; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 8282; SSE-NEXT: por %xmm2, %xmm4 8283; SSE-NEXT: movdqa %xmm4, %xmm2 8284; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] 8285; SSE-NEXT: movdqa %xmm14, %xmm3 8286; SSE-NEXT: pandn %xmm2, %xmm3 8287; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] 8288; SSE-NEXT: pand %xmm14, %xmm4 8289; SSE-NEXT: por %xmm3, %xmm4 8290; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 8291; SSE-NEXT: # xmm2 = mem[1,3,2,3] 8292; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 8293; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 8294; SSE-NEXT: movdqa %xmm3, %xmm2 8295; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] 8296; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] 8297; SSE-NEXT: pand %xmm0, %xmm3 8298; SSE-NEXT: pandn %xmm2, %xmm0 8299; SSE-NEXT: por %xmm3, %xmm0 8300; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] 8301; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] 8302; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] 8303; SSE-NEXT: packuswb %xmm2, %xmm0 8304; SSE-NEXT: movdqa %xmm6, %xmm2 8305; SSE-NEXT: pandn %xmm0, %xmm2 8306; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,1,3] 8307; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] 8308; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,3,3] 8309; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] 8310; SSE-NEXT: packuswb 
%xmm0, %xmm0 8311; SSE-NEXT: pand %xmm6, %xmm0 8312; SSE-NEXT: por %xmm0, %xmm2 8313; SSE-NEXT: pand %xmm13, %xmm2 8314; SSE-NEXT: por %xmm1, %xmm2 8315; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8316; SSE-NEXT: movdqa %xmm15, %xmm9 8317; SSE-NEXT: movdqa %xmm15, %xmm0 8318; SSE-NEXT: pandn %xmm7, %xmm0 8319; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 8320; SSE-NEXT: pand %xmm15, %xmm1 8321; SSE-NEXT: por %xmm0, %xmm1 8322; SSE-NEXT: movdqa %xmm1, %xmm0 8323; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] 8324; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] 8325; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 8326; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] 8327; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] 8328; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] 8329; SSE-NEXT: psrlq $48, %xmm0 8330; SSE-NEXT: packuswb %xmm0, %xmm1 8331; SSE-NEXT: movdqa %xmm6, %xmm0 8332; SSE-NEXT: pandn %xmm1, %xmm0 8333; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,65535,0,65535,65535] 8334; SSE-NEXT: movdqa %xmm12, %xmm1 8335; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 8336; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 8337; SSE-NEXT: pand %xmm12, %xmm2 8338; SSE-NEXT: por %xmm1, %xmm2 8339; SSE-NEXT: movdqa %xmm2, %xmm1 8340; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] 8341; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,0,65535,65535,65535,65535] 8342; SSE-NEXT: movdqa %xmm5, %xmm3 8343; SSE-NEXT: pandn %xmm1, %xmm3 8344; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] 8345; SSE-NEXT: pand %xmm5, %xmm2 8346; SSE-NEXT: por %xmm3, %xmm2 8347; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[3,2,1,0,4,5,6,7] 8348; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] 8349; SSE-NEXT: packuswb %xmm1, %xmm1 8350; SSE-NEXT: pand %xmm6, %xmm1 8351; SSE-NEXT: por %xmm0, %xmm1 8352; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 8353; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 8354; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 8355; SSE-NEXT: por %xmm0, %xmm3 8356; SSE-NEXT: movdqa %xmm3, %xmm0 8357; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] 8358; SSE-NEXT: movdqa %xmm14, %xmm2 8359; SSE-NEXT: pandn %xmm0, %xmm2 8360; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] 8361; SSE-NEXT: pand %xmm14, %xmm3 8362; SSE-NEXT: por %xmm2, %xmm3 8363; SSE-NEXT: pshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 8364; SSE-NEXT: # xmm0 = mem[2,1,2,3,4,5,6,7] 8365; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 8366; SSE-NEXT: packuswb %xmm0, %xmm0 8367; SSE-NEXT: movdqa %xmm10, %xmm7 8368; SSE-NEXT: movdqa %xmm10, %xmm2 8369; SSE-NEXT: 
pandn %xmm0, %xmm2 8370; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,2,1,0,4,5,6,7] 8371; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] 8372; SSE-NEXT: packuswb %xmm0, %xmm0 8373; SSE-NEXT: pand %xmm10, %xmm0 8374; SSE-NEXT: por %xmm0, %xmm2 8375; SSE-NEXT: movdqa %xmm13, %xmm10 8376; SSE-NEXT: movdqa %xmm13, %xmm0 8377; SSE-NEXT: pandn %xmm2, %xmm0 8378; SSE-NEXT: pand %xmm13, %xmm1 8379; SSE-NEXT: por %xmm1, %xmm0 8380; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8381; SSE-NEXT: movdqa %xmm15, %xmm0 8382; SSE-NEXT: pandn %xmm11, %xmm0 8383; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 8384; SSE-NEXT: pand %xmm15, %xmm1 8385; SSE-NEXT: por %xmm0, %xmm1 8386; SSE-NEXT: movdqa %xmm1, %xmm0 8387; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] 8388; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] 8389; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 8390; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] 8391; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] 8392; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] 8393; SSE-NEXT: psrlq $48, %xmm0 8394; SSE-NEXT: packuswb %xmm0, %xmm1 8395; SSE-NEXT: movdqa %xmm6, %xmm0 8396; SSE-NEXT: pandn %xmm1, %xmm0 8397; SSE-NEXT: movdqa %xmm12, %xmm1 8398; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 8399; SSE-NEXT: pandn %xmm15, %xmm1 8400; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 8401; SSE-NEXT: movdqa %xmm4, %xmm2 8402; SSE-NEXT: pand %xmm12, %xmm2 8403; SSE-NEXT: por %xmm1, %xmm2 8404; SSE-NEXT: movdqa %xmm2, %xmm1 8405; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] 8406; SSE-NEXT: movdqa %xmm5, %xmm3 8407; SSE-NEXT: pandn %xmm1, %xmm3 8408; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] 8409; SSE-NEXT: pand %xmm5, %xmm2 8410; SSE-NEXT: por %xmm3, %xmm2 8411; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[3,2,1,0,4,5,6,7] 8412; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] 8413; SSE-NEXT: packuswb %xmm1, %xmm1 8414; SSE-NEXT: pand %xmm6, %xmm1 8415; SSE-NEXT: por %xmm0, %xmm1 8416; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 8417; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 8418; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 8419; SSE-NEXT: por %xmm0, %xmm3 8420; SSE-NEXT: movdqa %xmm3, %xmm0 8421; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] 8422; SSE-NEXT: movdqa %xmm14, %xmm2 8423; SSE-NEXT: pandn %xmm0, %xmm2 8424; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] 8425; SSE-NEXT: pand %xmm14, %xmm3 8426; SSE-NEXT: por %xmm2, %xmm3 8427; SSE-NEXT: pshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 8428; SSE-NEXT: # xmm0 = mem[2,1,2,3,4,5,6,7] 8429; SSE-NEXT: pshufd {{.*#+}} xmm0 = 
xmm0[0,0,0,0] 8430; SSE-NEXT: packuswb %xmm0, %xmm0 8431; SSE-NEXT: movdqa %xmm7, %xmm2 8432; SSE-NEXT: pandn %xmm0, %xmm2 8433; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,2,1,0,4,5,6,7] 8434; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] 8435; SSE-NEXT: packuswb %xmm0, %xmm0 8436; SSE-NEXT: pand %xmm7, %xmm0 8437; SSE-NEXT: por %xmm0, %xmm2 8438; SSE-NEXT: movdqa %xmm13, %xmm0 8439; SSE-NEXT: pandn %xmm2, %xmm0 8440; SSE-NEXT: pand %xmm13, %xmm1 8441; SSE-NEXT: por %xmm1, %xmm0 8442; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8443; SSE-NEXT: movdqa %xmm9, %xmm0 8444; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 8445; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 8446; SSE-NEXT: pand %xmm9, %xmm1 8447; SSE-NEXT: movdqa %xmm9, %xmm13 8448; SSE-NEXT: por %xmm0, %xmm1 8449; SSE-NEXT: movdqa %xmm1, %xmm0 8450; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] 8451; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] 8452; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 8453; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] 8454; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] 8455; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] 8456; SSE-NEXT: psrlq $48, %xmm0 8457; SSE-NEXT: packuswb %xmm0, %xmm1 8458; SSE-NEXT: movdqa %xmm6, %xmm0 8459; SSE-NEXT: pandn %xmm1, %xmm0 8460; SSE-NEXT: movdqa %xmm12, %xmm1 8461; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 8462; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 8463; SSE-NEXT: movdqa %xmm9, %xmm2 8464; SSE-NEXT: pand %xmm12, %xmm2 8465; SSE-NEXT: por %xmm1, %xmm2 8466; SSE-NEXT: movdqa %xmm2, %xmm1 8467; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] 8468; SSE-NEXT: movdqa %xmm5, %xmm3 8469; SSE-NEXT: pandn %xmm1, %xmm3 8470; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] 8471; SSE-NEXT: pand %xmm5, %xmm2 8472; SSE-NEXT: por %xmm3, %xmm2 8473; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[3,2,1,0,4,5,6,7] 8474; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] 8475; SSE-NEXT: packuswb %xmm1, %xmm1 8476; SSE-NEXT: pand %xmm6, %xmm1 8477; SSE-NEXT: por %xmm0, %xmm1 8478; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 8479; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 8480; SSE-NEXT: pandn %xmm11, %xmm0 8481; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 8482; SSE-NEXT: por %xmm0, %xmm3 8483; SSE-NEXT: movdqa %xmm3, %xmm0 8484; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] 8485; SSE-NEXT: movdqa %xmm14, %xmm2 8486; SSE-NEXT: pandn %xmm0, %xmm2 8487; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] 8488; SSE-NEXT: pand %xmm14, %xmm3 8489; SSE-NEXT: por %xmm2, %xmm3 8490; SSE-NEXT: pshuflw 
$230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 8491; SSE-NEXT: # xmm0 = mem[2,1,2,3,4,5,6,7] 8492; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 8493; SSE-NEXT: packuswb %xmm0, %xmm0 8494; SSE-NEXT: movdqa %xmm7, %xmm2 8495; SSE-NEXT: pandn %xmm0, %xmm2 8496; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,2,1,0,4,5,6,7] 8497; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] 8498; SSE-NEXT: packuswb %xmm0, %xmm0 8499; SSE-NEXT: pand %xmm7, %xmm0 8500; SSE-NEXT: por %xmm0, %xmm2 8501; SSE-NEXT: movdqa %xmm10, %xmm0 8502; SSE-NEXT: pandn %xmm2, %xmm0 8503; SSE-NEXT: pand %xmm10, %xmm1 8504; SSE-NEXT: por %xmm1, %xmm0 8505; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8506; SSE-NEXT: movdqa %xmm13, %xmm0 8507; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 8508; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 8509; SSE-NEXT: pand %xmm13, %xmm2 8510; SSE-NEXT: por %xmm0, %xmm2 8511; SSE-NEXT: movdqa %xmm2, %xmm0 8512; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] 8513; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] 8514; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] 8515; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,3,2,3,4,5,6,7] 8516; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] 8517; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6] 8518; SSE-NEXT: psrlq $48, %xmm0 8519; SSE-NEXT: packuswb %xmm0, %xmm1 8520; SSE-NEXT: movdqa %xmm12, %xmm0 8521; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 8522; SSE-NEXT: pandn %xmm13, %xmm0 8523; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 8524; SSE-NEXT: pand %xmm12, %xmm2 8525; SSE-NEXT: por %xmm0, %xmm2 8526; SSE-NEXT: movdqa %xmm2, %xmm0 8527; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] 8528; SSE-NEXT: movdqa %xmm5, %xmm3 8529; SSE-NEXT: pandn %xmm0, %xmm3 8530; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] 8531; SSE-NEXT: pand %xmm5, %xmm2 8532; SSE-NEXT: por %xmm3, %xmm2 8533; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[3,2,1,0,4,5,6,7] 8534; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] 8535; SSE-NEXT: packuswb %xmm0, %xmm0 8536; SSE-NEXT: pand %xmm6, %xmm0 8537; SSE-NEXT: pandn %xmm1, %xmm6 8538; SSE-NEXT: por %xmm6, %xmm0 8539; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 8540; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 8541; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 8542; SSE-NEXT: por %xmm1, %xmm3 8543; SSE-NEXT: movdqa %xmm3, %xmm1 8544; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] 8545; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] 8546; SSE-NEXT: pand %xmm14, %xmm3 8547; SSE-NEXT: pandn %xmm1, %xmm14 8548; SSE-NEXT: por %xmm3, %xmm14 8549; SSE-NEXT: pshuflw 
$230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 8550; SSE-NEXT: # xmm1 = mem[2,1,2,3,4,5,6,7] 8551; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 8552; SSE-NEXT: packuswb %xmm1, %xmm1 8553; SSE-NEXT: movdqa %xmm7, %xmm2 8554; SSE-NEXT: pandn %xmm1, %xmm2 8555; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[0,2,1,0,4,5,6,7] 8556; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,6,7] 8557; SSE-NEXT: packuswb %xmm1, %xmm1 8558; SSE-NEXT: pand %xmm7, %xmm1 8559; SSE-NEXT: por %xmm1, %xmm2 8560; SSE-NEXT: movdqa %xmm10, %xmm1 8561; SSE-NEXT: pandn %xmm2, %xmm1 8562; SSE-NEXT: pand %xmm10, %xmm0 8563; SSE-NEXT: por %xmm0, %xmm1 8564; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8565; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,0,65535,65535,0,65535,65535] 8566; SSE-NEXT: movdqa %xmm8, %xmm0 8567; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 8568; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 8569; SSE-NEXT: pand %xmm8, %xmm1 8570; SSE-NEXT: por %xmm0, %xmm1 8571; SSE-NEXT: movdqa %xmm1, %xmm0 8572; SSE-NEXT: pxor %xmm2, %xmm2 8573; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] 8574; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] 8575; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 8576; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] 8577; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 8578; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 8579; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 8580; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] 8581; SSE-NEXT: pxor %xmm6, %xmm6 8582; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 8583; SSE-NEXT: pandn %xmm0, %xmm3 8584; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 8585; SSE-NEXT: por %xmm3, %xmm2 8586; SSE-NEXT: packuswb %xmm0, %xmm2 8587; SSE-NEXT: packuswb %xmm1, %xmm1 8588; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,3,3] 8589; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] 8590; SSE-NEXT: movdqa %xmm12, %xmm1 8591; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 8592; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 8593; SSE-NEXT: movdqa %xmm14, %xmm2 8594; SSE-NEXT: pand %xmm12, %xmm2 8595; SSE-NEXT: por %xmm1, %xmm2 8596; SSE-NEXT: movdqa %xmm2, %xmm1 8597; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] 8598; SSE-NEXT: movdqa %xmm5, %xmm3 8599; SSE-NEXT: pandn %xmm1, %xmm3 8600; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] 8601; SSE-NEXT: pand %xmm5, %xmm2 8602; SSE-NEXT: por %xmm3, %xmm2 8603; SSE-NEXT: pshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 8604; SSE-NEXT: # xmm1 = mem[0,1,2,1] 8605; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] 8606; SSE-NEXT: packuswb %xmm1, %xmm1 8607; SSE-NEXT: movdqa %xmm7, %xmm3 8608; SSE-NEXT: pandn %xmm1, %xmm3 
8609; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] 8610; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] 8611; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] 8612; SSE-NEXT: packuswb %xmm1, %xmm1 8613; SSE-NEXT: pand %xmm7, %xmm1 8614; SSE-NEXT: por %xmm1, %xmm3 8615; SSE-NEXT: movdqa %xmm10, %xmm1 8616; SSE-NEXT: pandn %xmm3, %xmm1 8617; SSE-NEXT: andps %xmm10, %xmm0 8618; SSE-NEXT: por %xmm0, %xmm1 8619; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8620; SSE-NEXT: movdqa %xmm8, %xmm0 8621; SSE-NEXT: pandn %xmm15, %xmm0 8622; SSE-NEXT: pand %xmm8, %xmm4 8623; SSE-NEXT: por %xmm0, %xmm4 8624; SSE-NEXT: movdqa %xmm4, %xmm0 8625; SSE-NEXT: pxor %xmm1, %xmm1 8626; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 8627; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] 8628; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] 8629; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,1,2,3] 8630; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] 8631; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] 8632; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 8633; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 8634; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 8635; SSE-NEXT: pandn %xmm0, %xmm4 8636; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 8637; SSE-NEXT: por %xmm4, %xmm3 8638; SSE-NEXT: packuswb %xmm0, %xmm3 8639; SSE-NEXT: packuswb %xmm2, %xmm2 8640; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,3,3] 8641; SSE-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] 8642; SSE-NEXT: movdqa %xmm12, %xmm2 8643; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 8644; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 8645; SSE-NEXT: movdqa %xmm15, %xmm3 8646; SSE-NEXT: pand %xmm12, %xmm3 8647; SSE-NEXT: por %xmm2, %xmm3 8648; SSE-NEXT: movdqa %xmm3, %xmm2 8649; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 8650; SSE-NEXT: movdqa %xmm5, %xmm4 8651; SSE-NEXT: pandn %xmm2, %xmm4 8652; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] 8653; SSE-NEXT: pand %xmm5, %xmm3 8654; SSE-NEXT: por %xmm4, %xmm3 8655; SSE-NEXT: pshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 8656; SSE-NEXT: # xmm2 = mem[0,1,2,1] 8657; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] 8658; SSE-NEXT: packuswb %xmm2, %xmm2 8659; SSE-NEXT: movdqa %xmm7, %xmm4 8660; SSE-NEXT: pandn %xmm2, %xmm4 8661; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,3] 8662; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] 8663; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] 8664; SSE-NEXT: packuswb %xmm2, %xmm2 8665; SSE-NEXT: pand %xmm7, %xmm2 8666; SSE-NEXT: por %xmm2, %xmm4 8667; SSE-NEXT: movdqa %xmm10, %xmm1 8668; SSE-NEXT: pandn %xmm4, %xmm1 8669; SSE-NEXT: andps %xmm10, %xmm0 8670; SSE-NEXT: por %xmm0, %xmm1 8671; SSE-NEXT: movdqa 
%xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8672; SSE-NEXT: movdqa %xmm8, %xmm0 8673; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 8674; SSE-NEXT: pand %xmm8, %xmm9 8675; SSE-NEXT: por %xmm0, %xmm9 8676; SSE-NEXT: movdqa %xmm9, %xmm0 8677; SSE-NEXT: pxor %xmm1, %xmm1 8678; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 8679; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] 8680; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3],xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] 8681; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,1,2,3] 8682; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] 8683; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] 8684; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 8685; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 8686; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 8687; SSE-NEXT: pandn %xmm0, %xmm4 8688; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 8689; SSE-NEXT: por %xmm4, %xmm2 8690; SSE-NEXT: packuswb %xmm0, %xmm2 8691; SSE-NEXT: packuswb %xmm3, %xmm3 8692; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,3,3] 8693; SSE-NEXT: movss {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3] 8694; SSE-NEXT: movdqa %xmm12, %xmm3 8695; SSE-NEXT: pandn %xmm11, %xmm3 8696; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 8697; SSE-NEXT: movdqa %xmm2, %xmm4 8698; SSE-NEXT: pand %xmm12, %xmm4 8699; SSE-NEXT: por %xmm3, %xmm4 8700; SSE-NEXT: movdqa %xmm4, %xmm3 8701; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] 8702; SSE-NEXT: movdqa %xmm5, %xmm6 8703; SSE-NEXT: pandn %xmm3, %xmm6 8704; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] 8705; SSE-NEXT: pand %xmm5, %xmm4 8706; SSE-NEXT: por %xmm6, %xmm4 8707; SSE-NEXT: pshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 8708; SSE-NEXT: # xmm3 = mem[0,1,2,1] 8709; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] 8710; SSE-NEXT: packuswb %xmm3, %xmm3 8711; SSE-NEXT: movdqa %xmm7, %xmm6 8712; SSE-NEXT: pandn %xmm3, %xmm6 8713; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,1,0,3] 8714; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7] 8715; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] 8716; SSE-NEXT: packuswb %xmm3, %xmm3 8717; SSE-NEXT: pand %xmm7, %xmm3 8718; SSE-NEXT: por %xmm3, %xmm6 8719; SSE-NEXT: movdqa %xmm10, %xmm1 8720; SSE-NEXT: pandn %xmm6, %xmm1 8721; SSE-NEXT: andps %xmm10, %xmm0 8722; SSE-NEXT: por %xmm0, %xmm1 8723; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8724; SSE-NEXT: movdqa %xmm8, %xmm0 8725; SSE-NEXT: pandn %xmm13, %xmm0 8726; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 8727; SSE-NEXT: pand %xmm8, %xmm4 8728; SSE-NEXT: por %xmm0, %xmm4 8729; SSE-NEXT: movdqa %xmm4, %xmm0 8730; SSE-NEXT: pxor %xmm1, %xmm1 8731; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = 
xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 8732; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] 8733; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] 8734; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] 8735; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] 8736; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] 8737; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 8738; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 8739; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 8740; SSE-NEXT: pandn %xmm0, %xmm6 8741; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 8742; SSE-NEXT: por %xmm6, %xmm3 8743; SSE-NEXT: packuswb %xmm0, %xmm3 8744; SSE-NEXT: packuswb %xmm4, %xmm4 8745; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,3,3] 8746; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] 8747; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 8748; SSE-NEXT: movdqa %xmm12, %xmm3 8749; SSE-NEXT: pand %xmm12, %xmm4 8750; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 8751; SSE-NEXT: por %xmm4, %xmm3 8752; SSE-NEXT: movdqa %xmm3, %xmm4 8753; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] 8754; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] 8755; SSE-NEXT: pxor %xmm12, %xmm12 8756; SSE-NEXT: pand %xmm5, %xmm3 8757; SSE-NEXT: pandn %xmm4, %xmm5 8758; SSE-NEXT: por %xmm3, %xmm5 8759; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,1,0,3] 8760; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7] 8761; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] 8762; SSE-NEXT: packuswb %xmm4, %xmm4 8763; SSE-NEXT: pand %xmm7, %xmm4 8764; SSE-NEXT: pshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 8765; SSE-NEXT: # xmm5 = mem[0,1,2,1] 8766; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] 8767; SSE-NEXT: packuswb %xmm5, %xmm5 8768; SSE-NEXT: pandn %xmm5, %xmm7 8769; SSE-NEXT: por %xmm4, %xmm7 8770; SSE-NEXT: movdqa %xmm10, %xmm3 8771; SSE-NEXT: pandn %xmm7, %xmm3 8772; SSE-NEXT: andps %xmm10, %xmm0 8773; SSE-NEXT: por %xmm0, %xmm3 8774; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8775; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 8776; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,0,65535,65535,0,65535] 8777; SSE-NEXT: pand %xmm13, %xmm4 8778; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 8779; SSE-NEXT: movdqa %xmm4, %xmm6 8780; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm12[8],xmm6[9],xmm12[9],xmm6[10],xmm12[10],xmm6[11],xmm12[11],xmm6[12],xmm12[12],xmm6[13],xmm12[13],xmm6[14],xmm12[14],xmm6[15],xmm12[15] 8781; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,65535,0] 8782; SSE-NEXT: movdqa %xmm0, %xmm7 8783; SSE-NEXT: pandn %xmm6, %xmm7 8784; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = 
xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] 8785; SSE-NEXT: pand %xmm0, %xmm4 8786; SSE-NEXT: por %xmm7, %xmm4 8787; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] 8788; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,5,4,7,6] 8789; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 8790; SSE-NEXT: packuswb %xmm6, %xmm7 8791; SSE-NEXT: movdqa %xmm13, %xmm3 8792; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 8793; SSE-NEXT: pandn %xmm1, %xmm3 8794; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8795; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,3,2,3] 8796; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload 8797; SSE-NEXT: # xmm6 = mem[0,2,2,3] 8798; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] 8799; SSE-NEXT: movdqa %xmm6, %xmm4 8800; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm12[8],xmm4[9],xmm12[9],xmm4[10],xmm12[10],xmm4[11],xmm12[11],xmm4[12],xmm12[12],xmm4[13],xmm12[13],xmm4[14],xmm12[14],xmm4[15],xmm12[15] 8801; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] 8802; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] 8803; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] 8804; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] 8805; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] 8806; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] 8807; SSE-NEXT: packuswb %xmm6, %xmm6 8808; SSE-NEXT: movss {{.*#+}} xmm7 = xmm6[0],xmm7[1,2,3] 8809; SSE-NEXT: movdqa %xmm8, %xmm1 8810; SSE-NEXT: movdqa %xmm8, %xmm4 8811; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 8812; SSE-NEXT: movdqa %xmm14, %xmm8 8813; SSE-NEXT: pand %xmm1, %xmm8 8814; SSE-NEXT: movdqa %xmm1, %xmm14 8815; SSE-NEXT: por %xmm4, %xmm8 8816; SSE-NEXT: movdqa %xmm8, %xmm4 8817; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] 8818; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,0,65535,0,65535,65535,65535] 8819; SSE-NEXT: movdqa %xmm1, %xmm6 8820; SSE-NEXT: pandn %xmm4, %xmm6 8821; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm12[8],xmm8[9],xmm12[9],xmm8[10],xmm12[10],xmm8[11],xmm12[11],xmm8[12],xmm12[12],xmm8[13],xmm12[13],xmm8[14],xmm12[14],xmm8[15],xmm12[15] 8822; SSE-NEXT: pand %xmm1, %xmm8 8823; SSE-NEXT: por %xmm6, %xmm8 8824; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 8825; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 8826; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,0,3] 8827; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,7,6] 8828; SSE-NEXT: packuswb %xmm4, %xmm4 8829; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] 8830; SSE-NEXT: movdqa %xmm6, %xmm9 8831; SSE-NEXT: pandn %xmm4, %xmm9 8832; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,1,0,3] 8833; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,3,2,4,5,6,7] 8834; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] 8835; SSE-NEXT: packuswb %xmm4, %xmm4 8836; SSE-NEXT: pand %xmm6, %xmm4 8837; SSE-NEXT: por %xmm4, %xmm9 8838; SSE-NEXT: movdqa %xmm10, %xmm3 8839; SSE-NEXT: pandn %xmm9, %xmm3 8840; SSE-NEXT: andps %xmm10, %xmm7 8841; 
SSE-NEXT: movdqa %xmm10, %xmm5 8842; SSE-NEXT: por %xmm7, %xmm3 8843; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8844; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 8845; SSE-NEXT: movdqa %xmm13, %xmm10 8846; SSE-NEXT: pand %xmm13, %xmm7 8847; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 8848; SSE-NEXT: movdqa %xmm7, %xmm8 8849; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm12[8],xmm8[9],xmm12[9],xmm8[10],xmm12[10],xmm8[11],xmm12[11],xmm8[12],xmm12[12],xmm8[13],xmm12[13],xmm8[14],xmm12[14],xmm8[15],xmm12[15] 8850; SSE-NEXT: movdqa %xmm0, %xmm9 8851; SSE-NEXT: pandn %xmm8, %xmm9 8852; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3],xmm7[4],xmm12[4],xmm7[5],xmm12[5],xmm7[6],xmm12[6],xmm7[7],xmm12[7] 8853; SSE-NEXT: pand %xmm0, %xmm7 8854; SSE-NEXT: por %xmm9, %xmm7 8855; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] 8856; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm7[0,1,2,3,5,4,7,6] 8857; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 8858; SSE-NEXT: packuswb %xmm8, %xmm9 8859; SSE-NEXT: movdqa %xmm13, %xmm4 8860; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 8861; SSE-NEXT: pandn %xmm3, %xmm4 8862; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8863; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,3,2,3] 8864; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 8865; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,2,2,3] 8866; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] 8867; SSE-NEXT: movdqa %xmm8, %xmm7 8868; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm12[8],xmm7[9],xmm12[9],xmm7[10],xmm12[10],xmm7[11],xmm12[11],xmm7[12],xmm12[12],xmm7[13],xmm12[13],xmm7[14],xmm12[14],xmm7[15],xmm12[15] 8869; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] 8870; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,2,2,3,4,5,6,7] 8871; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] 8872; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] 8873; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] 8874; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] 8875; SSE-NEXT: packuswb %xmm8, %xmm8 8876; SSE-NEXT: movss {{.*#+}} xmm9 = xmm8[0],xmm9[1,2,3] 8877; SSE-NEXT: movdqa %xmm14, %xmm7 8878; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 8879; SSE-NEXT: movdqa %xmm15, %xmm8 8880; SSE-NEXT: pand %xmm14, %xmm8 8881; SSE-NEXT: por %xmm7, %xmm8 8882; SSE-NEXT: movdqa %xmm8, %xmm7 8883; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3],xmm7[4],xmm12[4],xmm7[5],xmm12[5],xmm7[6],xmm12[6],xmm7[7],xmm12[7] 8884; SSE-NEXT: movdqa %xmm1, %xmm13 8885; SSE-NEXT: pandn %xmm7, %xmm13 8886; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm12[8],xmm8[9],xmm12[9],xmm8[10],xmm12[10],xmm8[11],xmm12[11],xmm8[12],xmm12[12],xmm8[13],xmm12[13],xmm8[14],xmm12[14],xmm8[15],xmm12[15] 8887; SSE-NEXT: pand %xmm1, %xmm8 8888; SSE-NEXT: por %xmm13, %xmm8 8889; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 8890; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 8891; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,1,0,3] 8892; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,7,6] 8893; SSE-NEXT: packuswb %xmm7, %xmm7 8894; SSE-NEXT: 
movdqa %xmm6, %xmm13 8895; SSE-NEXT: pandn %xmm7, %xmm13 8896; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,1,0,3] 8897; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,0,3,2,4,5,6,7] 8898; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] 8899; SSE-NEXT: packuswb %xmm7, %xmm7 8900; SSE-NEXT: pand %xmm6, %xmm7 8901; SSE-NEXT: por %xmm7, %xmm13 8902; SSE-NEXT: movdqa %xmm5, %xmm7 8903; SSE-NEXT: pandn %xmm13, %xmm7 8904; SSE-NEXT: andps %xmm5, %xmm9 8905; SSE-NEXT: por %xmm9, %xmm7 8906; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 8907; SSE-NEXT: pand %xmm10, %xmm8 8908; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload 8909; SSE-NEXT: movdqa %xmm8, %xmm9 8910; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15] 8911; SSE-NEXT: movdqa %xmm0, %xmm13 8912; SSE-NEXT: pandn %xmm9, %xmm13 8913; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] 8914; SSE-NEXT: pand %xmm0, %xmm8 8915; SSE-NEXT: por %xmm13, %xmm8 8916; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] 8917; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm8[0,1,2,3,5,4,7,6] 8918; SSE-NEXT: psrldq {{.*#+}} xmm9 = xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 8919; SSE-NEXT: packuswb %xmm9, %xmm15 8920; SSE-NEXT: movdqa %xmm10, %xmm13 8921; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 8922; SSE-NEXT: pandn %xmm3, %xmm13 8923; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,3,2,3] 8924; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 8925; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[0,2,2,3] 8926; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] 8927; SSE-NEXT: movdqa %xmm9, %xmm8 8928; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm12[8],xmm8[9],xmm12[9],xmm8[10],xmm12[10],xmm8[11],xmm12[11],xmm8[12],xmm12[12],xmm8[13],xmm12[13],xmm8[14],xmm12[14],xmm8[15],xmm12[15] 8929; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] 8930; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7] 8931; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3],xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7] 8932; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] 8933; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] 8934; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] 8935; SSE-NEXT: packuswb %xmm9, %xmm9 8936; SSE-NEXT: movss {{.*#+}} xmm15 = xmm9[0],xmm15[1,2,3] 8937; SSE-NEXT: movdqa %xmm14, %xmm8 8938; SSE-NEXT: pandn %xmm11, %xmm8 8939; SSE-NEXT: movdqa %xmm2, %xmm9 8940; SSE-NEXT: pand %xmm14, %xmm9 8941; SSE-NEXT: por %xmm8, %xmm9 8942; SSE-NEXT: movdqa %xmm9, %xmm8 8943; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] 8944; SSE-NEXT: movdqa %xmm1, %xmm11 8945; SSE-NEXT: pandn %xmm8, %xmm11 8946; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15] 8947; SSE-NEXT: pand %xmm1, %xmm9 8948; SSE-NEXT: por %xmm11, %xmm9 8949; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 8950; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm2 # 16-byte Folded Reload 8951; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,1,0,3] 8952; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,7,6] 8953; SSE-NEXT: packuswb %xmm8, %xmm8 8954; SSE-NEXT: movdqa %xmm6, %xmm11 8955; SSE-NEXT: pandn %xmm8, %xmm11 8956; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[2,1,0,3] 8957; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,0,3,2,4,5,6,7] 8958; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] 8959; SSE-NEXT: packuswb %xmm8, %xmm8 8960; SSE-NEXT: pand %xmm6, %xmm8 8961; SSE-NEXT: por %xmm8, %xmm11 8962; SSE-NEXT: movdqa %xmm5, %xmm9 8963; SSE-NEXT: pandn %xmm11, %xmm9 8964; SSE-NEXT: andps %xmm5, %xmm15 8965; SSE-NEXT: por %xmm15, %xmm9 8966; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 8967; SSE-NEXT: pand %xmm10, %xmm8 8968; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload 8969; SSE-NEXT: movdqa %xmm8, %xmm11 8970; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] 8971; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] 8972; SSE-NEXT: pand %xmm0, %xmm8 8973; SSE-NEXT: pandn %xmm11, %xmm0 8974; SSE-NEXT: por %xmm8, %xmm0 8975; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] 8976; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] 8977; SSE-NEXT: psrldq {{.*#+}} xmm11 = xmm11[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 8978; SSE-NEXT: packuswb %xmm11, %xmm0 8979; SSE-NEXT: movdqa %xmm10, %xmm2 8980; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 8981; SSE-NEXT: pand %xmm10, %xmm15 8982; SSE-NEXT: pand %xmm10, %xmm4 8983; SSE-NEXT: pand %xmm10, %xmm3 8984; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8985; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 8986; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,2,2,3] 8987; SSE-NEXT: pand %xmm10, %xmm3 8988; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8989; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 8990; SSE-NEXT: pandn %xmm3, %xmm2 8991; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[1,3,2,3] 8992; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1] 8993; SSE-NEXT: movdqa %xmm8, %xmm11 8994; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] 8995; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] 8996; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7] 8997; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] 8998; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] 8999; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] 9000; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3] 9001; SSE-NEXT: packuswb %xmm8, %xmm8 9002; SSE-NEXT: movss {{.*#+}} xmm0 = xmm8[0],xmm0[1,2,3] 9003; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 9004; SSE-NEXT: movdqa %xmm14, %xmm3 9005; SSE-NEXT: pand %xmm14, %xmm8 9006; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 9007; SSE-NEXT: por %xmm8, %xmm3 9008; SSE-NEXT: movdqa %xmm3, %xmm8 
9009; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] 9010; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm12[8],xmm3[9],xmm12[9],xmm3[10],xmm12[10],xmm3[11],xmm12[11],xmm3[12],xmm12[12],xmm3[13],xmm12[13],xmm3[14],xmm12[14],xmm3[15],xmm12[15] 9011; SSE-NEXT: pand %xmm1, %xmm3 9012; SSE-NEXT: pandn %xmm8, %xmm1 9013; SSE-NEXT: por %xmm3, %xmm1 9014; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 9015; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 9016; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,1,0,3] 9017; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,7,6] 9018; SSE-NEXT: packuswb %xmm8, %xmm8 9019; SSE-NEXT: movdqa %xmm6, %xmm14 9020; SSE-NEXT: pandn %xmm8, %xmm14 9021; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] 9022; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,3,2,4,5,6,7] 9023; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] 9024; SSE-NEXT: packuswb %xmm1, %xmm1 9025; SSE-NEXT: pand %xmm6, %xmm1 9026; SSE-NEXT: por %xmm1, %xmm14 9027; SSE-NEXT: movdqa %xmm5, %xmm11 9028; SSE-NEXT: pandn %xmm14, %xmm11 9029; SSE-NEXT: andps %xmm5, %xmm0 9030; SSE-NEXT: por %xmm0, %xmm11 9031; SSE-NEXT: movdqa %xmm15, %xmm1 9032; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 9033; SSE-NEXT: movdqa %xmm1, %xmm0 9034; SSE-NEXT: pxor %xmm3, %xmm3 9035; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] 9036; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] 9037; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] 9038; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] 9039; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7] 9040; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 9041; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9042; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] 9043; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 9044; SSE-NEXT: pandn %xmm1, %xmm10 9045; SSE-NEXT: movdqa %xmm1, %xmm8 9046; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9047; SSE-NEXT: por %xmm10, %xmm1 9048; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] 9049; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] 9050; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] 9051; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] 9052; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[3,3,3,3] 9053; SSE-NEXT: packuswb %xmm8, %xmm1 9054; SSE-NEXT: packuswb %xmm0, %xmm0 9055; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 9056; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9057; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9058; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] 9059; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] 9060; SSE-NEXT: packuswb %xmm0, %xmm0 9061; SSE-NEXT: movdqa %xmm6, %xmm8 9062; SSE-NEXT: pandn %xmm0, %xmm8 9063; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 9064; SSE-NEXT: # xmm0 = mem[1,3,2,3] 9065; SSE-NEXT: pshufd $232, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload 9066; SSE-NEXT: # xmm14 = mem[0,2,2,3] 9067; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] 9068; SSE-NEXT: movdqa %xmm14, %xmm0 9069; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] 9070; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,65535,0,65535,65535] 9071; SSE-NEXT: movdqa %xmm10, %xmm15 9072; SSE-NEXT: pandn %xmm0, %xmm15 9073; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm3[8],xmm14[9],xmm3[9],xmm14[10],xmm3[10],xmm14[11],xmm3[11],xmm14[12],xmm3[12],xmm14[13],xmm3[13],xmm14[14],xmm3[14],xmm14[15],xmm3[15] 9074; SSE-NEXT: pand %xmm10, %xmm14 9075; SSE-NEXT: por %xmm15, %xmm14 9076; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,1,1,1] 9077; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] 9078; SSE-NEXT: packuswb %xmm0, %xmm14 9079; SSE-NEXT: pand %xmm6, %xmm14 9080; SSE-NEXT: por %xmm8, %xmm14 9081; SSE-NEXT: movdqa %xmm5, %xmm3 9082; SSE-NEXT: pandn %xmm14, %xmm3 9083; SSE-NEXT: andps %xmm5, %xmm1 9084; SSE-NEXT: por %xmm1, %xmm3 9085; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 9086; SSE-NEXT: movdqa %xmm4, %xmm1 9087; SSE-NEXT: pxor %xmm0, %xmm0 9088; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 9089; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] 9090; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] 9091; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,1,2,1] 9092; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7] 9093; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] 9094; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 9095; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] 9096; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 9097; SSE-NEXT: pandn %xmm4, %xmm12 9098; SSE-NEXT: movdqa (%rsp), %xmm8 # 16-byte Reload 9099; SSE-NEXT: por %xmm12, %xmm8 9100; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,1,3] 9101; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,7] 9102; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0] 9103; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,5] 9104; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm4[3,3,3,3] 9105; SSE-NEXT: packuswb %xmm12, %xmm8 9106; SSE-NEXT: packuswb %xmm1, %xmm1 9107; SSE-NEXT: movss {{.*#+}} xmm8 = xmm1[0],xmm8[1,2,3] 9108; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9109; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 9110; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] 9111; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] 9112; SSE-NEXT: packuswb %xmm1, %xmm1 9113; SSE-NEXT: movdqa %xmm6, %xmm12 9114; SSE-NEXT: pandn %xmm1, %xmm12 9115; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 9116; SSE-NEXT: # xmm1 = mem[1,3,2,3] 9117; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload 9118; SSE-NEXT: # xmm14 = mem[0,2,2,3] 9119; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] 9120; SSE-NEXT: movdqa %xmm14, %xmm1 9121; 
SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 9122; SSE-NEXT: movdqa %xmm10, %xmm15 9123; SSE-NEXT: pandn %xmm1, %xmm15 9124; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm0[8],xmm14[9],xmm0[9],xmm14[10],xmm0[10],xmm14[11],xmm0[11],xmm14[12],xmm0[12],xmm14[13],xmm0[13],xmm14[14],xmm0[14],xmm14[15],xmm0[15] 9125; SSE-NEXT: pand %xmm10, %xmm14 9126; SSE-NEXT: por %xmm15, %xmm14 9127; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,1,1,1] 9128; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] 9129; SSE-NEXT: packuswb %xmm1, %xmm1 9130; SSE-NEXT: pand %xmm6, %xmm1 9131; SSE-NEXT: por %xmm12, %xmm1 9132; SSE-NEXT: movdqa %xmm5, %xmm12 9133; SSE-NEXT: pandn %xmm1, %xmm12 9134; SSE-NEXT: andps %xmm5, %xmm8 9135; SSE-NEXT: movdqa %xmm5, %xmm4 9136; SSE-NEXT: por %xmm8, %xmm12 9137; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9138; SSE-NEXT: por %xmm13, %xmm0 9139; SSE-NEXT: movdqa %xmm0, %xmm1 9140; SSE-NEXT: pxor %xmm13, %xmm13 9141; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3],xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] 9142; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] 9143; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15] 9144; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,2,1] 9145; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7] 9146; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] 9147; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 9148; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm13[8],xmm5[9],xmm13[9],xmm5[10],xmm13[10],xmm5[11],xmm13[11],xmm5[12],xmm13[12],xmm5[13],xmm13[13],xmm5[14],xmm13[14],xmm5[15],xmm13[15] 9149; SSE-NEXT: pxor %xmm0, %xmm0 9150; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 9151; SSE-NEXT: pandn %xmm5, %xmm13 9152; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 9153; SSE-NEXT: por %xmm13, %xmm8 9154; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,1,3] 9155; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,7] 9156; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0] 9157; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,5] 9158; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm5[3,3,3,3] 9159; SSE-NEXT: packuswb %xmm13, %xmm8 9160; SSE-NEXT: packuswb %xmm1, %xmm1 9161; SSE-NEXT: movss {{.*#+}} xmm8 = xmm1[0],xmm8[1,2,3] 9162; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9163; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 9164; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] 9165; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] 9166; SSE-NEXT: packuswb %xmm1, %xmm1 9167; SSE-NEXT: movdqa %xmm6, %xmm13 9168; SSE-NEXT: pandn %xmm1, %xmm13 9169; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 9170; SSE-NEXT: # xmm1 = mem[1,3,2,3] 9171; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload 9172; SSE-NEXT: # xmm14 = mem[0,2,2,3] 9173; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] 9174; SSE-NEXT: movdqa %xmm14, %xmm1 9175; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 9176; SSE-NEXT: movdqa %xmm10, %xmm15 9177; SSE-NEXT: pandn %xmm1, %xmm15 9178; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm0[8],xmm14[9],xmm0[9],xmm14[10],xmm0[10],xmm14[11],xmm0[11],xmm14[12],xmm0[12],xmm14[13],xmm0[13],xmm14[14],xmm0[14],xmm14[15],xmm0[15] 9179; SSE-NEXT: pand %xmm10, %xmm14 9180; SSE-NEXT: por %xmm15, %xmm14 9181; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,1,1,1] 9182; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] 9183; SSE-NEXT: packuswb %xmm1, %xmm1 9184; SSE-NEXT: pand %xmm6, %xmm1 9185; SSE-NEXT: por %xmm13, %xmm1 9186; SSE-NEXT: movdqa %xmm4, %xmm0 9187; SSE-NEXT: movdqa %xmm4, %xmm13 9188; SSE-NEXT: pandn %xmm1, %xmm13 9189; SSE-NEXT: andps %xmm4, %xmm8 9190; SSE-NEXT: por %xmm8, %xmm13 9191; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 9192; SSE-NEXT: movdqa %xmm2, %xmm1 9193; SSE-NEXT: pxor %xmm14, %xmm14 9194; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] 9195; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] 9196; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm14[8],xmm2[9],xmm14[9],xmm2[10],xmm14[10],xmm2[11],xmm14[11],xmm2[12],xmm14[12],xmm2[13],xmm14[13],xmm2[14],xmm14[14],xmm2[15],xmm14[15] 9197; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,1,2,1] 9198; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7] 9199; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] 9200; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 9201; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm14[8],xmm2[9],xmm14[9],xmm2[10],xmm14[10],xmm2[11],xmm14[11],xmm2[12],xmm14[12],xmm2[13],xmm14[13],xmm2[14],xmm14[14],xmm2[15],xmm14[15] 9202; SSE-NEXT: pxor %xmm15, %xmm15 9203; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 9204; SSE-NEXT: pandn %xmm2, %xmm5 9205; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 9206; SSE-NEXT: por %xmm5, %xmm4 9207; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,2,1,3] 9208; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,7] 9209; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0] 9210; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm8[0,1,2,3,4,7,6,5] 9211; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[3,3,3,3] 9212; SSE-NEXT: packuswb %xmm8, %xmm14 9213; SSE-NEXT: packuswb %xmm1, %xmm1 9214; SSE-NEXT: movss {{.*#+}} xmm14 = xmm1[0],xmm14[1,2,3] 9215; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 9216; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 9217; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 9218; SSE-NEXT: # xmm1 = mem[1,3,2,3] 9219; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload 9220; SSE-NEXT: # xmm8 = mem[0,2,2,3] 9221; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] 9222; SSE-NEXT: movdqa %xmm8, %xmm1 9223; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] 9224; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm15[8],xmm8[9],xmm15[9],xmm8[10],xmm15[10],xmm8[11],xmm15[11],xmm8[12],xmm15[12],xmm8[13],xmm15[13],xmm8[14],xmm15[14],xmm8[15],xmm15[15] 9225; SSE-NEXT: pand %xmm10, %xmm8 9226; SSE-NEXT: pandn %xmm1, %xmm10 9227; SSE-NEXT: 
por %xmm8, %xmm10 9228; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,1,1,1] 9229; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] 9230; SSE-NEXT: packuswb %xmm1, %xmm1 9231; SSE-NEXT: pand %xmm6, %xmm1 9232; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,1,0,3] 9233; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,7] 9234; SSE-NEXT: packuswb %xmm8, %xmm8 9235; SSE-NEXT: pandn %xmm8, %xmm6 9236; SSE-NEXT: por %xmm6, %xmm1 9237; SSE-NEXT: andps %xmm0, %xmm14 9238; SSE-NEXT: pandn %xmm1, %xmm0 9239; SSE-NEXT: por %xmm14, %xmm0 9240; SSE-NEXT: movdqa %xmm0, %xmm1 9241; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9242; SSE-NEXT: movaps %xmm0, (%rsi) 9243; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9244; SSE-NEXT: movaps %xmm0, 48(%rsi) 9245; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9246; SSE-NEXT: movaps %xmm0, 32(%rsi) 9247; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9248; SSE-NEXT: movaps %xmm0, 16(%rsi) 9249; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9250; SSE-NEXT: movaps %xmm0, (%rdx) 9251; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9252; SSE-NEXT: movaps %xmm0, 48(%rdx) 9253; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9254; SSE-NEXT: movaps %xmm0, 32(%rdx) 9255; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9256; SSE-NEXT: movaps %xmm0, 16(%rdx) 9257; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9258; SSE-NEXT: movaps %xmm0, (%rcx) 9259; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9260; SSE-NEXT: movaps %xmm0, 48(%rcx) 9261; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9262; SSE-NEXT: movaps %xmm0, 32(%rcx) 9263; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9264; SSE-NEXT: movaps %xmm0, 16(%rcx) 9265; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9266; SSE-NEXT: movaps %xmm0, (%r8) 9267; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9268; SSE-NEXT: movaps %xmm0, 48(%r8) 9269; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9270; SSE-NEXT: movaps %xmm0, 32(%r8) 9271; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9272; SSE-NEXT: movaps %xmm0, 16(%r8) 9273; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9274; SSE-NEXT: movaps %xmm0, (%r9) 9275; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9276; SSE-NEXT: movaps %xmm0, 48(%r9) 9277; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9278; SSE-NEXT: movaps %xmm0, 32(%r9) 9279; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9280; SSE-NEXT: movaps %xmm0, 16(%r9) 9281; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 9282; SSE-NEXT: movdqa %xmm11, (%rax) 9283; SSE-NEXT: movdqa %xmm9, 48(%rax) 9284; SSE-NEXT: movdqa %xmm7, 32(%rax) 9285; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9286; SSE-NEXT: movaps %xmm0, 16(%rax) 9287; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 9288; SSE-NEXT: movdqa %xmm1, (%rax) 9289; SSE-NEXT: movdqa %xmm13, 48(%rax) 9290; SSE-NEXT: movdqa %xmm12, 32(%rax) 9291; SSE-NEXT: movdqa %xmm3, 16(%rax) 9292; SSE-NEXT: addq $1528, %rsp # imm = 0x5F8 9293; SSE-NEXT: retq 9294; 9295; AVX-LABEL: load_i8_stride7_vf64: 9296; AVX: # %bb.0: 9297; AVX-NEXT: subq $744, %rsp # imm = 0x2E8 9298; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [128,128,6,13,0,0,0,128,128,128,6,13,0,0,0,128] 9299; AVX-NEXT: # xmm0 = mem[0,0] 9300; 
AVX-NEXT: vmovdqa 16(%rdi), %xmm3 9301; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9302; AVX-NEXT: vmovdqa 176(%rdi), %xmm10 9303; AVX-NEXT: vpshufb %xmm0, %xmm10, %xmm1 9304; AVX-NEXT: vmovdqa 48(%rdi), %xmm8 9305; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9306; AVX-NEXT: vmovq {{.*#+}} xmm2 = [128,128,128,5,12,0,0,0,0,0,0,0,0,0,0,0] 9307; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm4 9308; AVX-NEXT: vmovq {{.*#+}} xmm3 = [0,7,14,128,128,0,0,0,0,0,0,0,0,0,0,0] 9309; AVX-NEXT: vmovdqa (%rdi), %xmm5 9310; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9311; AVX-NEXT: vpshufb %xmm3, %xmm5, %xmm5 9312; AVX-NEXT: vpor %xmm4, %xmm5, %xmm7 9313; AVX-NEXT: vmovddup {{.*#+}} xmm4 = [128,128,0,0,0,3,10,128,128,128,0,0,0,3,10,128] 9314; AVX-NEXT: # xmm4 = mem[0,0] 9315; AVX-NEXT: vmovdqa 32(%rdi), %xmm5 9316; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9317; AVX-NEXT: vpshufb %xmm4, %xmm5, %xmm6 9318; AVX-NEXT: vmovddup {{.*#+}} xmm5 = [8,15,0,0,0,128,128,1,8,15,0,0,0,128,128,1] 9319; AVX-NEXT: # xmm5 = mem[0,0] 9320; AVX-NEXT: vpshufb %xmm5, %xmm8, %xmm8 9321; AVX-NEXT: vpor %xmm6, %xmm8, %xmm8 9322; AVX-NEXT: vmovq {{.*#+}} xmm6 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] 9323; AVX-NEXT: vpblendvb %xmm6, %xmm7, %xmm8, %xmm7 9324; AVX-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9325; AVX-NEXT: vmovdqa 240(%rdi), %xmm7 9326; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9327; AVX-NEXT: vpshufb %xmm2, %xmm7, %xmm2 9328; AVX-NEXT: vmovdqa 224(%rdi), %xmm7 9329; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9330; AVX-NEXT: vpshufb %xmm3, %xmm7, %xmm3 9331; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 9332; AVX-NEXT: vmovdqa 256(%rdi), %xmm3 9333; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9334; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm3 9335; AVX-NEXT: vmovdqa 272(%rdi), %xmm4 9336; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9337; AVX-NEXT: vpshufb %xmm5, %xmm4, %xmm4 9338; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 9339; AVX-NEXT: vmovddup {{.*#+}} xmm4 = [8,15,128,128,0,0,0,1,8,15,128,128,0,0,0,1] 9340; AVX-NEXT: # xmm4 = mem[0,0] 9341; AVX-NEXT: vpblendvb %xmm6, %xmm2, %xmm3, %xmm2 9342; AVX-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9343; AVX-NEXT: vmovdqa 160(%rdi), %xmm7 9344; AVX-NEXT: vpshufb %xmm4, %xmm7, %xmm2 9345; AVX-NEXT: vpor %xmm1, %xmm2, %xmm2 9346; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [128,3,10,128,128,3,10,128,128,3,10,128,128,3,10,128] 9347; AVX-NEXT: vmovdqa 144(%rdi), %xmm12 9348; AVX-NEXT: vpshufb %xmm3, %xmm12, %xmm1 9349; AVX-NEXT: vbroadcastss {{.*#+}} xmm5 = [12,128,128,5,12,128,128,5,12,128,128,5,12,128,128,5] 9350; AVX-NEXT: vmovdqa 128(%rdi), %xmm14 9351; AVX-NEXT: vpshufb %xmm5, %xmm14, %xmm9 9352; AVX-NEXT: vpor %xmm1, %xmm9, %xmm9 9353; AVX-NEXT: vmovdqa {{.*#+}} xmm15 = [u,u,u,0,0,0,0,255,255,255,255,255,u,u,u,u] 9354; AVX-NEXT: vpblendvb %xmm15, %xmm2, %xmm9, %xmm2 9355; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9356; AVX-NEXT: vmovdqa 400(%rdi), %xmm9 9357; AVX-NEXT: vpshufb %xmm0, %xmm9, %xmm0 9358; AVX-NEXT: vmovdqa 384(%rdi), %xmm6 9359; AVX-NEXT: vpshufb %xmm4, %xmm6, %xmm2 9360; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 9361; AVX-NEXT: vmovdqa 368(%rdi), %xmm8 9362; AVX-NEXT: vpshufb %xmm3, %xmm8, %xmm2 9363; AVX-NEXT: vmovdqa 352(%rdi), %xmm11 9364; AVX-NEXT: vpshufb %xmm5, %xmm11, %xmm3 9365; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 9366; AVX-NEXT: vpblendvb %xmm15, 
%xmm0, %xmm2, %xmm0 9367; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9368; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [9,128,128,128,0,0,0,2,9,128,128,128,0,0,0,2] 9369; AVX-NEXT: # xmm0 = mem[0,0] 9370; AVX-NEXT: vpshufb %xmm0, %xmm7, %xmm3 9371; AVX-NEXT: vmovdqa %xmm7, %xmm1 9372; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9373; AVX-NEXT: vmovddup {{.*#+}} xmm2 = [128,0,7,14,0,0,0,128,128,0,7,14,0,0,0,128] 9374; AVX-NEXT: # xmm2 = mem[0,0] 9375; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9376; AVX-NEXT: vpshufb %xmm2, %xmm10, %xmm4 9377; AVX-NEXT: vpor %xmm3, %xmm4, %xmm5 9378; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [128,4,11,128,128,4,11,128,128,4,11,128,128,4,11,128] 9379; AVX-NEXT: vpshufb %xmm3, %xmm12, %xmm13 9380; AVX-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9381; AVX-NEXT: vbroadcastss {{.*#+}} xmm4 = [13,128,128,6,13,128,128,6,13,128,128,6,13,128,128,6] 9382; AVX-NEXT: vmovdqa %xmm14, %xmm7 9383; AVX-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9384; AVX-NEXT: vpshufb %xmm4, %xmm14, %xmm14 9385; AVX-NEXT: vpor %xmm13, %xmm14, %xmm13 9386; AVX-NEXT: vpblendvb %xmm15, %xmm5, %xmm13, %xmm5 9387; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9388; AVX-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill 9389; AVX-NEXT: vpshufb %xmm0, %xmm6, %xmm0 9390; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9391; AVX-NEXT: vpshufb %xmm2, %xmm9, %xmm2 9392; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 9393; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9394; AVX-NEXT: vpshufb %xmm3, %xmm8, %xmm2 9395; AVX-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9396; AVX-NEXT: vpshufb %xmm4, %xmm11, %xmm3 9397; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 9398; AVX-NEXT: vpblendvb %xmm15, %xmm0, %xmm2, %xmm0 9399; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9400; AVX-NEXT: vmovq {{.*#+}} xmm0 = [0,0,128,128,128,5,12,0,0,0,0,0,0,0,0,0] 9401; AVX-NEXT: vpshufb %xmm0, %xmm12, %xmm2 9402; AVX-NEXT: vmovq {{.*#+}} xmm12 = [0,0,0,7,14,128,128,0,0,0,0,0,0,0,0,0] 9403; AVX-NEXT: vpshufb %xmm12, %xmm7, %xmm3 9404; AVX-NEXT: vpor %xmm2, %xmm3, %xmm5 9405; AVX-NEXT: vmovddup {{.*#+}} xmm2 = [10,128,128,128,0,0,0,3,10,128,128,128,0,0,0,3] 9406; AVX-NEXT: # xmm2 = mem[0,0] 9407; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm4 9408; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [128,1,8,15,0,0,0,128,128,1,8,15,0,0,0,128] 9409; AVX-NEXT: # xmm3 = mem[0,0] 9410; AVX-NEXT: vpshufb %xmm3, %xmm10, %xmm13 9411; AVX-NEXT: vpor %xmm4, %xmm13, %xmm13 9412; AVX-NEXT: vmovq {{.*#+}} xmm4 = [0,0,255,255,255,255,255,0,0,0,0,0,0,0,0,0] 9413; AVX-NEXT: vpblendvb %xmm4, %xmm5, %xmm13, %xmm5 9414; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9415; AVX-NEXT: vpshufb %xmm0, %xmm8, %xmm0 9416; AVX-NEXT: vpshufb %xmm12, %xmm11, %xmm1 9417; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 9418; AVX-NEXT: vpshufb %xmm2, %xmm6, %xmm1 9419; AVX-NEXT: vpshufb %xmm3, %xmm9, %xmm2 9420; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 9421; AVX-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm0 9422; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9423; AVX-NEXT: vmovq {{.*#+}} xmm14 = [128,128,128,6,13,0,0,0,0,0,0,0,0,0,0,0] 9424; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 9425; AVX-NEXT: vpshufb %xmm14, %xmm11, %xmm0 9426; AVX-NEXT: vmovq {{.*#+}} xmm2 = [1,8,15,128,128,0,0,0,0,0,0,0,0,0,0,0] 9427; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 
9428; AVX-NEXT: vpshufb %xmm2, %xmm10, %xmm3 9429; AVX-NEXT: vpor %xmm0, %xmm3, %xmm3 9430; AVX-NEXT: vbroadcastss {{.*#+}} xmm4 = [9,128,128,2,9,128,128,2,9,128,128,2,9,128,128,2] 9431; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 9432; AVX-NEXT: vpshufb %xmm4, %xmm15, %xmm0 9433; AVX-NEXT: vbroadcastss {{.*#+}} xmm5 = [128,4,11,128,128,4,11,128,128,4,11,128,128,4,11,128] 9434; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 9435; AVX-NEXT: vpshufb %xmm5, %xmm7, %xmm13 9436; AVX-NEXT: vpor %xmm0, %xmm13, %xmm13 9437; AVX-NEXT: vmovq {{.*#+}} xmm12 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] 9438; AVX-NEXT: vpblendvb %xmm12, %xmm3, %xmm13, %xmm0 9439; AVX-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9440; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9441; AVX-NEXT: vpshufb %xmm14, %xmm1, %xmm0 9442; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 9443; AVX-NEXT: vpshufb %xmm2, %xmm13, %xmm2 9444; AVX-NEXT: vpor %xmm0, %xmm2, %xmm14 9445; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 9446; AVX-NEXT: vpshufb %xmm4, %xmm8, %xmm2 9447; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9448; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm3 9449; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 9450; AVX-NEXT: vpblendvb %xmm12, %xmm14, %xmm2, %xmm2 9451; AVX-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9452; AVX-NEXT: vmovq {{.*#+}} xmm9 = [2,9,128,128,128,0,0,0,0,0,0,0,0,0,0,0] 9453; AVX-NEXT: vpshufb %xmm9, %xmm10, %xmm2 9454; AVX-NEXT: vmovq {{.*#+}} xmm3 = [128,128,0,7,14,0,0,0,0,0,0,0,0,0,0,0] 9455; AVX-NEXT: vpshufb %xmm3, %xmm11, %xmm4 9456; AVX-NEXT: vpor %xmm2, %xmm4, %xmm2 9457; AVX-NEXT: vbroadcastss {{.*#+}} xmm4 = [10,128,128,3,10,128,128,3,10,128,128,3,10,128,128,3] 9458; AVX-NEXT: vpshufb %xmm4, %xmm15, %xmm5 9459; AVX-NEXT: vbroadcastss {{.*#+}} xmm6 = [128,5,12,128,128,5,12,128,128,5,12,128,128,5,12,128] 9460; AVX-NEXT: vpshufb %xmm6, %xmm7, %xmm14 9461; AVX-NEXT: vpor %xmm5, %xmm14, %xmm5 9462; AVX-NEXT: vmovdqa %xmm12, %xmm14 9463; AVX-NEXT: vpblendvb %xmm12, %xmm2, %xmm5, %xmm2 9464; AVX-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9465; AVX-NEXT: vpshufb %xmm9, %xmm13, %xmm5 9466; AVX-NEXT: vmovdqa %xmm13, %xmm9 9467; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm2 9468; AVX-NEXT: vmovdqa %xmm1, %xmm12 9469; AVX-NEXT: vpor %xmm5, %xmm2, %xmm1 9470; AVX-NEXT: vpshufb %xmm4, %xmm8, %xmm2 9471; AVX-NEXT: vpshufb %xmm6, %xmm0, %xmm3 9472; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 9473; AVX-NEXT: vpblendvb %xmm14, %xmm1, %xmm2, %xmm1 9474; AVX-NEXT: vmovdqa %xmm14, %xmm6 9475; AVX-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9476; AVX-NEXT: vmovq {{.*#+}} xmm1 = [3,10,128,128,128,0,0,0,0,0,0,0,0,0,0,0] 9477; AVX-NEXT: vpshufb %xmm1, %xmm10, %xmm3 9478; AVX-NEXT: vmovq {{.*#+}} xmm2 = [128,128,1,8,15,0,0,0,0,0,0,0,0,0,0,0] 9479; AVX-NEXT: vpshufb %xmm2, %xmm11, %xmm4 9480; AVX-NEXT: vpor %xmm3, %xmm4, %xmm5 9481; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [11,128,128,4,11,128,128,4,11,128,128,4,11,128,128,4] 9482; AVX-NEXT: vpshufb %xmm3, %xmm15, %xmm13 9483; AVX-NEXT: vbroadcastss {{.*#+}} xmm4 = [128,6,13,128,128,6,13,128,128,6,13,128,128,6,13,128] 9484; AVX-NEXT: vpshufb %xmm4, %xmm7, %xmm14 9485; AVX-NEXT: vpor %xmm13, %xmm14, %xmm13 9486; AVX-NEXT: vpblendvb %xmm6, %xmm5, %xmm13, %xmm5 9487; AVX-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9488; AVX-NEXT: vpshufb %xmm1, %xmm9, %xmm1 9489; AVX-NEXT: vpshufb %xmm2, 
%xmm12, %xmm2 9490; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 9491; AVX-NEXT: vpshufb %xmm3, %xmm8, %xmm2 9492; AVX-NEXT: vpshufb %xmm4, %xmm0, %xmm3 9493; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 9494; AVX-NEXT: vpblendvb %xmm6, %xmm1, %xmm2, %xmm0 9495; AVX-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9496; AVX-NEXT: vmovq {{.*#+}} xmm0 = [0,0,128,128,128,6,13,0,0,0,0,0,0,0,0,0] 9497; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 9498; AVX-NEXT: vpshufb %xmm0, %xmm11, %xmm1 9499; AVX-NEXT: vmovq {{.*#+}} xmm3 = [0,0,1,8,15,128,128,0,0,0,0,0,0,0,0,0] 9500; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 9501; AVX-NEXT: vpshufb %xmm3, %xmm15, %xmm2 9502; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 9503; AVX-NEXT: vbroadcastss {{.*#+}} xmm4 = [128,2,9,128,128,2,9,128,128,2,9,128,128,2,9,128] 9504; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 9505; AVX-NEXT: vpshufb %xmm4, %xmm8, %xmm2 9506; AVX-NEXT: vbroadcastss {{.*#+}} xmm5 = [11,128,128,4,11,128,128,4,11,128,128,4,11,128,128,4] 9507; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 9508; AVX-NEXT: vpshufb %xmm5, %xmm12, %xmm13 9509; AVX-NEXT: vpor %xmm2, %xmm13, %xmm13 9510; AVX-NEXT: vmovq {{.*#+}} xmm2 = [0,0,255,255,255,255,255,0,0,0,0,0,0,0,0,0] 9511; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm13, %xmm1 9512; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9513; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 9514; AVX-NEXT: vpshufb %xmm0, %xmm7, %xmm0 9515; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 9516; AVX-NEXT: vpshufb %xmm3, %xmm10, %xmm1 9517; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 9518; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 9519; AVX-NEXT: vpshufb %xmm4, %xmm9, %xmm1 9520; AVX-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload 9521; AVX-NEXT: vpshufb %xmm5, %xmm6, %xmm3 9522; AVX-NEXT: vpor %xmm1, %xmm3, %xmm1 9523; AVX-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 9524; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9525; AVX-NEXT: vmovq {{.*#+}} xmm1 = [0,0,2,9,128,128,128,0,0,0,0,0,0,0,0,0] 9526; AVX-NEXT: vpshufb %xmm1, %xmm15, %xmm0 9527; AVX-NEXT: vmovq {{.*#+}} xmm3 = [0,0,128,128,0,7,14,0,0,0,0,0,0,0,0,0] 9528; AVX-NEXT: vpshufb %xmm3, %xmm11, %xmm4 9529; AVX-NEXT: vpor %xmm0, %xmm4, %xmm5 9530; AVX-NEXT: vbroadcastss {{.*#+}} xmm4 = [128,3,10,128,128,3,10,128,128,3,10,128,128,3,10,128] 9531; AVX-NEXT: vpshufb %xmm4, %xmm8, %xmm13 9532; AVX-NEXT: vmovdqa %xmm8, %xmm11 9533; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [12,128,128,5,12,128,128,5,12,128,128,5,12,128,128,5] 9534; AVX-NEXT: vpshufb %xmm0, %xmm12, %xmm14 9535; AVX-NEXT: vpor %xmm13, %xmm14, %xmm13 9536; AVX-NEXT: vmovdqa %xmm2, %xmm8 9537; AVX-NEXT: vpblendvb %xmm2, %xmm5, %xmm13, %xmm2 9538; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9539; AVX-NEXT: vpshufb %xmm1, %xmm10, %xmm1 9540; AVX-NEXT: vmovdqa %xmm10, %xmm2 9541; AVX-NEXT: vpshufb %xmm3, %xmm7, %xmm3 9542; AVX-NEXT: vmovdqa %xmm7, %xmm10 9543; AVX-NEXT: vpor %xmm1, %xmm3, %xmm1 9544; AVX-NEXT: vpshufb %xmm4, %xmm9, %xmm3 9545; AVX-NEXT: vpshufb %xmm0, %xmm6, %xmm0 9546; AVX-NEXT: vpor %xmm3, %xmm0, %xmm0 9547; AVX-NEXT: vpblendvb %xmm8, %xmm1, %xmm0, %xmm14 9548; AVX-NEXT: vmovq {{.*#+}} xmm1 = [0,0,3,10,128,128,128,0,0,0,0,0,0,0,0,0] 9549; AVX-NEXT: vpshufb %xmm1, %xmm15, %xmm0 9550; AVX-NEXT: vmovq {{.*#+}} xmm3 = [0,0,128,128,1,8,15,0,0,0,0,0,0,0,0,0] 9551; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 
16-byte Reload 9552; AVX-NEXT: vpshufb %xmm3, %xmm7, %xmm4 9553; AVX-NEXT: vpor %xmm0, %xmm4, %xmm4 9554; AVX-NEXT: vbroadcastss {{.*#+}} xmm13 = [128,4,11,128,128,4,11,128,128,4,11,128,128,4,11,128] 9555; AVX-NEXT: vpshufb %xmm13, %xmm11, %xmm5 9556; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [13,128,128,6,13,128,128,6,13,128,128,6,13,128,128,6] 9557; AVX-NEXT: vpshufb %xmm0, %xmm12, %xmm11 9558; AVX-NEXT: vpor %xmm5, %xmm11, %xmm5 9559; AVX-NEXT: vmovdqa %xmm8, %xmm11 9560; AVX-NEXT: vpblendvb %xmm8, %xmm4, %xmm5, %xmm8 9561; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm1 9562; AVX-NEXT: vpshufb %xmm3, %xmm10, %xmm3 9563; AVX-NEXT: vpor %xmm1, %xmm3, %xmm1 9564; AVX-NEXT: vpshufb %xmm13, %xmm9, %xmm3 9565; AVX-NEXT: vpshufb %xmm0, %xmm6, %xmm0 9566; AVX-NEXT: vmovdqa %xmm6, %xmm9 9567; AVX-NEXT: vpor %xmm3, %xmm0, %xmm0 9568; AVX-NEXT: vpblendvb %xmm11, %xmm1, %xmm0, %xmm2 9569; AVX-NEXT: vmovdqa 208(%rdi), %xmm6 9570; AVX-NEXT: vmovdqa 192(%rdi), %xmm5 9571; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12] 9572; AVX-NEXT: # xmm3 = mem[0,0] 9573; AVX-NEXT: vpshufb %xmm3, %xmm6, %xmm4 9574; AVX-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,0,7,14,128,128,0,0,0,0,7,14,128,128] 9575; AVX-NEXT: # xmm11 = mem[0,0] 9576; AVX-NEXT: vpshufb %xmm11, %xmm5, %xmm13 9577; AVX-NEXT: vpor %xmm4, %xmm13, %xmm4 9578; AVX-NEXT: vpmovsxdq {{.*#+}} xmm13 = [18446744073709486080,16777215] 9579; AVX-NEXT: vpblendvb %xmm13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm0 # 16-byte Folded Reload 9580; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9581; AVX-NEXT: vmovdqa 432(%rdi), %xmm4 9582; AVX-NEXT: vpshufb %xmm3, %xmm4, %xmm0 9583; AVX-NEXT: vmovdqa 416(%rdi), %xmm3 9584; AVX-NEXT: vpshufb %xmm11, %xmm3, %xmm11 9585; AVX-NEXT: vpor %xmm0, %xmm11, %xmm0 9586; AVX-NEXT: vpblendvb %xmm13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 9587; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9588; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13] 9589; AVX-NEXT: # xmm1 = mem[0,0] 9590; AVX-NEXT: vpshufb %xmm1, %xmm6, %xmm11 9591; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128] 9592; AVX-NEXT: # xmm0 = mem[0,0] 9593; AVX-NEXT: vpshufb %xmm0, %xmm5, %xmm15 9594; AVX-NEXT: vpor %xmm11, %xmm15, %xmm11 9595; AVX-NEXT: vpblendvb %xmm13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm10 # 16-byte Folded Reload 9596; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9597; AVX-NEXT: vpshufb %xmm1, %xmm4, %xmm1 9598; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 9599; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 9600; AVX-NEXT: vpblendvb %xmm13, %xmm14, %xmm0, %xmm0 9601; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9602; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128] 9603; AVX-NEXT: # xmm0 = mem[0,0] 9604; AVX-NEXT: vpshufb %xmm0, %xmm5, %xmm1 9605; AVX-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,128,128,0,7,14,0,0,0,128,128,0,7,14] 9606; AVX-NEXT: # xmm11 = mem[0,0] 9607; AVX-NEXT: vpshufb %xmm11, %xmm6, %xmm14 9608; AVX-NEXT: vpor %xmm1, %xmm14, %xmm1 9609; AVX-NEXT: vpblendvb %xmm13, %xmm8, %xmm1, %xmm1 9610; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9611; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 9612; AVX-NEXT: vpshufb %xmm11, %xmm4, %xmm1 9613; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 9614; AVX-NEXT: vpblendvb %xmm13, %xmm2, %xmm0, %xmm0 9615; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9616; AVX-NEXT: vmovd {{.*#+}} 
xmm14 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] 9617; AVX-NEXT: vpshufb %xmm14, %xmm7, %xmm1 9618; AVX-NEXT: vmovd {{.*#+}} xmm0 = [0,0,4,11,0,0,0,0,0,0,0,0,0,0,0,0] 9619; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 9620; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm2 9621; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 9622; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [128,5,12,0,0,0,128,128,128,5,12,0,0,0,128,128] 9623; AVX-NEXT: # xmm1 = mem[0,0] 9624; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 9625; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm8 9626; AVX-NEXT: vmovddup {{.*#+}} xmm2 = [14,128,128,0,0,0,0,7,14,128,128,0,0,0,0,7] 9627; AVX-NEXT: # xmm2 = mem[0,0] 9628; AVX-NEXT: vpshufb %xmm2, %xmm12, %xmm10 9629; AVX-NEXT: vpor %xmm8, %xmm10, %xmm8 9630; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm7[1,2],xmm8[3,4,5,6,7] 9631; AVX-NEXT: vmovddup {{.*#+}} xmm7 = [0,0,0,3,10,128,128,128,0,0,0,3,10,128,128,128] 9632; AVX-NEXT: # xmm7 = mem[0,0] 9633; AVX-NEXT: vpshufb %xmm7, %xmm5, %xmm10 9634; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9635; AVX-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15] 9636; AVX-NEXT: # xmm11 = mem[0,0] 9637; AVX-NEXT: vpshufb %xmm11, %xmm6, %xmm12 9638; AVX-NEXT: vpor %xmm10, %xmm12, %xmm10 9639; AVX-NEXT: vpblendvb %xmm13, %xmm8, %xmm10, %xmm8 9640; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9641; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 9642; AVX-NEXT: vpshufb %xmm0, %xmm8, %xmm0 9643; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 9644; AVX-NEXT: vpshufb %xmm14, %xmm8, %xmm8 9645; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] 9646; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 9647; AVX-NEXT: vpshufb %xmm1, %xmm8, %xmm1 9648; AVX-NEXT: vpshufb %xmm2, %xmm9, %xmm2 9649; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 9650; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3,4,5,6,7] 9651; AVX-NEXT: vpshufb %xmm7, %xmm3, %xmm1 9652; AVX-NEXT: vmovdqa %xmm4, %xmm8 9653; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9654; AVX-NEXT: vpshufb %xmm11, %xmm4, %xmm2 9655; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 9656; AVX-NEXT: vpblendvb %xmm13, %xmm0, %xmm1, %xmm0 9657; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill 9658; AVX-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,0,2,9,0,0,2,9,0,0,2,9,0,0,2,9] 9659; AVX-NEXT: vmovdqa %xmm6, %xmm13 9660; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9661; AVX-NEXT: vpshufb %xmm14, %xmm6, %xmm1 9662; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11] 9663; AVX-NEXT: vpshufb %xmm0, %xmm5, %xmm2 9664; AVX-NEXT: vmovdqa %xmm0, %xmm5 9665; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 9666; AVX-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 9667; AVX-NEXT: # xmm1 = mem[0,1,2,3,4,5],xmm1[6,7] 9668; AVX-NEXT: vbroadcastss {{.*#+}} xmm15 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] 9669; AVX-NEXT: vmovdqa 80(%rdi), %xmm4 9670; AVX-NEXT: vpshufb %xmm15, %xmm4, %xmm7 9671; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9672; AVX-NEXT: vbroadcastss {{.*#+}} xmm6 = [0,0,6,13,0,0,6,13,0,0,6,13,0,0,6,13] 9673; AVX-NEXT: vmovdqa 64(%rdi), %xmm0 9674; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9675; AVX-NEXT: vpshufb %xmm6, %xmm0, %xmm10 9676; AVX-NEXT: vpunpckldq {{.*#+}} 
xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] 9677; AVX-NEXT: vmovdqa 96(%rdi), %xmm0 9678; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9679; AVX-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9] 9680; AVX-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,1,2,3,8,15,0,0,0,1,2,3,8,15] 9681; AVX-NEXT: # xmm11 = mem[0,0] 9682; AVX-NEXT: vpshufb %xmm11, %xmm7, %xmm10 9683; AVX-NEXT: vmovd {{.*#+}} xmm0 = [0,7,14,0,0,0,0,0,0,0,0,0,0,0,0,0] 9684; AVX-NEXT: vmovdqa 112(%rdi), %xmm2 9685; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9686; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm12 9687; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm10 9688; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] 9689; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload 9690; AVX-NEXT: vandnps %ymm10, %ymm2, %ymm10 9691; AVX-NEXT: vorps %ymm10, %ymm12, %ymm10 9692; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 9693; AVX-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] 9694; AVX-NEXT: vandnps %ymm1, %ymm12, %ymm1 9695; AVX-NEXT: vandps %ymm12, %ymm10, %ymm10 9696; AVX-NEXT: vorps %ymm1, %ymm10, %ymm1 9697; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9698; AVX-NEXT: vpshufb %xmm14, %xmm8, %xmm1 9699; AVX-NEXT: vpshufb %xmm5, %xmm3, %xmm10 9700; AVX-NEXT: vmovdqa %xmm3, %xmm5 9701; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9702; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3] 9703; AVX-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 9704; AVX-NEXT: # xmm1 = mem[0,1,2,3,4,5],xmm1[6,7] 9705; AVX-NEXT: vmovdqa 304(%rdi), %xmm3 9706; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9707; AVX-NEXT: vpshufb %xmm15, %xmm3, %xmm14 9708; AVX-NEXT: vmovdqa 288(%rdi), %xmm3 9709; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9710; AVX-NEXT: vpshufb %xmm6, %xmm3, %xmm15 9711; AVX-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] 9712; AVX-NEXT: vmovdqa 320(%rdi), %xmm9 9713; AVX-NEXT: vpalignr {{.*#+}} xmm14 = xmm14[10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7,8,9] 9714; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9715; AVX-NEXT: vpshufb %xmm11, %xmm14, %xmm11 9716; AVX-NEXT: vmovdqa 336(%rdi), %xmm8 9717; AVX-NEXT: vpshufb %xmm0, %xmm8, %xmm0 9718; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9719; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm11, %ymm0 9720; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm11 # 32-byte Folded Reload 9721; AVX-NEXT: vandnps %ymm0, %ymm2, %ymm0 9722; AVX-NEXT: vorps %ymm0, %ymm11, %ymm0 9723; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 9724; AVX-NEXT: vandnps %ymm1, %ymm12, %ymm1 9725; AVX-NEXT: vandps %ymm0, %ymm12, %ymm0 9726; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0 9727; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9728; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] 9729; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 9730; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm6[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] 9731; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3] 9732; AVX-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte 
Folded Reload 9733; AVX-NEXT: # xmm1 = mem[0,1,2,3,4,5],xmm1[6,7] 9734; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[u,u,u,u,5,12,u,u,u,u,u,u,u,u,u,u] 9735; AVX-NEXT: vbroadcastss {{.*#+}} xmm7 = [0,0,7,14,0,0,7,14,0,0,7,14,0,0,7,14] 9736; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 9737; AVX-NEXT: vpshufb %xmm7, %xmm15, %xmm10 9738; AVX-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] 9739; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 9740; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,6],xmm0[7] 9741; AVX-NEXT: vmovddup {{.*#+}} xmm2 = [0,128,128,128,128,128,3,10,0,128,128,128,128,128,3,10] 9742; AVX-NEXT: # xmm2 = mem[0,0] 9743; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 9744; AVX-NEXT: vpshufb %xmm2, %xmm4, %xmm11 9745; AVX-NEXT: vpor %xmm11, %xmm10, %xmm10 9746; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 9747; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[1,8,15,u,u,u,u,u,u,u,u,u,u,u,u,u] 9748; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm10, %ymm10 9749; AVX-NEXT: vmovaps {{.*#+}} ymm14 = [0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 9750; AVX-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload 9751; AVX-NEXT: vandps %ymm14, %ymm10, %ymm10 9752; AVX-NEXT: vorps %ymm0, %ymm10, %ymm0 9753; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 9754; AVX-NEXT: vandnps %ymm1, %ymm12, %ymm1 9755; AVX-NEXT: vandps %ymm0, %ymm12, %ymm0 9756; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0 9757; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9758; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 9759; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm13[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] 9760; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] 9761; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 9762; AVX-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 9763; AVX-NEXT: # xmm0 = mem[0,1,2,3,4,5],xmm0[6,7] 9764; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 9765; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,5,12,u,u,u,u,u,u,u,u,u,u] 9766; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 9767; AVX-NEXT: vpshufb %xmm7, %xmm11, %xmm7 9768; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] 9769; AVX-NEXT: vpxor %xmm7, %xmm7, %xmm7 9770; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm7[7] 9771; AVX-NEXT: vpshufb %xmm2, %xmm9, %xmm2 9772; AVX-NEXT: vpor %xmm2, %xmm1, %xmm1 9773; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[1,8,15,u,u,u,u,u,u,u,u,u,u,u,u,u] 9774; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 9775; AVX-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload 9776; AVX-NEXT: vandps %ymm1, %ymm14, %ymm1 9777; AVX-NEXT: vorps %ymm2, %ymm1, %ymm1 9778; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 9779; AVX-NEXT: vandnps %ymm0, %ymm12, %ymm0 9780; AVX-NEXT: vandps %ymm1, %ymm12, %ymm1 9781; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0 9782; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9783; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11] 9784; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9785; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 9786; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,0,6,13,0,0,6,13,0,0,6,13,0,0,6,13] 9787; AVX-NEXT: vpshufb %xmm1, %xmm6, %xmm1 9788; 
AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 9789; AVX-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 9790; AVX-NEXT: # xmm0 = mem[0,1,2,3,4,5],xmm0[6,7] 9791; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 9792; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[u,u,u,u,u,u,u,u,u,u,u,u,6,13,u,u] 9793; AVX-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,1,8,15,0,1,8,15,0,1,8,15,0,1,8,15] 9794; AVX-NEXT: vpshufb %xmm14, %xmm15, %xmm6 9795; AVX-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] 9796; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm7[7] 9797; AVX-NEXT: vmovddup {{.*#+}} xmm6 = [0,128,128,128,128,128,4,11,0,128,128,128,128,128,4,11] 9798; AVX-NEXT: # xmm6 = mem[0,0] 9799; AVX-NEXT: vmovdqa %xmm4, %xmm12 9800; AVX-NEXT: vpshufb %xmm6, %xmm4, %xmm7 9801; AVX-NEXT: vpor %xmm7, %xmm1, %xmm1 9802; AVX-NEXT: vmovd {{.*#+}} xmm9 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] 9803; AVX-NEXT: vpshufb %xmm9, %xmm3, %xmm7 9804; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm7 9805; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 9806; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload 9807; AVX-NEXT: vandnps %ymm7, %ymm1, %ymm7 9808; AVX-NEXT: vorps %ymm7, %ymm8, %ymm7 9809; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm8 9810; AVX-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] 9811; AVX-NEXT: vandnps %ymm8, %ymm15, %ymm8 9812; AVX-NEXT: vandps %ymm7, %ymm15, %ymm7 9813; AVX-NEXT: vorps %ymm7, %ymm8, %ymm0 9814; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9815; AVX-NEXT: vpshufb %xmm2, %xmm13, %xmm4 9816; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9817; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] 9818; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] 9819; AVX-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload 9820; AVX-NEXT: # xmm3 = mem[0,1,2,3,4,5],xmm3[6,7] 9821; AVX-NEXT: vmovdqa %xmm5, %xmm0 9822; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,6,13,u,u] 9823; AVX-NEXT: vmovdqa %xmm11, %xmm1 9824; AVX-NEXT: vpshufb %xmm14, %xmm11, %xmm4 9825; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] 9826; AVX-NEXT: vxorps %xmm7, %xmm7, %xmm7 9827; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm7[7] 9828; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 9829; AVX-NEXT: vpshufb %xmm6, %xmm13, %xmm4 9830; AVX-NEXT: vpor %xmm4, %xmm2, %xmm2 9831; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 9832; AVX-NEXT: vpshufb %xmm9, %xmm14, %xmm4 9833; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 9834; AVX-NEXT: vmovaps {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 9835; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm4 # 32-byte Folded Reload 9836; AVX-NEXT: vandnps %ymm2, %ymm5, %ymm2 9837; AVX-NEXT: vorps %ymm2, %ymm4, %ymm2 9838; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 9839; AVX-NEXT: vandnps %ymm3, %ymm15, %ymm3 9840; AVX-NEXT: vandps %ymm2, %ymm15, %ymm2 9841; AVX-NEXT: vorps %ymm3, %ymm2, %ymm2 9842; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9843; AVX-NEXT: 
vmovddup {{.*#+}} xmm3 = [0,2,9,128,128,128,0,0,0,2,9,128,128,128,0,0] 9844; AVX-NEXT: # xmm3 = mem[0,0] 9845; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 9846; AVX-NEXT: vpshufb %xmm3, %xmm9, %xmm2 9847; AVX-NEXT: vmovddup {{.*#+}} xmm4 = [0,128,128,0,7,14,0,0,0,128,128,0,7,14,0,0] 9848; AVX-NEXT: # xmm4 = mem[0,0] 9849; AVX-NEXT: vpshufb %xmm4, %xmm10, %xmm5 9850; AVX-NEXT: vpor %xmm2, %xmm5, %xmm2 9851; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm7[7] 9852; AVX-NEXT: vmovddup {{.*#+}} xmm5 = [0,128,128,128,128,128,5,12,0,128,128,128,128,128,5,12] 9853; AVX-NEXT: # xmm5 = mem[0,0] 9854; AVX-NEXT: vpshufb %xmm5, %xmm12, %xmm6 9855; AVX-NEXT: vmovdqa %xmm12, %xmm11 9856; AVX-NEXT: vpor %xmm6, %xmm2, %xmm6 9857; AVX-NEXT: vmovd {{.*#+}} xmm8 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] 9858; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 9859; AVX-NEXT: vpshufb %xmm8, %xmm2, %xmm7 9860; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 9861; AVX-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 9862; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 32-byte Folded Reload 9863; AVX-NEXT: vandnps %ymm6, %ymm12, %ymm6 9864; AVX-NEXT: vorps %ymm6, %ymm7, %ymm6 9865; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 16-byte Folded Reload 9866; AVX-NEXT: vandnps %ymm7, %ymm15, %ymm7 9867; AVX-NEXT: vandps %ymm6, %ymm15, %ymm6 9868; AVX-NEXT: vorps %ymm7, %ymm6, %ymm6 9869; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9870; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm3 9871; AVX-NEXT: vpshufb %xmm4, %xmm0, %xmm4 9872; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 9873; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],mem[7] 9874; AVX-NEXT: vpshufb %xmm5, %xmm13, %xmm4 9875; AVX-NEXT: vpor %xmm4, %xmm3, %xmm3 9876; AVX-NEXT: vpshufb %xmm8, %xmm14, %xmm4 9877; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 9878; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm4 # 32-byte Folded Reload 9879; AVX-NEXT: vandnps %ymm3, %ymm12, %ymm3 9880; AVX-NEXT: vorps %ymm3, %ymm4, %ymm3 9881; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload 9882; AVX-NEXT: vandnps %ymm4, %ymm15, %ymm4 9883; AVX-NEXT: vandps %ymm3, %ymm15, %ymm3 9884; AVX-NEXT: vorps %ymm4, %ymm3, %ymm0 9885; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9886; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 9887; AVX-NEXT: vmovd {{.*#+}} xmm14 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] 9888; AVX-NEXT: vpshufb %xmm14, %xmm8, %xmm3 9889; AVX-NEXT: vmovd {{.*#+}} xmm0 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] 9890; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 9891; AVX-NEXT: vpshufb %xmm0, %xmm13, %xmm4 9892; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] 9893; AVX-NEXT: vmovddup {{.*#+}} xmm4 = [12,0,0,0,128,128,128,5,12,0,0,0,128,128,128,5] 9894; AVX-NEXT: # xmm4 = mem[0,0] 9895; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 9896; AVX-NEXT: vpshufb %xmm4, %xmm12, %xmm5 9897; AVX-NEXT: vmovddup {{.*#+}} xmm6 = [128,0,0,0,0,7,14,128,128,0,0,0,0,7,14,128] 9898; AVX-NEXT: # xmm6 = mem[0,0] 9899; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9900; AVX-NEXT: vpshufb %xmm6, %xmm1, %xmm7 9901; AVX-NEXT: vpor %xmm5, %xmm7, %xmm5 9902; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3,4,5,6,7] 9903; 
AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm9[u,u] 9904; AVX-NEXT: vmovddup {{.*#+}} xmm5 = [0,128,128,1,8,15,0,0,0,128,128,1,8,15,0,0] 9905; AVX-NEXT: # xmm5 = mem[0,0] 9906; AVX-NEXT: vpshufb %xmm5, %xmm10, %xmm9 9907; AVX-NEXT: vpor %xmm7, %xmm9, %xmm7 9908; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6],mem[7] 9909; AVX-NEXT: vmovddup {{.*#+}} xmm9 = [0,128,128,128,128,128,6,13,0,128,128,128,128,128,6,13] 9910; AVX-NEXT: # xmm9 = mem[0,0] 9911; AVX-NEXT: vpshufb %xmm9, %xmm11, %xmm10 9912; AVX-NEXT: vpor %xmm7, %xmm10, %xmm7 9913; AVX-NEXT: vmovdqa %xmm0, %xmm11 9914; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm10 9915; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 9916; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 9917; AVX-NEXT: vandps %ymm0, %ymm3, %ymm3 9918; AVX-NEXT: vandnps %ymm7, %ymm0, %ymm7 9919; AVX-NEXT: vorps %ymm7, %ymm3, %ymm3 9920; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 16-byte Folded Reload 9921; AVX-NEXT: vandnps %ymm7, %ymm15, %ymm7 9922; AVX-NEXT: vandps %ymm3, %ymm15, %ymm3 9923; AVX-NEXT: vorps %ymm7, %ymm3, %ymm0 9924; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9925; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 9926; AVX-NEXT: vpshufb %xmm14, %xmm10, %xmm3 9927; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 9928; AVX-NEXT: vpshufb %xmm11, %xmm14, %xmm7 9929; AVX-NEXT: vmovdqa %xmm11, %xmm2 9930; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] 9931; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 9932; AVX-NEXT: vpshufb %xmm4, %xmm11, %xmm4 9933; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 9934; AVX-NEXT: vpshufb %xmm6, %xmm7, %xmm6 9935; AVX-NEXT: vpor %xmm4, %xmm6, %xmm4 9936; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3,4,5,6,7] 9937; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9938; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm0[u,u] 9939; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 9940; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm5 9941; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4 9942; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],mem[7] 9943; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 9944; AVX-NEXT: vpshufb %xmm9, %xmm5, %xmm5 9945; AVX-NEXT: vpor %xmm5, %xmm4, %xmm4 9946; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 9947; AVX-NEXT: vpshufb %xmm2, %xmm5, %xmm5 9948; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 9949; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 9950; AVX-NEXT: vandps %ymm2, %ymm3, %ymm3 9951; AVX-NEXT: vandnps %ymm4, %ymm2, %ymm4 9952; AVX-NEXT: vorps %ymm4, %ymm3, %ymm3 9953; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload 9954; AVX-NEXT: vandnps %ymm4, %ymm15, %ymm4 9955; AVX-NEXT: vandps %ymm3, %ymm15, %ymm3 9956; AVX-NEXT: vorps %ymm4, %ymm3, %ymm3 9957; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9958; AVX-NEXT: vmovd {{.*#+}} xmm2 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] 9959; AVX-NEXT: vpshufb %xmm2, %xmm8, %xmm3 9960; AVX-NEXT: vmovd {{.*#+}} xmm8 = [5,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] 9961; AVX-NEXT: vpshufb %xmm8, %xmm13, 
%xmm4 9962; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] 9963; AVX-NEXT: vmovddup {{.*#+}} xmm4 = [13,0,0,0,128,128,128,6,13,0,0,0,128,128,128,6] 9964; AVX-NEXT: # xmm4 = mem[0,0] 9965; AVX-NEXT: vpshufb %xmm4, %xmm12, %xmm5 9966; AVX-NEXT: vmovddup {{.*#+}} xmm6 = [128,0,0,0,1,8,15,128,128,0,0,0,1,8,15,128] 9967; AVX-NEXT: # xmm6 = mem[0,0] 9968; AVX-NEXT: vpshufb %xmm6, %xmm1, %xmm7 9969; AVX-NEXT: vpor %xmm5, %xmm7, %xmm5 9970; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0,1],xmm5[2,3,4,5,6,7] 9971; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9972; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,u,u],zero,zero,xmm1[2,9,u,u,u] 9973; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 9974; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm12[u,u,u] 9975; AVX-NEXT: vpor %xmm3, %xmm9, %xmm9 9976; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,9,10,11,12],zero,zero,zero 9977; AVX-NEXT: vmovddup {{.*#+}} xmm7 = [0,128,128,128,128,0,7,14,0,128,128,128,128,0,7,14] 9978; AVX-NEXT: # xmm7 = mem[0,0] 9979; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 9980; AVX-NEXT: vpshufb %xmm7, %xmm1, %xmm13 9981; AVX-NEXT: vpor %xmm13, %xmm9, %xmm9 9982; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 9983; AVX-NEXT: vpshufb %xmm8, %xmm3, %xmm13 9984; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm9, %ymm9 9985; AVX-NEXT: vmovaps {{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 9986; AVX-NEXT: vandps %ymm5, %ymm13, %ymm5 9987; AVX-NEXT: vandnps %ymm9, %ymm13, %ymm9 9988; AVX-NEXT: vorps %ymm5, %ymm9, %ymm5 9989; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 16-byte Folded Reload 9990; AVX-NEXT: vandnps %ymm9, %ymm15, %ymm9 9991; AVX-NEXT: vandps %ymm5, %ymm15, %ymm5 9992; AVX-NEXT: vorps %ymm5, %ymm9, %ymm1 9993; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9994; AVX-NEXT: vpshufb %xmm2, %xmm10, %xmm2 9995; AVX-NEXT: vpshufb %xmm8, %xmm14, %xmm9 9996; AVX-NEXT: vmovdqa %xmm8, %xmm14 9997; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3] 9998; AVX-NEXT: vpshufb %xmm4, %xmm11, %xmm4 9999; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 10000; AVX-NEXT: vpshufb %xmm6, %xmm8, %xmm6 10001; AVX-NEXT: vpor %xmm4, %xmm6, %xmm4 10002; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3,4,5,6,7] 10003; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9,u,u,u] 10004; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 10005; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm10[u,u,u] 10006; AVX-NEXT: vpor %xmm4, %xmm6, %xmm4 10007; AVX-NEXT: vmovddup {{.*#+}} xmm11 = [0,9,10,11,12,128,128,128,0,9,10,11,12,128,128,128] 10008; AVX-NEXT: # xmm11 = mem[0,0] 10009; AVX-NEXT: vpshufb %xmm11, %xmm4, %xmm4 10010; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 10011; AVX-NEXT: vpshufb %xmm7, %xmm1, %xmm6 10012; AVX-NEXT: vpor %xmm6, %xmm4, %xmm4 10013; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 10014; AVX-NEXT: vpshufb %xmm14, %xmm5, %xmm6 10015; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 10016; AVX-NEXT: vandps %ymm2, %ymm13, %ymm2 10017; AVX-NEXT: vandnps %ymm4, %ymm13, %ymm4 10018; AVX-NEXT: vorps %ymm4, %ymm2, %ymm2 10019; AVX-NEXT: vinsertf128 $1, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload 10020; AVX-NEXT: vandnps %ymm4, %ymm15, %ymm4 10021; AVX-NEXT: vandps %ymm2, %ymm15, %ymm2 10022; AVX-NEXT: vorps %ymm4, %ymm2, %ymm0 10023; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10024; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 10025; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 10026; AVX-NEXT: vmovd {{.*#+}} xmm2 = [6,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0] 10027; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 10028; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm6 10029; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] 10030; AVX-NEXT: vmovddup {{.*#+}} xmm6 = [128,0,0,0,2,9,128,128,128,0,0,0,2,9,128,128] 10031; AVX-NEXT: # xmm6 = mem[0,0] 10032; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 10033; AVX-NEXT: vpshufb %xmm6, %xmm0, %xmm7 10034; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [14,0,0,0,128,128,0,7,14,0,0,0,128,128,0,7] 10035; AVX-NEXT: # xmm0 = mem[0,0] 10036; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 10037; AVX-NEXT: vpshufb %xmm0, %xmm9, %xmm9 10038; AVX-NEXT: vpor %xmm7, %xmm9, %xmm7 10039; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,3,4,5,6,7] 10040; AVX-NEXT: vbroadcastss {{.*#+}} xmm7 = [10,128,128,3,10,128,128,3,10,128,128,3,10,128,128,3] 10041; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 10042; AVX-NEXT: vpshufb %xmm7, %xmm9, %xmm9 10043; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm12[u,u,u] 10044; AVX-NEXT: vpor %xmm9, %xmm13, %xmm9 10045; AVX-NEXT: vpshufb %xmm11, %xmm9, %xmm9 10046; AVX-NEXT: vmovddup {{.*#+}} xmm13 = [0,128,128,128,128,1,8,15,0,128,128,128,128,1,8,15] 10047; AVX-NEXT: # xmm13 = mem[0,0] 10048; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 10049; AVX-NEXT: vpshufb %xmm13, %xmm11, %xmm14 10050; AVX-NEXT: vpor %xmm14, %xmm9, %xmm9 10051; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm14 10052; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm9, %ymm9 10053; AVX-NEXT: vmovaps {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 10054; AVX-NEXT: vandps %ymm4, %ymm11, %ymm4 10055; AVX-NEXT: vandnps %ymm9, %ymm11, %ymm9 10056; AVX-NEXT: vorps %ymm4, %ymm9, %ymm4 10057; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 16-byte Folded Reload 10058; AVX-NEXT: vandnps %ymm9, %ymm15, %ymm9 10059; AVX-NEXT: vandps %ymm4, %ymm15, %ymm4 10060; AVX-NEXT: vorps %ymm4, %ymm9, %ymm4 10061; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 10062; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm3[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 10063; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 10064; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm12 10065; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] 10066; AVX-NEXT: vpshufb %xmm6, %xmm8, %xmm6 10067; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 10068; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm8 10069; AVX-NEXT: vpor %xmm6, %xmm8, %xmm6 10070; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3,4,5,6,7] 10071; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 10072; AVX-NEXT: vpshufb %xmm7, %xmm0, %xmm7 10073; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm10[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm10[u,u,u] 10074; AVX-NEXT: vpor %xmm7, %xmm8, %xmm7 
10075; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,u,u,u,9,10,11,12],zero,zero,zero 10076; AVX-NEXT: vpshufb %xmm13, %xmm1, %xmm7 10077; AVX-NEXT: vpor %xmm7, %xmm3, %xmm3 10078; AVX-NEXT: vpshufb %xmm2, %xmm5, %xmm2 10079; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 10080; AVX-NEXT: vandps %ymm6, %ymm11, %ymm3 10081; AVX-NEXT: vandnps %ymm2, %ymm11, %ymm1 10082; AVX-NEXT: vorps %ymm1, %ymm3, %ymm1 10083; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm2 # 16-byte Folded Reload 10084; AVX-NEXT: vandnps %ymm2, %ymm15, %ymm2 10085; AVX-NEXT: vandps %ymm1, %ymm15, %ymm0 10086; AVX-NEXT: vorps %ymm2, %ymm0, %ymm0 10087; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 10088; AVX-NEXT: vmovaps %ymm1, 32(%rsi) 10089; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 10090; AVX-NEXT: vmovaps %ymm1, (%rsi) 10091; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 10092; AVX-NEXT: vmovaps %ymm1, 32(%rdx) 10093; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 10094; AVX-NEXT: vmovaps %ymm1, (%rdx) 10095; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 10096; AVX-NEXT: vmovaps %ymm1, 32(%rcx) 10097; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 10098; AVX-NEXT: vmovaps %ymm1, (%rcx) 10099; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 10100; AVX-NEXT: vmovaps %ymm1, 32(%r8) 10101; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 10102; AVX-NEXT: vmovaps %ymm1, (%r8) 10103; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 10104; AVX-NEXT: vmovaps %ymm1, 32(%r9) 10105; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 10106; AVX-NEXT: vmovaps %ymm1, (%r9) 10107; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 10108; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 10109; AVX-NEXT: vmovaps %ymm1, 32(%rax) 10110; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 10111; AVX-NEXT: vmovaps %ymm1, (%rax) 10112; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 10113; AVX-NEXT: vmovaps %ymm0, 32(%rax) 10114; AVX-NEXT: vmovaps %ymm4, (%rax) 10115; AVX-NEXT: addq $744, %rsp # imm = 0x2E8 10116; AVX-NEXT: vzeroupper 10117; AVX-NEXT: retq 10118; 10119; AVX2-LABEL: load_i8_stride7_vf64: 10120; AVX2: # %bb.0: 10121; AVX2-NEXT: subq $760, %rsp # imm = 0x2F8 10122; AVX2-NEXT: vmovdqa 320(%rdi), %ymm6 10123; AVX2-NEXT: vmovdqa 224(%rdi), %ymm7 10124; AVX2-NEXT: vmovdqa 256(%rdi), %ymm8 10125; AVX2-NEXT: vmovdqa (%rdi), %ymm12 10126; AVX2-NEXT: vmovdqa 32(%rdi), %ymm10 10127; AVX2-NEXT: vmovdqa 64(%rdi), %ymm11 10128; AVX2-NEXT: vmovdqa 96(%rdi), %ymm5 10129; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] 10130; AVX2-NEXT: vpblendvb %ymm13, %ymm12, %ymm10, %ymm0 10131; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10132; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10133; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 10134; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,5,12,128,128,1,8,15,u,u,u,u,u,u] 10135; AVX2-NEXT: vpshufb %xmm1, %xmm2, %xmm3 10136; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,14,128,128,3,10,128,128,128,u,u,u,u,u,u] 10137; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 10138; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0 10139; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] 10140; AVX2-NEXT: vpblendvb %ymm14, %ymm5, %ymm11, %ymm3 10141; AVX2-NEXT: vmovdqa %ymm5, %ymm9 10142; AVX2-NEXT: 
vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10143; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10144; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 10145; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] 10146; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9,0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9] 10147; AVX2-NEXT: # ymm3 = mem[0,1,0,1] 10148; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm5 10149; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,0,0] 10150; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm5, %ymm0 10151; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10152; AVX2-NEXT: vpblendvb %ymm13, %ymm7, %ymm8, %ymm5 10153; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10154; AVX2-NEXT: vmovdqa %ymm7, %ymm0 10155; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10156; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm7 10157; AVX2-NEXT: vpshufb %xmm1, %xmm7, %xmm1 10158; AVX2-NEXT: vmovdqa 288(%rdi), %ymm15 10159; AVX2-NEXT: vpshufb %xmm2, %xmm5, %xmm2 10160; AVX2-NEXT: vpor %xmm1, %xmm2, %xmm1 10161; AVX2-NEXT: vpblendvb %ymm14, %ymm6, %ymm15, %ymm2 10162; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10163; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10164; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm5 10165; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15] 10166; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 10167; AVX2-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 10168; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10169; AVX2-NEXT: vpblendvb %ymm13, %ymm11, %ymm9, %ymm2 10170; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 10171; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7,8,9],ymm3[10],ymm2[11,12,13],ymm3[14],ymm2[15] 10172; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] 10173; AVX2-NEXT: vpblendvb %ymm1, %ymm12, %ymm10, %ymm5 10174; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm3 10175; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,6,13,128,128,2,9,u,u,u,u,u,u,u] 10176; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm7 10177; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [1,8,15,128,128,4,11,128,128,u,u,u,u,u,u,u] 10178; AVX2-NEXT: vpshufb %xmm3, %xmm5, %xmm5 10179; AVX2-NEXT: vpor %xmm7, %xmm5, %xmm5 10180; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10,1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10] 10181; AVX2-NEXT: # ymm7 = mem[0,1,0,1] 10182; AVX2-NEXT: vpshufb %ymm7, %ymm4, %ymm4 10183; AVX2-NEXT: vpmovsxdq {{.*#+}} ymm9 = [0,18446744073709551360,16777215,0] 10184; AVX2-NEXT: vpblendvb %ymm9, %ymm4, %ymm5, %ymm4 10185; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10186; AVX2-NEXT: vpblendvb %ymm13, %ymm15, %ymm6, %ymm4 10187; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 10188; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7,8,9],ymm5[10],ymm4[11,12,13],ymm5[14],ymm4[15] 10189; AVX2-NEXT: vpshufb %ymm7, %ymm4, %ymm4 10190; AVX2-NEXT: vpblendvb %ymm1, %ymm0, %ymm8, %ymm5 10191; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm7 10192; AVX2-NEXT: vpshufb %xmm2, %xmm7, %xmm2 10193; AVX2-NEXT: vpshufb %xmm3, %xmm5, %xmm3 10194; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 10195; AVX2-NEXT: vpblendvb %ymm9, %ymm4, %ymm2, %ymm0 10196; AVX2-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10197; AVX2-NEXT: vmovdqa 160(%rdi), %ymm9 10198; AVX2-NEXT: vmovdqa 128(%rdi), %ymm15 10199; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] 10200; AVX2-NEXT: vpblendvb %ymm4, %ymm9, %ymm15, %ymm2 10201; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10202; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 10203; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,128,128,3,10,128,128,128,6,13,u,u,u,u] 10204; AVX2-NEXT: vpshufb %xmm8, %xmm3, %xmm3 10205; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,5,12,128,128,1,8,15,128,128,u,u,u,u] 10206; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm2 10207; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 10208; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm12 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9] 10209; AVX2-NEXT: vmovdqa 208(%rdi), %xmm5 10210; AVX2-NEXT: vpshufb %xmm12, %xmm5, %xmm3 10211; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm11 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] 10212; AVX2-NEXT: vmovdqa 192(%rdi), %xmm1 10213; AVX2-NEXT: vpshufb %xmm11, %xmm1, %xmm10 10214; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10215; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3] 10216; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 10217; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 10218; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] 10219; AVX2-NEXT: vpmovsxdq {{.*#+}} ymm14 = [18446744073709551615,18446744073709551615,16777215,0] 10220; AVX2-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload 10221; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10222; AVX2-NEXT: vmovdqa 384(%rdi), %ymm2 10223; AVX2-NEXT: vmovdqa 352(%rdi), %ymm3 10224; AVX2-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm0 10225; AVX2-NEXT: vmovdqa %ymm2, %ymm4 10226; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm6 10227; AVX2-NEXT: vpshufb %xmm8, %xmm6, %xmm6 10228; AVX2-NEXT: vpshufb %xmm7, %xmm0, %xmm0 10229; AVX2-NEXT: vpor %xmm6, %xmm0, %xmm0 10230; AVX2-NEXT: vmovdqa 432(%rdi), %xmm13 10231; AVX2-NEXT: vpshufb %xmm12, %xmm13, %xmm6 10232; AVX2-NEXT: vmovdqa 416(%rdi), %xmm2 10233; AVX2-NEXT: vpshufb %xmm11, %xmm2, %xmm8 10234; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] 10235; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 10236; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 10237; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7] 10238; AVX2-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 10239; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10240; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] 10241; AVX2-NEXT: vpblendvb %ymm7, %ymm9, %ymm15, %ymm0 10242; AVX2-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,6,13,128,128,2,9,128,128,128,u,u,u,u] 10243; AVX2-NEXT: vpshufb %xmm10, %xmm0, %xmm6 10244; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 10245; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,128,128,4,11,128,128,0,7,14,u,u,u,u] 10246; AVX2-NEXT: vpshufb %xmm8, %xmm0, %xmm0 10247; AVX2-NEXT: vpor %xmm6, %xmm0, %xmm0 10248; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10] 10249; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm9 10250; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm12 = [5,12,5,12,5,12,5,12,5,12,5,12,5,12,5,12] 10251; AVX2-NEXT: vpshufb %xmm12, %xmm1, %xmm11 10252; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm9 = 
xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] 10253; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 10254; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 10255; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm9[7] 10256; AVX2-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 10257; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10258; AVX2-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm0 10259; AVX2-NEXT: vmovdqa %ymm3, %ymm11 10260; AVX2-NEXT: vpshufb %xmm10, %xmm0, %xmm1 10261; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 10262; AVX2-NEXT: vpshufb %xmm8, %xmm0, %xmm0 10263; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 10264; AVX2-NEXT: vpshufb %xmm6, %xmm13, %xmm1 10265; AVX2-NEXT: vpshufb %xmm12, %xmm2, %xmm6 10266; AVX2-NEXT: vmovdqa %xmm2, %xmm12 10267; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] 10268; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 10269; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 10270; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] 10271; AVX2-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 10272; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10273; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] 10274; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm4, %ymm0 10275; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10276; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 10277; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,128,128,128,6,13,128,128,2,9,u,u,u,u,u] 10278; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm1 10279; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,1,8,15,128,128,4,11,128,128,u,u,u,u,u] 10280; AVX2-NEXT: vpshufb %xmm7, %xmm0, %xmm0 10281; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 10282; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12] 10283; AVX2-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10284; AVX2-NEXT: vpshufb %xmm1, %xmm13, %xmm8 10285; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm9 = [0,0,0,0,7,14,128,128,0,0,0,0,7,14,128,128] 10286; AVX2-NEXT: vpshufb %xmm9, %xmm12, %xmm10 10287; AVX2-NEXT: vmovdqa %xmm12, %xmm3 10288; AVX2-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10289; AVX2-NEXT: vpor %xmm8, %xmm10, %xmm8 10290; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm10 10291; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 10292; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] 10293; AVX2-NEXT: # ymm0 = mem[0,1,0,1] 10294; AVX2-NEXT: vpblendvb %ymm0, %ymm10, %ymm8, %ymm8 10295; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10296; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 10297; AVX2-NEXT: vpblendvb %ymm2, %ymm15, %ymm12, %ymm8 10298; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm10 10299; AVX2-NEXT: vpshufb %xmm6, %xmm10, %xmm6 10300; AVX2-NEXT: vpshufb %xmm7, %xmm8, %xmm7 10301; AVX2-NEXT: vpor %xmm6, %xmm7, %xmm6 10302; AVX2-NEXT: vpshufb %xmm1, %xmm5, %xmm1 10303; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 10304; AVX2-NEXT: vpshufb %xmm9, %xmm14, %xmm7 10305; AVX2-NEXT: vpor %xmm1, %xmm7, %xmm1 10306; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 10307; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 10308; AVX2-NEXT: vpblendvb %ymm0, %ymm6, %ymm1, %ymm1 10309; AVX2-NEXT: vmovdqu %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10310; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] 10311; AVX2-NEXT: vpblendvb %ymm2, %ymm11, %ymm4, %ymm1 10312; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,2,9,128,128,128,5,12,128,128,u,u,u,u,u] 10313; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm7 10314; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 10315; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,128,128,0,7,14,128,128,3,10,u,u,u,u,u] 10316; AVX2-NEXT: vpshufb %xmm8, %xmm1, %xmm1 10317; AVX2-NEXT: vpor %xmm7, %xmm1, %xmm1 10318; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13] 10319; AVX2-NEXT: vpshufb %xmm7, %xmm13, %xmm9 10320; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128] 10321; AVX2-NEXT: vpshufb %xmm10, %xmm3, %xmm11 10322; AVX2-NEXT: vpor %xmm9, %xmm11, %xmm9 10323; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 10324; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 10325; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm9, %ymm1 10326; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10327; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10328; AVX2-NEXT: vpblendvb %ymm2, %ymm15, %ymm12, %ymm1 10329; AVX2-NEXT: vmovdqa %ymm12, %ymm2 10330; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm6 10331; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 10332; AVX2-NEXT: vpshufb %xmm8, %xmm1, %xmm1 10333; AVX2-NEXT: vpor %xmm6, %xmm1, %xmm1 10334; AVX2-NEXT: vpshufb %xmm7, %xmm5, %xmm6 10335; AVX2-NEXT: vmovdqa %xmm5, %xmm13 10336; AVX2-NEXT: vpshufb %xmm10, %xmm14, %xmm7 10337; AVX2-NEXT: vpor %xmm6, %xmm7, %xmm6 10338; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 10339; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 10340; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm6, %ymm1 10341; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10342; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535] 10343; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 10344; AVX2-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm1 10345; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,3,10,128,128,128,6,13,128,128,u,u,u,u,u] 10346; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm7 10347; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 10348; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,128,128,1,8,15,128,128,4,11,u,u,u,u,u] 10349; AVX2-NEXT: vpshufb %xmm8, %xmm1, %xmm1 10350; AVX2-NEXT: vpor %xmm7, %xmm1, %xmm1 10351; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128] 10352; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 10353; AVX2-NEXT: vpshufb %xmm7, %xmm12, %xmm9 10354; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,0,0,128,128,0,7,14,0,0,0,128,128,0,7,14] 10355; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 10356; AVX2-NEXT: vpshufb %xmm10, %xmm14, %xmm11 10357; AVX2-NEXT: vpor %xmm9, %xmm11, %xmm9 10358; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 10359; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 10360; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm9, %ymm1 10361; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10362; AVX2-NEXT: vpblendvb %ymm6, %ymm15, %ymm2, %ymm1 10363; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6 10364; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 10365; AVX2-NEXT: vpshufb %xmm8, %xmm1, %xmm1 10366; AVX2-NEXT: vpor %xmm6, %xmm1, %xmm1 10367; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 10368; AVX2-NEXT: vpshufb %xmm7, %xmm11, %xmm6 10369; 
AVX2-NEXT: vpshufb %xmm10, %xmm13, %xmm7 10370; AVX2-NEXT: vmovdqa %xmm13, (%rsp) # 16-byte Spill 10371; AVX2-NEXT: vpor %xmm6, %xmm7, %xmm6 10372; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 10373; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 10374; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm6, %ymm1 10375; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10376; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] 10377; AVX2-NEXT: vpblendvb %ymm10, %ymm3, %ymm4, %ymm2 10378; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10379; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] 10380; AVX2-NEXT: vpblendvb %ymm15, %ymm4, %ymm3, %ymm1 10381; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 10382; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,128,128,2,9,128,128,128,5,12,u,u,u,u,u] 10383; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 10384; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,4,11,128,128,0,7,14,128,128,u,u,u,u,u] 10385; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm1 10386; AVX2-NEXT: vpor %xmm2, %xmm1, %xmm1 10387; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,0,0,3,10,128,128,128,0,0,0,3,10,128,128,128] 10388; AVX2-NEXT: vpshufb %xmm2, %xmm12, %xmm7 10389; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm8 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15] 10390; AVX2-NEXT: vpshufb %xmm8, %xmm14, %xmm9 10391; AVX2-NEXT: vpor %xmm7, %xmm9, %xmm7 10392; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 10393; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 10394; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm7, %ymm1 10395; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10396; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 10397; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 10398; AVX2-NEXT: vpblendvb %ymm10, %ymm4, %ymm1, %ymm5 10399; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10400; AVX2-NEXT: vpblendvb %ymm15, %ymm1, %ymm4, %ymm1 10401; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 10402; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm3 10403; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm1 10404; AVX2-NEXT: vpor %xmm3, %xmm1, %xmm1 10405; AVX2-NEXT: vpshufb %xmm2, %xmm11, %xmm2 10406; AVX2-NEXT: vpshufb %xmm8, %xmm13, %xmm3 10407; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 10408; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 10409; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 10410; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 10411; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10412; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10413; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 10414; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] 10415; AVX2-NEXT: vpblendvb %ymm11, %ymm0, %ymm2, %ymm1 10416; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535] 10417; AVX2-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm12 10418; AVX2-NEXT: vpblendvb %ymm15, %ymm2, %ymm0, %ymm13 10419; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] 10420; AVX2-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm3 10421; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10422; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] 10423; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 10424; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10425; 
AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10426; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 10427; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm3 10428; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10429; AVX2-NEXT: vpblendvb %ymm11, %ymm0, %ymm2, %ymm3 10430; AVX2-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm14 10431; AVX2-NEXT: vpblendvb %ymm15, %ymm2, %ymm0, %ymm4 10432; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10433; AVX2-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm0 10434; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10435; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 10436; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 10437; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] 10438; AVX2-NEXT: vpblendvb %ymm10, %ymm6, %ymm2, %ymm0 10439; AVX2-NEXT: vpblendvb %ymm11, %ymm6, %ymm2, %ymm4 10440; AVX2-NEXT: vpblendvb %ymm5, %ymm6, %ymm2, %ymm9 10441; AVX2-NEXT: vpblendvb %ymm15, %ymm2, %ymm6, %ymm8 10442; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10443; AVX2-NEXT: vpblendvb %ymm7, %ymm2, %ymm6, %ymm2 10444; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10445; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 10446; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 10447; AVX2-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm2 10448; AVX2-NEXT: vpblendvb %ymm11, %ymm7, %ymm8, %ymm6 10449; AVX2-NEXT: vpblendvb %ymm5, %ymm7, %ymm8, %ymm10 10450; AVX2-NEXT: vpblendvb %ymm15, %ymm8, %ymm7, %ymm5 10451; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10452; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] 10453; AVX2-NEXT: vpblendvb %ymm5, %ymm8, %ymm7, %ymm11 10454; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [2,9,128,128,128,5,12,128,128,u,u,u,u,u,u,u] 10455; AVX2-NEXT: vpshufb %xmm8, %xmm1, %xmm15 10456; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 10457; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [128,128,0,7,14,128,128,3,10,u,u,u,u,u,u,u] 10458; AVX2-NEXT: vpshufb %xmm7, %xmm1, %xmm1 10459; AVX2-NEXT: vpor %xmm1, %xmm15, %xmm1 10460; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm15 10461; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3],ymm0[4,5],ymm15[6],ymm0[7,8,9,10],ymm15[11],ymm0[12,13],ymm15[14],ymm0[15] 10462; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11,2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11] 10463; AVX2-NEXT: # ymm5 = mem[0,1,0,1] 10464; AVX2-NEXT: vpshufb %ymm5, %ymm0, %ymm0 10465; AVX2-NEXT: vpmovsxwq {{.*#+}} xmm15 = [18446744073709551615,255] 10466; AVX2-NEXT: vpblendvb %ymm15, %ymm1, %ymm0, %ymm0 10467; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10468; AVX2-NEXT: vpshufb %xmm8, %xmm3, %xmm0 10469; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm1 10470; AVX2-NEXT: vpshufb %xmm7, %xmm1, %xmm1 10471; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 10472; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1 10473; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] 10474; AVX2-NEXT: vpshufb %ymm5, %ymm1, %ymm1 10475; AVX2-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 10476; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10477; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [3,10,128,128,128,6,13,128,128,u,u,u,u,u,u,u] 10478; AVX2-NEXT: vpshufb %xmm0, %xmm12, %xmm1 
10479; AVX2-NEXT: vextracti128 $1, %ymm12, %xmm2 10480; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,1,8,15,128,128,4,11,u,u,u,u,u,u,u] 10481; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 10482; AVX2-NEXT: vpor %xmm1, %xmm2, %xmm1 10483; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm2 10484; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5,6],ymm2[7,8],ymm4[9,10],ymm2[11],ymm4[12,13,14],ymm2[15] 10485; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12,3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12] 10486; AVX2-NEXT: # ymm4 = mem[0,1,0,1] 10487; AVX2-NEXT: vpshufb %ymm4, %ymm2, %ymm2 10488; AVX2-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm12 10489; AVX2-NEXT: vpshufb %xmm0, %xmm14, %xmm0 10490; AVX2-NEXT: vextracti128 $1, %ymm14, %xmm1 10491; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 10492; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 10493; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm1 10494; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3],ymm6[4,5,6],ymm1[7,8],ymm6[9,10],ymm1[11],ymm6[12,13,14],ymm1[15] 10495; AVX2-NEXT: vpshufb %ymm4, %ymm1, %ymm1 10496; AVX2-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm14 10497; AVX2-NEXT: vextracti128 $1, %ymm13, %xmm0 10498; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,2,9,128,128,128,5,12,u,u,u,u,u,u,u] 10499; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0 10500; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,11,128,128,0,7,14,128,128,u,u,u,u,u,u,u] 10501; AVX2-NEXT: vpshufb %xmm2, %xmm13, %xmm3 10502; AVX2-NEXT: vpor %xmm0, %xmm3, %xmm0 10503; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm3 10504; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm9[1,2,3],ymm3[4],ymm9[5,6],ymm3[7,8],ymm9[9,10,11],ymm3[12],ymm9[13,14],ymm3[15] 10505; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13,4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13] 10506; AVX2-NEXT: # ymm4 = mem[0,1,0,1] 10507; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm3 10508; AVX2-NEXT: vpblendvb %ymm15, %ymm0, %ymm3, %ymm6 10509; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 10510; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm0 10511; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0 10512; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm1 10513; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 10514; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm1 10515; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm10[1,2,3],ymm1[4],ymm10[5,6],ymm1[7,8],ymm10[9,10,11],ymm1[12],ymm10[13,14],ymm1[15] 10516; AVX2-NEXT: vpshufb %ymm4, %ymm1, %ymm1 10517; AVX2-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 10518; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 10519; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1 10520; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,3,10,128,128,128,6,13,u,u,u,u,u,u,u] 10521; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 10522; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [5,12,128,128,1,8,15,128,128,u,u,u,u,u,u,u] 10523; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 10524; AVX2-NEXT: vpor %xmm1, %xmm2, %xmm1 10525; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 10526; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm2 10527; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3],ymm2[4],ymm5[5,6,7,8],ymm2[9],ymm5[10,11],ymm2[12],ymm5[13,14,15] 10528; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14,5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14] 10529; AVX2-NEXT: # ymm5 = mem[0,1,0,1] 10530; AVX2-NEXT: vpshufb %ymm5, %ymm2, %ymm2 10531; AVX2-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm2 10532; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 10533; AVX2-NEXT: vextracti128 
$1, %ymm7, %xmm1 10534; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 10535; AVX2-NEXT: vpshufb %xmm4, %xmm7, %xmm3 10536; AVX2-NEXT: vpor %xmm1, %xmm3, %xmm1 10537; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 10538; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm3 10539; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15] 10540; AVX2-NEXT: vpshufb %ymm5, %ymm3, %ymm3 10541; AVX2-NEXT: vpblendvb %ymm15, %ymm1, %ymm3, %ymm1 10542; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [6,13,128,128,2,9,128,128,128,u,u,u,u,u,u,u] 10543; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 10544; AVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm3 10545; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm5 10546; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [128,128,4,11,128,128,0,7,14,u,u,u,u,u,u,u] 10547; AVX2-NEXT: vpshufb %xmm7, %xmm5, %xmm5 10548; AVX2-NEXT: vpor %xmm3, %xmm5, %xmm3 10549; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 10550; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm5 10551; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0],ymm5[1],ymm8[2,3,4],ymm5[5],ymm8[6,7,8],ymm5[9],ymm8[10,11,12],ymm5[13],ymm8[14,15] 10552; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15,6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15] 10553; AVX2-NEXT: # ymm8 = mem[0,1,0,1] 10554; AVX2-NEXT: vpshufb %ymm8, %ymm5, %ymm5 10555; AVX2-NEXT: vpblendvb %ymm15, %ymm3, %ymm5, %ymm3 10556; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 10557; AVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm4 10558; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm5 10559; AVX2-NEXT: vpshufb %xmm7, %xmm5, %xmm5 10560; AVX2-NEXT: vpor %xmm4, %xmm5, %xmm4 10561; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm5 10562; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm11[0],ymm5[1],ymm11[2,3,4],ymm5[5],ymm11[6,7,8],ymm5[9],ymm11[10,11,12],ymm5[13],ymm11[14,15] 10563; AVX2-NEXT: vpshufb %ymm8, %ymm5, %ymm5 10564; AVX2-NEXT: vpblendvb %ymm15, %ymm4, %ymm5, %ymm4 10565; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 10566; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm5 10567; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,128,128,128,5,12,128,128,1,8,15,u,u,u,u] 10568; AVX2-NEXT: vpshufb %xmm7, %xmm5, %xmm5 10569; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,0,7,14,128,128,3,10,128,128,128,u,u,u,u] 10570; AVX2-NEXT: vpshufb %xmm8, %xmm9, %xmm9 10571; AVX2-NEXT: vpor %xmm5, %xmm9, %xmm5 10572; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm13 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] 10573; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 10574; AVX2-NEXT: vpshufb %xmm13, %xmm9, %xmm9 10575; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm10 = [6,13,6,13,6,13,6,13,6,13,6,13,6,13,6,13] 10576; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 10577; AVX2-NEXT: vpshufb %xmm10, %xmm11, %xmm11 10578; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] 10579; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 10580; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 10581; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm9[7] 10582; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 10583; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0],ymm5[1,2,3,4,5,6,7],ymm9[8],ymm5[9,10,11,12,13,14,15] 10584; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] 10585; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 10586; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm9 10587; 
AVX2-NEXT: vpshufb %xmm7, %xmm9, %xmm7 10588; AVX2-NEXT: vpshufb %xmm8, %xmm11, %xmm8 10589; AVX2-NEXT: vpor %xmm7, %xmm8, %xmm7 10590; AVX2-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload 10591; AVX2-NEXT: vpshufb %xmm13, %xmm8, %xmm8 10592; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 10593; AVX2-NEXT: vpshufb %xmm10, %xmm9, %xmm9 10594; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] 10595; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 10596; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 10597; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] 10598; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 10599; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15] 10600; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] 10601; AVX2-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload 10602; AVX2-NEXT: # ymm8 = ymm12[0],mem[1,2,3,4,5,6,7],ymm12[8],mem[9,10,11,12,13,14,15] 10603; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] 10604; AVX2-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload 10605; AVX2-NEXT: # ymm9 = ymm14[0],mem[1,2,3,4,5,6,7],ymm14[8],mem[9,10,11,12,13,14,15] 10606; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7] 10607; AVX2-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm10 # 32-byte Folded Reload 10608; AVX2-NEXT: # ymm10 = ymm6[0],mem[1,2,3,4,5,6,7],ymm6[8],mem[9,10,11,12,13,14,15] 10609; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5,6,7] 10610; AVX2-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload 10611; AVX2-NEXT: # ymm10 = ymm0[0],mem[1,2,3,4,5,6,7],ymm0[8],mem[9,10,11,12,13,14,15] 10612; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] 10613; AVX2-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload 10614; AVX2-NEXT: # ymm10 = ymm2[0],mem[1,2,3,4,5,6,7],ymm2[8],mem[9,10,11,12,13,14,15] 10615; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] 10616; AVX2-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload 10617; AVX2-NEXT: # ymm10 = ymm1[0],mem[1,2,3,4,5,6,7],ymm1[8],mem[9,10,11,12,13,14,15] 10618; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] 10619; AVX2-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload 10620; AVX2-NEXT: # ymm10 = ymm3[0],mem[1,2,3,4,5,6,7],ymm3[8],mem[9,10,11,12,13,14,15] 10621; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] 10622; AVX2-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm10 # 32-byte Folded Reload 10623; AVX2-NEXT: # ymm10 = ymm4[0],mem[1,2,3,4,5,6,7],ymm4[8],mem[9,10,11,12,13,14,15] 10624; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] 10625; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 10626; AVX2-NEXT: vmovaps %ymm10, 32(%rsi) 10627; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 10628; AVX2-NEXT: vmovaps %ymm10, (%rsi) 10629; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 10630; AVX2-NEXT: vmovaps %ymm10, 32(%rdx) 10631; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 10632; AVX2-NEXT: vmovaps %ymm10, (%rdx) 10633; AVX2-NEXT: vmovdqa %ymm5, 32(%rcx) 10634; AVX2-NEXT: vmovdqa %ymm7, (%rcx) 10635; AVX2-NEXT: vmovdqa %ymm8, 32(%r8) 10636; 
AVX2-NEXT: vmovdqa %ymm9, (%r8) 10637; AVX2-NEXT: vmovdqa %ymm6, 32(%r9) 10638; AVX2-NEXT: vmovdqa %ymm0, (%r9) 10639; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 10640; AVX2-NEXT: vmovdqa %ymm2, 32(%rax) 10641; AVX2-NEXT: vmovdqa %ymm1, (%rax) 10642; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 10643; AVX2-NEXT: vmovdqa %ymm3, 32(%rax) 10644; AVX2-NEXT: vmovdqa %ymm4, (%rax) 10645; AVX2-NEXT: addq $760, %rsp # imm = 0x2F8 10646; AVX2-NEXT: vzeroupper 10647; AVX2-NEXT: retq 10648; 10649; AVX2-FP-LABEL: load_i8_stride7_vf64: 10650; AVX2-FP: # %bb.0: 10651; AVX2-FP-NEXT: subq $760, %rsp # imm = 0x2F8 10652; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm6 10653; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm7 10654; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm8 10655; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm12 10656; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm10 10657; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm11 10658; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm5 10659; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] 10660; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm12, %ymm10, %ymm0 10661; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10662; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10663; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm2 10664; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,5,12,128,128,1,8,15,u,u,u,u,u,u] 10665; AVX2-FP-NEXT: vpshufb %xmm1, %xmm2, %xmm3 10666; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,14,128,128,3,10,128,128,128,u,u,u,u,u,u] 10667; AVX2-FP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 10668; AVX2-FP-NEXT: vpor %xmm3, %xmm0, %xmm0 10669; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] 10670; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm5, %ymm11, %ymm3 10671; AVX2-FP-NEXT: vmovdqa %ymm5, %ymm9 10672; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10673; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10674; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4 10675; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] 10676; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9,0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9] 10677; AVX2-FP-NEXT: # ymm3 = mem[0,1,0,1] 10678; AVX2-FP-NEXT: vpshufb %ymm3, %ymm4, %ymm5 10679; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,0,0] 10680; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm0, %ymm5, %ymm0 10681; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10682; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm7, %ymm8, %ymm5 10683; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10684; AVX2-FP-NEXT: vmovdqa %ymm7, %ymm0 10685; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10686; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm7 10687; AVX2-FP-NEXT: vpshufb %xmm1, %xmm7, %xmm1 10688; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm15 10689; AVX2-FP-NEXT: vpshufb %xmm2, %xmm5, %xmm2 10690; AVX2-FP-NEXT: vpor %xmm1, %xmm2, %xmm1 10691; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm6, %ymm15, %ymm2 10692; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10693; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10694; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm5 10695; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15] 10696; AVX2-FP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 10697; 
AVX2-FP-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 10698; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10699; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm11, %ymm9, %ymm2 10700; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 10701; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7,8,9],ymm3[10],ymm2[11,12,13],ymm3[14],ymm2[15] 10702; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] 10703; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm12, %ymm10, %ymm5 10704; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm3 10705; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,6,13,128,128,2,9,u,u,u,u,u,u,u] 10706; AVX2-FP-NEXT: vpshufb %xmm2, %xmm3, %xmm7 10707; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,8,15,128,128,4,11,128,128,u,u,u,u,u,u,u] 10708; AVX2-FP-NEXT: vpshufb %xmm3, %xmm5, %xmm5 10709; AVX2-FP-NEXT: vpor %xmm7, %xmm5, %xmm5 10710; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10,1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10] 10711; AVX2-FP-NEXT: # ymm7 = mem[0,1,0,1] 10712; AVX2-FP-NEXT: vpshufb %ymm7, %ymm4, %ymm4 10713; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} ymm9 = [0,18446744073709551360,16777215,0] 10714; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm4, %ymm5, %ymm4 10715; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10716; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm15, %ymm6, %ymm4 10717; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm5 10718; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7,8,9],ymm5[10],ymm4[11,12,13],ymm5[14],ymm4[15] 10719; AVX2-FP-NEXT: vpshufb %ymm7, %ymm4, %ymm4 10720; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm0, %ymm8, %ymm5 10721; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm7 10722; AVX2-FP-NEXT: vpshufb %xmm2, %xmm7, %xmm2 10723; AVX2-FP-NEXT: vpshufb %xmm3, %xmm5, %xmm3 10724; AVX2-FP-NEXT: vpor %xmm2, %xmm3, %xmm2 10725; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm4, %ymm2, %ymm0 10726; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10727; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm9 10728; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm15 10729; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] 10730; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm9, %ymm15, %ymm2 10731; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10732; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 10733; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,128,128,3,10,128,128,128,6,13,u,u,u,u] 10734; AVX2-FP-NEXT: vpshufb %xmm8, %xmm3, %xmm3 10735; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,5,12,128,128,1,8,15,128,128,u,u,u,u] 10736; AVX2-FP-NEXT: vpshufb %xmm7, %xmm2, %xmm2 10737; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2 10738; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9] 10739; AVX2-FP-NEXT: vmovdqa 208(%rdi), %xmm5 10740; AVX2-FP-NEXT: vpshufb %xmm12, %xmm5, %xmm3 10741; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] 10742; AVX2-FP-NEXT: vmovdqa 192(%rdi), %xmm1 10743; AVX2-FP-NEXT: vpshufb %xmm11, %xmm1, %xmm10 10744; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10745; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3] 10746; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 10747; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 10748; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] 10749; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} ymm14 = 
[18446744073709551615,18446744073709551615,16777215,0] 10750; AVX2-FP-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload 10751; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10752; AVX2-FP-NEXT: vmovdqa 384(%rdi), %ymm2 10753; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm3 10754; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm0 10755; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm4 10756; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm6 10757; AVX2-FP-NEXT: vpshufb %xmm8, %xmm6, %xmm6 10758; AVX2-FP-NEXT: vpshufb %xmm7, %xmm0, %xmm0 10759; AVX2-FP-NEXT: vpor %xmm6, %xmm0, %xmm0 10760; AVX2-FP-NEXT: vmovdqa 432(%rdi), %xmm13 10761; AVX2-FP-NEXT: vpshufb %xmm12, %xmm13, %xmm6 10762; AVX2-FP-NEXT: vmovdqa 416(%rdi), %xmm2 10763; AVX2-FP-NEXT: vpshufb %xmm11, %xmm2, %xmm8 10764; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] 10765; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 10766; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 10767; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7] 10768; AVX2-FP-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 10769; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10770; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] 10771; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm9, %ymm15, %ymm0 10772; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,6,13,128,128,2,9,128,128,128,u,u,u,u] 10773; AVX2-FP-NEXT: vpshufb %xmm10, %xmm0, %xmm6 10774; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm0 10775; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,128,128,4,11,128,128,0,7,14,u,u,u,u] 10776; AVX2-FP-NEXT: vpshufb %xmm8, %xmm0, %xmm0 10777; AVX2-FP-NEXT: vpor %xmm6, %xmm0, %xmm0 10778; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10] 10779; AVX2-FP-NEXT: vpshufb %xmm6, %xmm5, %xmm9 10780; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [5,12,5,12,5,12,5,12,5,12,5,12,5,12,5,12] 10781; AVX2-FP-NEXT: vpshufb %xmm12, %xmm1, %xmm11 10782; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] 10783; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 10784; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 10785; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm9[7] 10786; AVX2-FP-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 10787; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10788; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm0 10789; AVX2-FP-NEXT: vmovdqa %ymm3, %ymm11 10790; AVX2-FP-NEXT: vpshufb %xmm10, %xmm0, %xmm1 10791; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm0 10792; AVX2-FP-NEXT: vpshufb %xmm8, %xmm0, %xmm0 10793; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0 10794; AVX2-FP-NEXT: vpshufb %xmm6, %xmm13, %xmm1 10795; AVX2-FP-NEXT: vpshufb %xmm12, %xmm2, %xmm6 10796; AVX2-FP-NEXT: vmovdqa %xmm2, %xmm12 10797; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] 10798; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 10799; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 10800; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] 10801; AVX2-FP-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 10802; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10803; AVX2-FP-NEXT: vpmovsxbw 
{{.*#+}} ymm2 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] 10804; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm3, %ymm4, %ymm0 10805; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10806; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1 10807; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,128,128,128,6,13,128,128,2,9,u,u,u,u,u] 10808; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 10809; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,1,8,15,128,128,4,11,128,128,u,u,u,u,u] 10810; AVX2-FP-NEXT: vpshufb %xmm7, %xmm0, %xmm0 10811; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0 10812; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12] 10813; AVX2-FP-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10814; AVX2-FP-NEXT: vpshufb %xmm1, %xmm13, %xmm8 10815; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm9 = [0,0,0,0,7,14,128,128,0,0,0,0,7,14,128,128] 10816; AVX2-FP-NEXT: vpshufb %xmm9, %xmm12, %xmm10 10817; AVX2-FP-NEXT: vmovdqa %xmm12, %xmm3 10818; AVX2-FP-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 10819; AVX2-FP-NEXT: vpor %xmm8, %xmm10, %xmm8 10820; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm10 10821; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 10822; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] 10823; AVX2-FP-NEXT: # ymm0 = mem[0,1,0,1] 10824; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm10, %ymm8, %ymm8 10825; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10826; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 10827; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm15, %ymm12, %ymm8 10828; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm10 10829; AVX2-FP-NEXT: vpshufb %xmm6, %xmm10, %xmm6 10830; AVX2-FP-NEXT: vpshufb %xmm7, %xmm8, %xmm7 10831; AVX2-FP-NEXT: vpor %xmm6, %xmm7, %xmm6 10832; AVX2-FP-NEXT: vpshufb %xmm1, %xmm5, %xmm1 10833; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 10834; AVX2-FP-NEXT: vpshufb %xmm9, %xmm14, %xmm7 10835; AVX2-FP-NEXT: vpor %xmm1, %xmm7, %xmm1 10836; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 10837; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 10838; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm6, %ymm1, %ymm1 10839; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10840; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] 10841; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm11, %ymm4, %ymm1 10842; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,2,9,128,128,128,5,12,128,128,u,u,u,u,u] 10843; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm7 10844; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm1 10845; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,128,128,0,7,14,128,128,3,10,u,u,u,u,u] 10846; AVX2-FP-NEXT: vpshufb %xmm8, %xmm1, %xmm1 10847; AVX2-FP-NEXT: vpor %xmm7, %xmm1, %xmm1 10848; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13] 10849; AVX2-FP-NEXT: vpshufb %xmm7, %xmm13, %xmm9 10850; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128] 10851; AVX2-FP-NEXT: vpshufb %xmm10, %xmm3, %xmm11 10852; AVX2-FP-NEXT: vpor %xmm9, %xmm11, %xmm9 10853; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 10854; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 10855; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm9, %ymm1 10856; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10857; AVX2-FP-NEXT: vmovdqu %ymm15, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10858; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm15, %ymm12, %ymm1 10859; AVX2-FP-NEXT: vmovdqa %ymm12, %ymm2 10860; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm6 10861; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm1 10862; AVX2-FP-NEXT: vpshufb %xmm8, %xmm1, %xmm1 10863; AVX2-FP-NEXT: vpor %xmm6, %xmm1, %xmm1 10864; AVX2-FP-NEXT: vpshufb %xmm7, %xmm5, %xmm6 10865; AVX2-FP-NEXT: vmovdqa %xmm5, %xmm13 10866; AVX2-FP-NEXT: vpshufb %xmm10, %xmm14, %xmm7 10867; AVX2-FP-NEXT: vpor %xmm6, %xmm7, %xmm6 10868; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 10869; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 10870; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm6, %ymm1 10871; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10872; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535] 10873; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 10874; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm1 10875; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,3,10,128,128,128,6,13,128,128,u,u,u,u,u] 10876; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm7 10877; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm1 10878; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,128,128,1,8,15,128,128,4,11,u,u,u,u,u] 10879; AVX2-FP-NEXT: vpshufb %xmm8, %xmm1, %xmm1 10880; AVX2-FP-NEXT: vpor %xmm7, %xmm1, %xmm1 10881; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128] 10882; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 10883; AVX2-FP-NEXT: vpshufb %xmm7, %xmm12, %xmm9 10884; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,0,0,128,128,0,7,14,0,0,0,128,128,0,7,14] 10885; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 10886; AVX2-FP-NEXT: vpshufb %xmm10, %xmm14, %xmm11 10887; AVX2-FP-NEXT: vpor %xmm9, %xmm11, %xmm9 10888; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 10889; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 10890; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm9, %ymm1 10891; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10892; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm15, %ymm2, %ymm1 10893; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm6 10894; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm1 10895; AVX2-FP-NEXT: vpshufb %xmm8, %xmm1, %xmm1 10896; AVX2-FP-NEXT: vpor %xmm6, %xmm1, %xmm1 10897; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 10898; AVX2-FP-NEXT: vpshufb %xmm7, %xmm11, %xmm6 10899; AVX2-FP-NEXT: vpshufb %xmm10, %xmm13, %xmm7 10900; AVX2-FP-NEXT: vmovdqa %xmm13, (%rsp) # 16-byte Spill 10901; AVX2-FP-NEXT: vpor %xmm6, %xmm7, %xmm6 10902; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 10903; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 10904; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm6, %ymm1 10905; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10906; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] 10907; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm3, %ymm4, %ymm2 10908; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10909; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] 10910; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm4, %ymm3, %ymm1 10911; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2 10912; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,128,128,2,9,128,128,128,5,12,u,u,u,u,u] 10913; AVX2-FP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 10914; AVX2-FP-NEXT: vmovdqa {{.*#+}} 
xmm6 = [u,u,4,11,128,128,0,7,14,128,128,u,u,u,u,u] 10915; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 10916; AVX2-FP-NEXT: vpor %xmm2, %xmm1, %xmm1 10917; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,0,0,3,10,128,128,128,0,0,0,3,10,128,128,128] 10918; AVX2-FP-NEXT: vpshufb %xmm2, %xmm12, %xmm7 10919; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm8 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15] 10920; AVX2-FP-NEXT: vpshufb %xmm8, %xmm14, %xmm9 10921; AVX2-FP-NEXT: vpor %xmm7, %xmm9, %xmm7 10922; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 10923; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 10924; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm7, %ymm1 10925; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10926; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 10927; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 10928; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm4, %ymm1, %ymm5 10929; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10930; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm1, %ymm4, %ymm1 10931; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm4 10932; AVX2-FP-NEXT: vpshufb %xmm3, %xmm4, %xmm3 10933; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 10934; AVX2-FP-NEXT: vpor %xmm3, %xmm1, %xmm1 10935; AVX2-FP-NEXT: vpshufb %xmm2, %xmm11, %xmm2 10936; AVX2-FP-NEXT: vpshufb %xmm8, %xmm13, %xmm3 10937; AVX2-FP-NEXT: vpor %xmm2, %xmm3, %xmm2 10938; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 10939; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 10940; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 10941; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10942; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10943; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 10944; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] 10945; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm0, %ymm2, %ymm1 10946; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535] 10947; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm12 10948; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm2, %ymm0, %ymm13 10949; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] 10950; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm3 10951; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10952; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] 10953; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 10954; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10955; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 10956; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 10957; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm3 10958; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10959; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm0, %ymm2, %ymm3 10960; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm14 10961; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm2, %ymm0, %ymm4 10962; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10963; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm0 10964; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10965; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 10966; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 10967; AVX2-FP-NEXT: vpmovsxbw 
{{.*#+}} ymm10 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] 10968; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm6, %ymm2, %ymm0 10969; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm6, %ymm2, %ymm4 10970; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm6, %ymm2, %ymm9 10971; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm2, %ymm6, %ymm8 10972; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10973; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm2, %ymm6, %ymm2 10974; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10975; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 10976; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 10977; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm2 10978; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm7, %ymm8, %ymm6 10979; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm7, %ymm8, %ymm10 10980; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm8, %ymm7, %ymm5 10981; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10982; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] 10983; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm8, %ymm7, %ymm11 10984; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm8 = [2,9,128,128,128,5,12,128,128,u,u,u,u,u,u,u] 10985; AVX2-FP-NEXT: vpshufb %xmm8, %xmm1, %xmm15 10986; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm1 10987; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [128,128,0,7,14,128,128,3,10,u,u,u,u,u,u,u] 10988; AVX2-FP-NEXT: vpshufb %xmm7, %xmm1, %xmm1 10989; AVX2-FP-NEXT: vpor %xmm1, %xmm15, %xmm1 10990; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm15 10991; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3],ymm0[4,5],ymm15[6],ymm0[7,8,9,10],ymm15[11],ymm0[12,13],ymm15[14],ymm0[15] 10992; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11,2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11] 10993; AVX2-FP-NEXT: # ymm5 = mem[0,1,0,1] 10994; AVX2-FP-NEXT: vpshufb %ymm5, %ymm0, %ymm0 10995; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} xmm15 = [18446744073709551615,255] 10996; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm1, %ymm0, %ymm0 10997; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 10998; AVX2-FP-NEXT: vpshufb %xmm8, %xmm3, %xmm0 10999; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm1 11000; AVX2-FP-NEXT: vpshufb %xmm7, %xmm1, %xmm1 11001; AVX2-FP-NEXT: vpor %xmm0, %xmm1, %xmm0 11002; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm1 11003; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] 11004; AVX2-FP-NEXT: vpshufb %ymm5, %ymm1, %ymm1 11005; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 11006; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11007; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [3,10,128,128,128,6,13,128,128,u,u,u,u,u,u,u] 11008; AVX2-FP-NEXT: vpshufb %xmm0, %xmm12, %xmm1 11009; AVX2-FP-NEXT: vextracti128 $1, %ymm12, %xmm2 11010; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,1,8,15,128,128,4,11,u,u,u,u,u,u,u] 11011; AVX2-FP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 11012; AVX2-FP-NEXT: vpor %xmm1, %xmm2, %xmm1 11013; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm2 11014; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5,6],ymm2[7,8],ymm4[9,10],ymm2[11],ymm4[12,13,14],ymm2[15] 11015; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12,3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12] 11016; AVX2-FP-NEXT: # ymm4 = mem[0,1,0,1] 11017; AVX2-FP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 11018; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm12 
11019; AVX2-FP-NEXT: vpshufb %xmm0, %xmm14, %xmm0 11020; AVX2-FP-NEXT: vextracti128 $1, %ymm14, %xmm1 11021; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 11022; AVX2-FP-NEXT: vpor %xmm0, %xmm1, %xmm0 11023; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm1 11024; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3],ymm6[4,5,6],ymm1[7,8],ymm6[9,10],ymm1[11],ymm6[12,13,14],ymm1[15] 11025; AVX2-FP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 11026; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm14 11027; AVX2-FP-NEXT: vextracti128 $1, %ymm13, %xmm0 11028; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,2,9,128,128,128,5,12,u,u,u,u,u,u,u] 11029; AVX2-FP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 11030; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [4,11,128,128,0,7,14,128,128,u,u,u,u,u,u,u] 11031; AVX2-FP-NEXT: vpshufb %xmm2, %xmm13, %xmm3 11032; AVX2-FP-NEXT: vpor %xmm0, %xmm3, %xmm0 11033; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm3 11034; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm9[1,2,3],ymm3[4],ymm9[5,6],ymm3[7,8],ymm9[9,10,11],ymm3[12],ymm9[13,14],ymm3[15] 11035; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13,4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13] 11036; AVX2-FP-NEXT: # ymm4 = mem[0,1,0,1] 11037; AVX2-FP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 11038; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm0, %ymm3, %ymm6 11039; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 11040; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm0 11041; AVX2-FP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 11042; AVX2-FP-NEXT: vpshufb %xmm2, %xmm3, %xmm1 11043; AVX2-FP-NEXT: vpor %xmm0, %xmm1, %xmm0 11044; AVX2-FP-NEXT: vextracti128 $1, %ymm10, %xmm1 11045; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm10[1,2,3],ymm1[4],ymm10[5,6],ymm1[7,8],ymm10[9,10,11],ymm1[12],ymm10[13,14],ymm1[15] 11046; AVX2-FP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 11047; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 11048; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 11049; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm1 11050; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,3,10,128,128,128,6,13,u,u,u,u,u,u,u] 11051; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 11052; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [5,12,128,128,1,8,15,128,128,u,u,u,u,u,u,u] 11053; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm2 11054; AVX2-FP-NEXT: vpor %xmm1, %xmm2, %xmm1 11055; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 11056; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm2 11057; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3],ymm2[4],ymm5[5,6,7,8],ymm2[9],ymm5[10,11],ymm2[12],ymm5[13,14,15] 11058; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14,5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14] 11059; AVX2-FP-NEXT: # ymm5 = mem[0,1,0,1] 11060; AVX2-FP-NEXT: vpshufb %ymm5, %ymm2, %ymm2 11061; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm2 11062; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 11063; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm1 11064; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 11065; AVX2-FP-NEXT: vpshufb %xmm4, %xmm7, %xmm3 11066; AVX2-FP-NEXT: vpor %xmm1, %xmm3, %xmm1 11067; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 11068; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm3 11069; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15] 11070; AVX2-FP-NEXT: vpshufb %ymm5, %ymm3, %ymm3 11071; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm1, %ymm3, %ymm1 
11072; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [6,13,128,128,2,9,128,128,128,u,u,u,u,u,u,u] 11073; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 11074; AVX2-FP-NEXT: vpshufb %xmm4, %xmm5, %xmm3 11075; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm5 11076; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [128,128,4,11,128,128,0,7,14,u,u,u,u,u,u,u] 11077; AVX2-FP-NEXT: vpshufb %xmm7, %xmm5, %xmm5 11078; AVX2-FP-NEXT: vpor %xmm3, %xmm5, %xmm3 11079; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 11080; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm5 11081; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0],ymm5[1],ymm8[2,3,4],ymm5[5],ymm8[6,7,8],ymm5[9],ymm8[10,11,12],ymm5[13],ymm8[14,15] 11082; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15,6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15] 11083; AVX2-FP-NEXT: # ymm8 = mem[0,1,0,1] 11084; AVX2-FP-NEXT: vpshufb %ymm8, %ymm5, %ymm5 11085; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm3, %ymm5, %ymm3 11086; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 11087; AVX2-FP-NEXT: vpshufb %xmm4, %xmm5, %xmm4 11088; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm5 11089; AVX2-FP-NEXT: vpshufb %xmm7, %xmm5, %xmm5 11090; AVX2-FP-NEXT: vpor %xmm4, %xmm5, %xmm4 11091; AVX2-FP-NEXT: vextracti128 $1, %ymm11, %xmm5 11092; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm11[0],ymm5[1],ymm11[2,3,4],ymm5[5],ymm11[6,7,8],ymm5[9],ymm11[10,11,12],ymm5[13],ymm11[14,15] 11093; AVX2-FP-NEXT: vpshufb %ymm8, %ymm5, %ymm5 11094; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm4, %ymm5, %ymm4 11095; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 11096; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm5 11097; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,128,128,128,5,12,128,128,1,8,15,u,u,u,u] 11098; AVX2-FP-NEXT: vpshufb %xmm7, %xmm5, %xmm5 11099; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,0,7,14,128,128,3,10,128,128,128,u,u,u,u] 11100; AVX2-FP-NEXT: vpshufb %xmm8, %xmm9, %xmm9 11101; AVX2-FP-NEXT: vpor %xmm5, %xmm9, %xmm5 11102; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] 11103; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 11104; AVX2-FP-NEXT: vpshufb %xmm13, %xmm9, %xmm9 11105; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [6,13,6,13,6,13,6,13,6,13,6,13,6,13,6,13] 11106; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 11107; AVX2-FP-NEXT: vpshufb %xmm10, %xmm11, %xmm11 11108; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] 11109; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 11110; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 11111; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm9[7] 11112; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 11113; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0],ymm5[1,2,3,4,5,6,7],ymm9[8],ymm5[9,10,11,12,13,14,15] 11114; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] 11115; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 11116; AVX2-FP-NEXT: vextracti128 $1, %ymm11, %xmm9 11117; AVX2-FP-NEXT: vpshufb %xmm7, %xmm9, %xmm7 11118; AVX2-FP-NEXT: vpshufb %xmm8, %xmm11, %xmm8 11119; AVX2-FP-NEXT: vpor %xmm7, %xmm8, %xmm7 11120; AVX2-FP-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload 11121; AVX2-FP-NEXT: vpshufb %xmm13, %xmm8, %xmm8 11122; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 11123; AVX2-FP-NEXT: vpshufb %xmm10, %xmm9, %xmm9 
11124; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] 11125; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 11126; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 11127; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] 11128; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 11129; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15] 11130; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] 11131; AVX2-FP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload 11132; AVX2-FP-NEXT: # ymm8 = ymm12[0],mem[1,2,3,4,5,6,7],ymm12[8],mem[9,10,11,12,13,14,15] 11133; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] 11134; AVX2-FP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload 11135; AVX2-FP-NEXT: # ymm9 = ymm14[0],mem[1,2,3,4,5,6,7],ymm14[8],mem[9,10,11,12,13,14,15] 11136; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7] 11137; AVX2-FP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm10 # 32-byte Folded Reload 11138; AVX2-FP-NEXT: # ymm10 = ymm6[0],mem[1,2,3,4,5,6,7],ymm6[8],mem[9,10,11,12,13,14,15] 11139; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5,6,7] 11140; AVX2-FP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload 11141; AVX2-FP-NEXT: # ymm10 = ymm0[0],mem[1,2,3,4,5,6,7],ymm0[8],mem[9,10,11,12,13,14,15] 11142; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] 11143; AVX2-FP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload 11144; AVX2-FP-NEXT: # ymm10 = ymm2[0],mem[1,2,3,4,5,6,7],ymm2[8],mem[9,10,11,12,13,14,15] 11145; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] 11146; AVX2-FP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload 11147; AVX2-FP-NEXT: # ymm10 = ymm1[0],mem[1,2,3,4,5,6,7],ymm1[8],mem[9,10,11,12,13,14,15] 11148; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] 11149; AVX2-FP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload 11150; AVX2-FP-NEXT: # ymm10 = ymm3[0],mem[1,2,3,4,5,6,7],ymm3[8],mem[9,10,11,12,13,14,15] 11151; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] 11152; AVX2-FP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm10 # 32-byte Folded Reload 11153; AVX2-FP-NEXT: # ymm10 = ymm4[0],mem[1,2,3,4,5,6,7],ymm4[8],mem[9,10,11,12,13,14,15] 11154; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] 11155; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 11156; AVX2-FP-NEXT: vmovaps %ymm10, 32(%rsi) 11157; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 11158; AVX2-FP-NEXT: vmovaps %ymm10, (%rsi) 11159; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 11160; AVX2-FP-NEXT: vmovaps %ymm10, 32(%rdx) 11161; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 11162; AVX2-FP-NEXT: vmovaps %ymm10, (%rdx) 11163; AVX2-FP-NEXT: vmovdqa %ymm5, 32(%rcx) 11164; AVX2-FP-NEXT: vmovdqa %ymm7, (%rcx) 11165; AVX2-FP-NEXT: vmovdqa %ymm8, 32(%r8) 11166; AVX2-FP-NEXT: vmovdqa %ymm9, (%r8) 11167; AVX2-FP-NEXT: vmovdqa %ymm6, 32(%r9) 11168; AVX2-FP-NEXT: vmovdqa %ymm0, (%r9) 11169; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 11170; AVX2-FP-NEXT: vmovdqa %ymm2, 32(%rax) 11171; 
AVX2-FP-NEXT: vmovdqa %ymm1, (%rax) 11172; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 11173; AVX2-FP-NEXT: vmovdqa %ymm3, 32(%rax) 11174; AVX2-FP-NEXT: vmovdqa %ymm4, (%rax) 11175; AVX2-FP-NEXT: addq $760, %rsp # imm = 0x2F8 11176; AVX2-FP-NEXT: vzeroupper 11177; AVX2-FP-NEXT: retq 11178; 11179; AVX2-FCP-LABEL: load_i8_stride7_vf64: 11180; AVX2-FCP: # %bb.0: 11181; AVX2-FCP-NEXT: subq $776, %rsp # imm = 0x308 11182; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm15 11183; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm6 11184; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm10 11185; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm13 11186; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm12 11187; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm11 11188; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm9 11189; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] 11190; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm13, %ymm12, %ymm0 11191; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11192; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11193; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 11194; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,5,12,128,128,1,8,15,u,u,u,u,u,u] 11195; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm3 11196; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,14,128,128,3,10,128,128,128,u,u,u,u,u,u] 11197; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 11198; AVX2-FCP-NEXT: vpor %xmm3, %xmm0, %xmm0 11199; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] 11200; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm9, %ymm11, %ymm3 11201; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11202; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11203; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 11204; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] 11205; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9,0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9] 11206; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1] 11207; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm5 11208; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,0,0] 11209; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm0, %ymm5, %ymm0 11210; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11211; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm6, %ymm10, %ymm5 11212; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11213; AVX2-FCP-NEXT: vmovdqa %ymm6, %ymm7 11214; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11215; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 11216; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm1 11217; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm8 11218; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm2 11219; AVX2-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 11220; AVX2-FCP-NEXT: vmovdqa %ymm15, %ymm0 11221; AVX2-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11222; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm15, %ymm8, %ymm2 11223; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11224; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm5 11225; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15] 11226; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 11227; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 11228; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11229; 
AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] 11230; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm11, %ymm9, %ymm2 11231; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 11232; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7,8,9],ymm3[10],ymm2[11,12,13],ymm3[14],ymm2[15] 11233; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] 11234; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm13, %ymm12, %ymm4 11235; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 11236; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,6,13,128,128,2,9,u,u,u,u,u,u,u] 11237; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm5 11238; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [1,8,15,128,128,4,11,128,128,u,u,u,u,u,u,u] 11239; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm4 11240; AVX2-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 11241; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10,1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10] 11242; AVX2-FCP-NEXT: # ymm5 = mem[0,1,0,1] 11243; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3 11244; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} ymm1 = [0,18446744073709551360,16777215,0] 11245; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm3 11246; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11247; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm8, %ymm0, %ymm3 11248; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm8 11249; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm8[2],ymm3[3,4,5],ymm8[6],ymm3[7,8,9],ymm8[10],ymm3[11,12,13],ymm8[14],ymm3[15] 11250; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3 11251; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm7, %ymm10, %ymm5 11252; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm8 11253; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm2 11254; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm5 11255; AVX2-FCP-NEXT: vpor %xmm2, %xmm5, %xmm2 11256; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm3, %ymm2, %ymm5 11257; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm10 11258; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm12 11259; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] 11260; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm10, %ymm12, %ymm2 11261; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11262; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11263; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 11264; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,128,128,3,10,128,128,128,6,13,u,u,u,u] 11265; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm3 11266; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,5,12,128,128,1,8,15,128,128,u,u,u,u] 11267; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm2 11268; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 11269; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 11270; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm8 11271; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,2,0,2,1,2,4,6] 11272; AVX2-FCP-NEXT: vpermd %ymm8, %ymm4, %ymm3 11273; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11274; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13] 11275; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm3 11276; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] 11277; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} ymm6 = [18446744073709551615,18446744073709551615,16777215,0] 11278; AVX2-FCP-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload 11279; AVX2-FCP-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11280; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm2 11281; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm14 11282; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm2, %ymm14, %ymm0 11283; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm3 11284; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm13 11285; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm13, %xmm7 11286; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm0 11287; AVX2-FCP-NEXT: vpor %xmm7, %xmm0, %xmm0 11288; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm9 11289; AVX2-FCP-NEXT: vpermd %ymm9, %ymm4, %ymm4 11290; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11291; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm1 11292; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 11293; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] 11294; AVX2-FCP-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 11295; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11296; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] 11297; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm0 11298; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,6,13,128,128,2,9,128,128,128,u,u,u,u] 11299; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm4 11300; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 11301; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,128,128,4,11,128,128,0,7,14,u,u,u,u] 11302; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm0 11303; AVX2-FCP-NEXT: vpor %xmm4, %xmm0, %xmm0 11304; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 11305; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,3,0,2,1,3,4,6] 11306; AVX2-FCP-NEXT: vpermd %ymm8, %ymm4, %ymm11 11307; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14] 11308; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm11, %ymm11 11309; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7] 11310; AVX2-FCP-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 11311; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11312; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm2, %ymm14, %ymm0 11313; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm1 11314; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 11315; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm0 11316; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 11317; AVX2-FCP-NEXT: vpermd %ymm9, %ymm4, %ymm1 11318; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm1 11319; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 11320; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] 11321; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0 11322; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11323; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] 11324; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm14, %ymm2, %ymm0 11325; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11326; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 11327; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,128,128,128,6,13,128,128,2,9,u,u,u,u,u] 11328; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm1 11329; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,1,8,15,128,128,4,11,128,128,u,u,u,u,u] 11330; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm0 11331; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 11332; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12] 11333; AVX2-FCP-NEXT: vmovdqa 432(%rdi), %xmm13 11334; 
AVX2-FCP-NEXT: vpshufb %xmm10, %xmm13, %xmm1 11335; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm11 = [0,0,0,0,7,14,128,128,0,0,0,0,7,14,128,128] 11336; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %xmm15 11337; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm15, %xmm12 11338; AVX2-FCP-NEXT: vpor %xmm1, %xmm12, %xmm1 11339; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm12 11340; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 11341; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] 11342; AVX2-FCP-NEXT: # ymm9 = mem[0,1,0,1] 11343; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm12, %ymm1, %ymm1 11344; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11345; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11346; AVX2-FCP-NEXT: vpblendvb %ymm7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload 11347; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm12 11348; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm12, %xmm4 11349; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 11350; AVX2-FCP-NEXT: vpor %xmm4, %xmm1, %xmm6 11351; AVX2-FCP-NEXT: vmovdqa 208(%rdi), %xmm2 11352; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm10 11353; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11354; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %xmm8 11355; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm11 11356; AVX2-FCP-NEXT: vpor %xmm10, %xmm11, %xmm10 11357; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 11358; AVX2-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 11359; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm6, %ymm10, %ymm1 11360; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11361; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] 11362; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm14, %ymm3, %ymm6 11363; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm14 11364; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm3 11365; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,2,9,128,128,128,5,12,128,128,u,u,u,u,u] 11366; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm6, %xmm11 11367; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 11368; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [u,u,128,128,0,7,14,128,128,3,10,u,u,u,u,u] 11369; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm6, %xmm6 11370; AVX2-FCP-NEXT: vpor %xmm6, %xmm11, %xmm6 11371; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm11 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13] 11372; AVX2-FCP-NEXT: vmovdqa %xmm13, %xmm5 11373; AVX2-FCP-NEXT: vmovdqa %xmm13, (%rsp) # 16-byte Spill 11374; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm13, %xmm13 11375; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128] 11376; AVX2-FCP-NEXT: vmovdqa %xmm15, %xmm7 11377; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm15, %xmm15 11378; AVX2-FCP-NEXT: vpor %xmm13, %xmm15, %xmm13 11379; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 11380; AVX2-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 11381; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm6, %ymm13, %ymm0 11382; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11383; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11384; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 11385; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm4, %ymm0, %ymm6 11386; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm6, %xmm10 11387; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 11388; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm6, %xmm6 11389; AVX2-FCP-NEXT: vpor %xmm6, %xmm10, %xmm6 11390; AVX2-FCP-NEXT: vpshufb 
%xmm11, %xmm2, %xmm10 11391; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm8, %xmm1 11392; AVX2-FCP-NEXT: vpor %xmm1, %xmm10, %xmm1 11393; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 11394; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 11395; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm6, %ymm1, %ymm1 11396; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11397; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535] 11398; AVX2-FCP-NEXT: vmovdqa %ymm14, %ymm3 11399; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 11400; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm2, %ymm14, %ymm1 11401; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [u,u,3,10,128,128,128,6,13,128,128,u,u,u,u,u] 11402; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm10 11403; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 11404; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [u,u,128,128,1,8,15,128,128,4,11,u,u,u,u,u] 11405; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm1 11406; AVX2-FCP-NEXT: vpor %xmm1, %xmm10, %xmm1 11407; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128] 11408; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm13 11409; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm15 = [0,0,0,128,128,0,7,14,0,0,0,128,128,0,7,14] 11410; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm5, %xmm11 11411; AVX2-FCP-NEXT: vpor %xmm13, %xmm11, %xmm11 11412; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 11413; AVX2-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 11414; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm1, %ymm11, %ymm1 11415; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11416; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm4, %ymm0, %ymm1 11417; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm13 11418; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm6 11419; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 11420; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm1 11421; AVX2-FCP-NEXT: vpor %xmm6, %xmm1, %xmm1 11422; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm6 11423; AVX2-FCP-NEXT: vmovdqa %xmm8, %xmm14 11424; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 11425; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm0, %xmm10 11426; AVX2-FCP-NEXT: vpor %xmm6, %xmm10, %xmm6 11427; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 11428; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 11429; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm1, %ymm6, %ymm0 11430; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11431; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] 11432; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm2, %ymm3, %ymm1 11433; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11434; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] 11435; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm1 11436; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 11437; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,128,128,2,9,128,128,128,5,12,u,u,u,u,u] 11438; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 11439; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,4,11,128,128,0,7,14,128,128,u,u,u,u,u] 11440; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm1 11441; AVX2-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 11442; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,0,0,3,10,128,128,128,0,0,0,3,10,128,128,128] 11443; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm5 11444; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm11 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15] 11445; AVX2-FCP-NEXT: vmovdqa (%rsp), %xmm0 # 
16-byte Reload 11446; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm8 11447; AVX2-FCP-NEXT: vpor %xmm5, %xmm8, %xmm5 11448; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 11449; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 11450; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm1, %ymm5, %ymm0 11451; AVX2-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill 11452; AVX2-FCP-NEXT: vmovdqa %ymm4, %ymm0 11453; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm4, %ymm13, %ymm4 11454; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11455; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm13, %ymm0, %ymm1 11456; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm5 11457; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm3 11458; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm1 11459; AVX2-FCP-NEXT: vpor %xmm3, %xmm1, %xmm1 11460; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm14, %xmm2 11461; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 11462; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm3 11463; AVX2-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 11464; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 11465; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 11466; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm1, %ymm2, %ymm0 11467; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11468; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11469; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11470; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] 11471; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm11 11472; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535] 11473; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm9 11474; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm10 11475; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] 11476; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm2 11477; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11478; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535] 11479; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 11480; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11481; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11482; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 11483; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm2 11484; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11485; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm3 11486; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm8 11487; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm2 11488; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11489; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0 11490; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11491; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11492; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 11493; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0] 11494; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm1 11495; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm2, %ymm0, %ymm4 11496; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm2, %ymm0, %ymm14 11497; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm0, %ymm2, %ymm13 11498; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill 11499; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 11500; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11501; AVX2-FCP-NEXT: vmovdqa %ymm5, %ymm13 11502; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 11503; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 11504; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm0, %ymm5, %ymm2 11505; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm0, %ymm5, %ymm12 11506; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm0, %ymm5, %ymm15 11507; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm6 11508; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11509; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm5, %ymm0, %ymm0 11510; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11511; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [2,9,128,128,128,5,12,128,128,u,u,u,u,u,u,u] 11512; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm11, %xmm0 11513; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11 11514; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,0,7,14,128,128,3,10,u,u,u,u,u,u,u] 11515; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm11 11516; AVX2-FCP-NEXT: vpor %xmm0, %xmm11, %xmm0 11517; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm11 11518; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm11[3],ymm1[4,5],ymm11[6],ymm1[7,8,9,10],ymm11[11],ymm1[12,13],ymm11[14],ymm1[15] 11519; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11,2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11] 11520; AVX2-FCP-NEXT: # ymm5 = mem[0,1,0,1] 11521; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm1 11522; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} xmm11 = [18446744073709551615,255] 11523; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 11524; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11525; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm0 11526; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm1 11527; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 11528; AVX2-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 11529; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm1 11530; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] 11531; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm1 11532; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 11533; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11534; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [3,10,128,128,128,6,13,128,128,u,u,u,u,u,u,u] 11535; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm9, %xmm1 11536; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm2 11537; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,1,8,15,128,128,4,11,u,u,u,u,u,u,u] 11538; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 11539; AVX2-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 11540; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm2 11541; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5,6],ymm2[7,8],ymm4[9,10],ymm2[11],ymm4[12,13,14],ymm2[15] 11542; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12,3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12] 11543; AVX2-FCP-NEXT: # ymm4 = mem[0,1,0,1] 11544; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 11545; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm9 11546; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm0 11547; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm1 11548; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 11549; AVX2-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 11550; AVX2-FCP-NEXT: vextracti128 $1, %ymm12, %xmm1 11551; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm1[0],ymm12[1,2],ymm1[3],ymm12[4,5,6],ymm1[7,8],ymm12[9,10],ymm1[11],ymm12[12,13,14],ymm1[15] 11552; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 11553; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm12 11554; AVX2-FCP-NEXT: vextracti128 $1, %ymm10, %xmm0 11555; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,2,9,128,128,128,5,12,u,u,u,u,u,u,u] 11556; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 11557; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [4,11,128,128,0,7,14,128,128,u,u,u,u,u,u,u] 11558; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm10, %xmm3 11559; AVX2-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0 11560; AVX2-FCP-NEXT: vextracti128 $1, %ymm14, %xmm3 11561; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm14[1,2,3],ymm3[4],ymm14[5,6],ymm3[7,8],ymm14[9,10,11],ymm3[12],ymm14[13,14],ymm3[15] 11562; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13,4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13] 11563; AVX2-FCP-NEXT: # ymm4 = mem[0,1,0,1] 11564; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 11565; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm14 11566; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 11567; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm0 11568; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 11569; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm1 11570; AVX2-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 11571; AVX2-FCP-NEXT: vextracti128 $1, %ymm15, %xmm1 11572; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm15[1,2,3],ymm1[4],ymm15[5,6],ymm1[7,8],ymm15[9,10,11],ymm1[12],ymm15[13,14],ymm1[15] 11573; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 11574; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm15 11575; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 11576; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm0 11577; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,3,10,128,128,128,6,13,u,u,u,u,u,u,u] 11578; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 11579; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [5,12,128,128,1,8,15,128,128,u,u,u,u,u,u,u] 11580; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm3 11581; AVX2-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0 11582; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 11583; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm3 11584; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15] 11585; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14,5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14] 11586; AVX2-FCP-NEXT: # ymm4 = mem[0,1,0,1] 11587; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 11588; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm0 11589; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 11590; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm3 11591; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm1 11592; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm2 11593; AVX2-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 11594; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 11595; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm2 11596; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15] 11597; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 11598; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm1 11599; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [6,13,128,128,2,9,128,128,128,u,u,u,u,u,u,u] 11600; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 11601; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm2 11602; 
AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 11603; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,4,11,128,128,0,7,14,u,u,u,u,u,u,u] 11604; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4 11605; AVX2-FCP-NEXT: vpor %xmm2, %xmm4, %xmm2 11606; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11607; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm4 11608; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7,8],ymm4[9],ymm6[10,11,12],ymm4[13],ymm6[14,15] 11609; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15,6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15] 11610; AVX2-FCP-NEXT: # ymm6 = mem[0,1,0,1] 11611; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4 11612; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm2, %ymm4, %ymm2 11613; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 11614; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm3 11615; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 11616; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4 11617; AVX2-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 11618; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 11619; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm4 11620; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7,8],ymm4[9],ymm5[10,11,12],ymm4[13],ymm5[14,15] 11621; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4 11622; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm3, %ymm4, %ymm3 11623; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 11624; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm4 11625; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,128,128,128,5,12,128,128,1,8,15,u,u,u,u] 11626; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4 11627; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,0,7,14,128,128,3,10,128,128,128,u,u,u,u] 11628; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm7 11629; AVX2-FCP-NEXT: vpor %xmm4, %xmm7, %xmm4 11630; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 11631; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,3,1,2,1,3,5,6] 11632; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload 11633; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15] 11634; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm8 11635; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7] 11636; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 11637; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0],ymm4[1,2,3,4,5,6,7],ymm8[8],ymm4[9,10,11,12,13,14,15] 11638; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] 11639; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 11640; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm8 11641; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm8, %xmm5 11642; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm6 11643; AVX2-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 11644; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload 11645; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm6, %ymm6 11646; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 11647; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] 11648; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 11649; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3,4,5,6,7],ymm6[8],ymm5[9,10,11,12,13,14,15] 11650; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] 11651; AVX2-FCP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload 
11652; AVX2-FCP-NEXT: # ymm6 = ymm9[0],mem[1,2,3,4,5,6,7],ymm9[8],mem[9,10,11,12,13,14,15] 11653; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] 11654; AVX2-FCP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 32-byte Folded Reload 11655; AVX2-FCP-NEXT: # ymm7 = ymm12[0],mem[1,2,3,4,5,6,7],ymm12[8],mem[9,10,11,12,13,14,15] 11656; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3],ymm7[4,5,6,7] 11657; AVX2-FCP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload 11658; AVX2-FCP-NEXT: # ymm8 = ymm14[0],mem[1,2,3,4,5,6,7],ymm14[8],mem[9,10,11,12,13,14,15] 11659; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] 11660; AVX2-FCP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload 11661; AVX2-FCP-NEXT: # ymm9 = ymm15[0],mem[1,2,3,4,5,6,7],ymm15[8],mem[9,10,11,12,13,14,15] 11662; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7] 11663; AVX2-FCP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload 11664; AVX2-FCP-NEXT: # ymm10 = ymm0[0],mem[1,2,3,4,5,6,7],ymm0[8],mem[9,10,11,12,13,14,15] 11665; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] 11666; AVX2-FCP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload 11667; AVX2-FCP-NEXT: # ymm10 = ymm1[0],mem[1,2,3,4,5,6,7],ymm1[8],mem[9,10,11,12,13,14,15] 11668; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] 11669; AVX2-FCP-NEXT: vpblendw $254, (%rsp), %ymm2, %ymm10 # 32-byte Folded Reload 11670; AVX2-FCP-NEXT: # ymm10 = ymm2[0],mem[1,2,3,4,5,6,7],ymm2[8],mem[9,10,11,12,13,14,15] 11671; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] 11672; AVX2-FCP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload 11673; AVX2-FCP-NEXT: # ymm10 = ymm3[0],mem[1,2,3,4,5,6,7],ymm3[8],mem[9,10,11,12,13,14,15] 11674; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] 11675; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 11676; AVX2-FCP-NEXT: vmovaps %ymm10, 32(%rsi) 11677; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 11678; AVX2-FCP-NEXT: vmovaps %ymm10, (%rsi) 11679; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 11680; AVX2-FCP-NEXT: vmovaps %ymm10, 32(%rdx) 11681; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 11682; AVX2-FCP-NEXT: vmovaps %ymm10, (%rdx) 11683; AVX2-FCP-NEXT: vmovdqa %ymm4, 32(%rcx) 11684; AVX2-FCP-NEXT: vmovdqa %ymm5, (%rcx) 11685; AVX2-FCP-NEXT: vmovdqa %ymm6, 32(%r8) 11686; AVX2-FCP-NEXT: vmovdqa %ymm7, (%r8) 11687; AVX2-FCP-NEXT: vmovdqa %ymm8, 32(%r9) 11688; AVX2-FCP-NEXT: vmovdqa %ymm9, (%r9) 11689; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 11690; AVX2-FCP-NEXT: vmovdqa %ymm0, 32(%rax) 11691; AVX2-FCP-NEXT: vmovdqa %ymm1, (%rax) 11692; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 11693; AVX2-FCP-NEXT: vmovdqa %ymm2, 32(%rax) 11694; AVX2-FCP-NEXT: vmovdqa %ymm3, (%rax) 11695; AVX2-FCP-NEXT: addq $776, %rsp # imm = 0x308 11696; AVX2-FCP-NEXT: vzeroupper 11697; AVX2-FCP-NEXT: retq 11698; 11699; AVX512-LABEL: load_i8_stride7_vf64: 11700; AVX512: # %bb.0: 11701; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] 11702; AVX512-NEXT: vmovdqa (%rdi), %ymm12 11703; AVX512-NEXT: vmovdqa 32(%rdi), %ymm13 11704; AVX512-NEXT: vmovdqa64 64(%rdi), %ymm31 11705; AVX512-NEXT: vmovdqa 
%ymm0, %ymm1 11706; AVX512-NEXT: vmovdqa64 %ymm0, %ymm24 11707; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm13 ^ (ymm1 & (ymm12 ^ ymm13)) 11708; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 11709; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15,u,u,u,u,u,u] 11710; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u] 11711; AVX512-NEXT: vpor %xmm2, %xmm1, %xmm1 11712; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] 11713; AVX512-NEXT: vmovdqa64 96(%rdi), %ymm19 11714; AVX512-NEXT: vmovdqa %ymm9, %ymm2 11715; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm31 ^ (ymm2 & (ymm19 ^ ymm31)) 11716; AVX512-NEXT: vmovdqa 80(%rdi), %xmm11 11717; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm11[2],ymm2[3,4],ymm11[5],ymm2[6,7,8,9],ymm11[10],ymm2[11,12],ymm11[13],ymm2[14,15] 11718; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 11719; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm1 & mem) 11720; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] 11721; AVX512-NEXT: vmovdqa64 128(%rdi), %ymm21 11722; AVX512-NEXT: vmovdqa64 160(%rdi), %ymm29 11723; AVX512-NEXT: vmovdqa %ymm14, %ymm1 11724; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm21 ^ (ymm1 & (ymm29 ^ ymm21)) 11725; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm3 11726; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u] 11727; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] 11728; AVX512-NEXT: vpor %xmm3, %xmm1, %xmm1 11729; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 11730; AVX512-NEXT: vmovdqa 192(%rdi), %xmm0 11731; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11] 11732; AVX512-NEXT: vpshufb %xmm4, %xmm0, %xmm3 11733; AVX512-NEXT: vmovdqa64 %xmm4, %xmm27 11734; AVX512-NEXT: vmovdqa64 %xmm0, %xmm20 11735; AVX512-NEXT: vmovdqa 208(%rdi), %xmm10 11736; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm10[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] 11737; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] 11738; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 11739; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] 11740; AVX512-NEXT: vmovdqa 240(%rdi), %xmm3 11741; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm3[5,12,u,u,u,u,u,u,u,u,u,u,u] 11742; AVX512-NEXT: vmovdqa 224(%rdi), %xmm6 11743; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u,u] 11744; AVX512-NEXT: vpor %xmm5, %xmm7, %xmm5 11745; AVX512-NEXT: vinserti32x4 $2, %xmm5, %zmm1, %zmm22 11746; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 11747; AVX512-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm4 & (zmm22 ^ zmm2)) 11748; AVX512-NEXT: vmovdqa64 288(%rdi), %ymm18 11749; AVX512-NEXT: vmovdqa64 256(%rdi), %ymm16 11750; AVX512-NEXT: vmovdqa %ymm9, %ymm2 11751; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm18 ^ (ymm2 & (ymm16 ^ ymm18)) 11752; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = 
xmm2[u,u,u,u,u,3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u] 11753; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 11754; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u] 11755; AVX512-NEXT: vpor %xmm5, %xmm2, %xmm2 11756; AVX512-NEXT: vmovdqa64 352(%rdi), %ymm17 11757; AVX512-NEXT: vmovdqa64 320(%rdi), %ymm28 11758; AVX512-NEXT: vmovdqa %ymm14, %ymm7 11759; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm17 ^ (ymm7 & (ymm28 ^ ymm17)) 11760; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1] 11761; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4,5],ymm8[6],ymm7[7,8,9],ymm8[10],ymm7[11,12,13],ymm8[14],ymm7[15] 11762; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] 11763; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm23 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] 11764; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm2 & ymm23) 11765; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] 11766; AVX512-NEXT: vmovdqa %ymm7, %ymm2 11767; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm13 ^ (ymm2 & (ymm12 ^ ymm13)) 11768; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm15 11769; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u,u,u] 11770; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u,u,u] 11771; AVX512-NEXT: vpor %xmm2, %xmm15, %xmm2 11772; AVX512-NEXT: vmovdqa64 %ymm24, %ymm15 11773; AVX512-NEXT: vmovdqa64 %ymm24, %ymm5 11774; AVX512-NEXT: vpternlogq {{.*#+}} ymm15 = ymm19 ^ (ymm15 & (ymm31 ^ ymm19)) 11775; AVX512-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0,1],ymm11[2],ymm15[3,4,5],ymm11[6],ymm15[7,8,9],ymm11[10],ymm15[11,12,13],ymm11[14],ymm15[15] 11776; AVX512-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] 11777; AVX512-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm2 & ~mem) 11778; AVX512-NEXT: vmovdqa %ymm9, %ymm2 11779; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm21 ^ (ymm2 & (ymm29 ^ ymm21)) 11780; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u] 11781; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 11782; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u] 11783; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0 11784; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm3[6,13,u,u,u,u,u,u,u,u,u,u,u] 11785; AVX512-NEXT: vmovdqa64 %xmm3, %xmm25 11786; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm6[1,8,15],zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u,u] 11787; AVX512-NEXT: vmovdqa64 %xmm6, %xmm26 11788; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2 11789; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] 11790; AVX512-NEXT: vmovdqa64 %xmm20, %xmm1 11791; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] 11792; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] 11793; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 11794; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 11795; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] 11796; AVX512-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm24 11797; AVX512-NEXT: vpternlogq {{.*#+}} zmm24 = zmm24 ^ (zmm4 & (zmm24 ^ zmm15)) 11798; AVX512-NEXT: 
vmovdqa %ymm14, %ymm0 11799; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm13 ^ (ymm0 & (ymm12 ^ ymm13)) 11800; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u,u,u,u,u] 11801; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 11802; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[0,7,14],zero,zero,xmm0[3,10,u,u,u,u,u,u,u] 11803; AVX512-NEXT: vpor %xmm2, %xmm0, %xmm0 11804; AVX512-NEXT: vmovdqa %ymm7, %ymm2 11805; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm19 ^ (ymm2 & (ymm31 ^ ymm19)) 11806; AVX512-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 11807; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm11[3],ymm2[4,5],ymm11[6],ymm2[7,8,9,10],ymm11[11],ymm2[12,13],ymm11[14],ymm2[15] 11808; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 11809; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm15 = [18446744073709551615,255,18446744073709486080,18446744073709551615] 11810; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm0 & ymm15) 11811; AVX512-NEXT: vmovdqa %ymm5, %ymm0 11812; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm29 ^ (ymm0 & (ymm21 ^ ymm29)) 11813; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3 11814; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u,u,u] 11815; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u] 11816; AVX512-NEXT: vpor %xmm3, %xmm0, %xmm0 11817; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 11818; AVX512-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11819; AVX512-NEXT: vmovdqa64 %xmm27, %xmm3 11820; AVX512-NEXT: vpshufb %xmm3, %xmm10, %xmm3 11821; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] 11822; AVX512-NEXT: vmovdqa64 %xmm20, %xmm5 11823; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] 11824; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 11825; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] 11826; AVX512-NEXT: vmovdqa64 %xmm26, %xmm1 11827; AVX512-NEXT: vmovdqa64 %xmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11828; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] 11829; AVX512-NEXT: vmovdqa64 %xmm25, %xmm4 11830; AVX512-NEXT: vmovdqa64 %xmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 11831; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u,u,u,u,u] 11832; AVX512-NEXT: vpor %xmm3, %xmm6, %xmm3 11833; AVX512-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm25 11834; AVX512-NEXT: vpmovsxdq {{.*#+}} zmm20 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] 11835; AVX512-NEXT: vpternlogq {{.*#+}} zmm25 = zmm2 ^ (zmm20 & (zmm25 ^ zmm2)) 11836; AVX512-NEXT: vmovdqa %ymm9, %ymm0 11837; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm13 ^ (ymm0 & (ymm12 ^ ymm13)) 11838; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u] 11839; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 11840; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u,u,u] 11841; AVX512-NEXT: vpor %xmm2, %xmm0, %xmm0 11842; AVX512-NEXT: vmovdqa %ymm14, %ymm2 11843; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm19 ^ (ymm2 & (ymm31 ^ ymm19)) 11844; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm11[0],ymm2[1,2],ymm11[3],ymm2[4,5,6],ymm11[7,8],ymm2[9,10],ymm11[11],ymm2[12,13,14],ymm11[15] 11845; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 11846; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm0 & ymm15) 11847; AVX512-NEXT: vmovdqa %ymm15, %ymm11 11848; AVX512-NEXT: vmovdqa %ymm7, %ymm0 11849; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm29 ^ (ymm0 & (ymm21 ^ ymm29)) 11850; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2 11851; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[2,9,u,u,u,u,u] 11852; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u] 11853; AVX512-NEXT: vpor %xmm2, %xmm0, %xmm0 11854; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 11855; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] 11856; AVX512-NEXT: vpshufb %xmm2, %xmm10, %xmm6 11857; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero 11858; AVX512-NEXT: vmovdqa %xmm5, %xmm10 11859; AVX512-NEXT: vpor %xmm6, %xmm15, %xmm6 11860; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 11861; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm0)) 11862; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] 11863; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm4[1,8,15,u,u,u,u,u,u,u,u,u,u,u] 11864; AVX512-NEXT: vpor %xmm0, %xmm15, %xmm0 11865; AVX512-NEXT: vmovdqa64 416(%rdi), %ymm26 11866; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm6, %zmm30 11867; AVX512-NEXT: vmovdqa64 384(%rdi), %ymm27 11868; AVX512-NEXT: vpternlogq {{.*#+}} zmm30 = zmm3 ^ (zmm20 & (zmm30 ^ zmm3)) 11869; AVX512-NEXT: vmovdqa %ymm7, %ymm0 11870; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm26 ^ (ymm0 & (ymm27 ^ ymm26)) 11871; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3 11872; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9] 11873; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero 11874; AVX512-NEXT: vpor %xmm3, %xmm0, %xmm0 11875; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 11876; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] 11877; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm1 & (ymm0 ^ ymm8)) 11878; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm20 11879; AVX512-NEXT: vpmovsxwd {{.*#+}} zmm8 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0,0,0,0,0] 11880; AVX512-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm8 & (zmm20 ^ zmm22)) 11881; AVX512-NEXT: vmovdqa %ymm7, %ymm0 11882; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm16 ^ (ymm0 & (ymm18 ^ ymm16)) 11883; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3 11884; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u] 11885; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u] 11886; AVX512-NEXT: vpor %xmm3, %xmm0, %xmm0 11887; AVX512-NEXT: vmovdqa %ymm9, %ymm3 11888; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm17 ^ (ymm3 & (ymm28 ^ ymm17)) 11889; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] 11890; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = 
ymm3[0,1,2],ymm6[3],ymm3[4,5],ymm6[6],ymm3[7,8,9,10],ymm6[11],ymm3[12,13],ymm6[14],ymm3[15] 11891; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] 11892; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm0 & ymm23) 11893; AVX512-NEXT: vmovdqa %ymm14, %ymm0 11894; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm26 ^ (ymm0 & (ymm27 ^ ymm26)) 11895; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm0[5,12],zero,zero 11896; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 11897; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero,xmm0[3,10] 11898; AVX512-NEXT: vpor %xmm6, %xmm0, %xmm0 11899; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 11900; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm1 & (ymm0 ^ ymm3)) 11901; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm22 11902; AVX512-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm8 & (zmm22 ^ zmm24)) 11903; AVX512-NEXT: vmovdqa %ymm14, %ymm0 11904; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm16 ^ (ymm0 & (ymm18 ^ ymm16)) 11905; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3 11906; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u] 11907; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u] 11908; AVX512-NEXT: vpor %xmm3, %xmm0, %xmm0 11909; AVX512-NEXT: vmovdqa %ymm7, %ymm3 11910; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm28 ^ (ymm3 & (ymm17 ^ ymm28)) 11911; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] 11912; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm3[1,2],ymm6[3],ymm3[4,5,6],ymm6[7,8],ymm3[9,10],ymm6[11],ymm3[12,13,14],ymm6[15] 11913; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] 11914; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm0 & ymm23) 11915; AVX512-NEXT: vmovdqa %ymm9, %ymm0 11916; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm26 ^ (ymm0 & (ymm27 ^ ymm26)) 11917; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero 11918; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 11919; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11] 11920; AVX512-NEXT: vpor %xmm6, %xmm0, %xmm0 11921; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 11922; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm1 & (ymm0 ^ ymm3)) 11923; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm24 11924; AVX512-NEXT: vpternlogq {{.*#+}} zmm24 = zmm24 ^ (zmm8 & (zmm24 ^ zmm25)) 11925; AVX512-NEXT: vmovdqa %ymm14, %ymm0 11926; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm28 ^ (ymm0 & (ymm17 ^ ymm28)) 11927; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] 11928; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6],ymm3[7,8],ymm0[9,10,11],ymm3[12],ymm0[13,14],ymm3[15] 11929; AVX512-NEXT: vpshufb %ymm2, %ymm0, %ymm0 11930; AVX512-NEXT: vmovdqa %ymm9, %ymm2 11931; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm16 ^ (ymm2 & (ymm18 ^ ymm16)) 11932; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u] 11933; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 11934; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u] 11935; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2 11936; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 
= (ymm2 & ymm23) | ymm0 11937; AVX512-NEXT: vmovdqa %ymm7, %ymm0 11938; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm27 ^ (ymm0 & (ymm26 ^ ymm27)) 11939; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3 11940; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12] 11941; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,4,11],zero,zero,xmm0[0,7,14],zero,zero 11942; AVX512-NEXT: vpor %xmm3, %xmm0, %xmm0 11943; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 11944; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm1 & (ymm0 ^ ymm2)) 11945; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm25 11946; AVX512-NEXT: vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm8 & (zmm25 ^ zmm30)) 11947; AVX512-NEXT: vmovdqa %ymm9, %ymm0 11948; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm28 ^ (ymm0 & (ymm17 ^ ymm28)) 11949; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] 11950; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14,15] 11951; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] 11952; AVX512-NEXT: vmovdqa %ymm4, %ymm2 11953; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm18 ^ (ymm2 & (ymm16 ^ ymm18)) 11954; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 11955; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u] 11956; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u] 11957; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2 11958; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] 11959; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm23) | ymm0 11960; AVX512-NEXT: vmovdqa %ymm14, %ymm0 11961; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm27 ^ (ymm0 & (ymm26 ^ ymm27)) 11962; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3 11963; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13] 11964; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero 11965; AVX512-NEXT: vpor %xmm3, %xmm0, %xmm0 11966; AVX512-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm30 11967; AVX512-NEXT: vpternlogq {{.*#+}} ymm30 = ymm30 ^ (ymm1 & (ymm30 ^ ymm2)) 11968; AVX512-NEXT: vmovdqa %ymm4, %ymm0 11969; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm17 ^ (ymm0 & (ymm28 ^ ymm17)) 11970; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] 11971; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] 11972; AVX512-NEXT: vmovdqa %ymm7, %ymm2 11973; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm18 ^ (ymm2 & (ymm16 ^ ymm18)) 11974; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 11975; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u] 11976; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u] 11977; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2 11978; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,128,128,128,128,128,128,128,128,128,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] 11979; AVX512-NEXT: vpshufb %ymm5, %ymm0, %ymm0 11980; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ~mem) | ymm0 11981; AVX512-NEXT: vmovdqa %ymm9, %ymm0 11982; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm27 ^ (ymm0 & (ymm26 ^ ymm27)) 11983; AVX512-NEXT: vpshufb 
{{.*#+}} xmm3 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero 11984; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 11985; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14] 11986; AVX512-NEXT: vpor %xmm3, %xmm0, %xmm0 11987; AVX512-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm23 11988; AVX512-NEXT: vpternlogq {{.*#+}} ymm23 = ymm23 ^ (ymm1 & (ymm23 ^ ymm2)) 11989; AVX512-NEXT: vmovdqa %ymm7, %ymm0 11990; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm13 ^ ymm12)) 11991; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2 11992; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u] 11993; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u] 11994; AVX512-NEXT: vpor %xmm2, %xmm0, %xmm0 11995; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm26 ^ (ymm4 & (ymm27 ^ ymm26)) 11996; AVX512-NEXT: vmovdqa %ymm14, %ymm2 11997; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm12 ^ (ymm2 & (ymm13 ^ ymm12)) 11998; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 11999; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u,u,u,u] 12000; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,zero,xmm2[u,u,u,u,u,u,u] 12001; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2 12002; AVX512-NEXT: vmovdqa %ymm9, %ymm3 12003; AVX512-NEXT: vmovdqa %ymm9, %ymm15 12004; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm12 ^ (ymm9 & (ymm13 ^ ymm12)) 12005; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm19 ^ (ymm3 & (ymm31 ^ ymm19)) 12006; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm9[6,13],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u] 12007; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm9 12008; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[4,11],zero,zero,xmm9[0,7,14,u,u,u,u,u,u,u] 12009; AVX512-NEXT: vpor %xmm6, %xmm9, %xmm6 12010; AVX512-NEXT: vmovdqa %ymm14, %ymm12 12011; AVX512-NEXT: vpternlogq {{.*#+}} ymm16 = ymm18 ^ (ymm14 & (ymm16 ^ ymm18)) 12012; AVX512-NEXT: vmovdqa %ymm7, %ymm9 12013; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm31 ^ (ymm9 & (ymm19 ^ ymm31)) 12014; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = ymm31 ^ (ymm14 & (ymm19 ^ ymm31)) 12015; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 12016; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0],ymm3[1,2,3],ymm8[4],ymm3[5,6],ymm8[7,8],ymm3[9,10,11],ymm8[12],ymm3[13,14],ymm8[15] 12017; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 12018; AVX512-NEXT: vmovdqa %ymm11, %ymm1 12019; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm0 & ymm11) 12020; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13,14,15] 12021; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 12022; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm2 & ymm1) 12023; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm8[1],ymm14[2,3,4],ymm8[5],ymm14[6,7,8],ymm8[9],ymm14[10,11,12],ymm8[13],ymm14[14,15] 12024; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 12025; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm6 & ymm1) 12026; 
AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm29 ^ (ymm12 & (ymm21 ^ ymm29)) 12027; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm12[u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u] 12028; AVX512-NEXT: vextracti128 $1, %ymm12, %xmm2 12029; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u] 12030; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0 12031; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 12032; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] 12033; AVX512-NEXT: vpshufb %xmm1, %xmm12, %xmm2 12034; AVX512-NEXT: vmovdqa %xmm10, %xmm13 12035; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero 12036; AVX512-NEXT: vpor %xmm2, %xmm6, %xmm2 12037; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 12038; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 12039; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm1 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215] 12040; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm1 & (ymm2 ^ ymm0)) 12041; AVX512-NEXT: vpternlogq {{.*#+}} ymm28 = ymm17 ^ (ymm7 & (ymm28 ^ ymm17)) 12042; AVX512-NEXT: vmovd {{.*#+}} xmm10 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] 12043; AVX512-NEXT: vpternlogq {{.*#+}} ymm15 = ymm29 ^ (ymm15 & (ymm21 ^ ymm29)) 12044; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm21 ^ (ymm7 & (ymm29 ^ ymm21)) 12045; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 12046; AVX512-NEXT: vpshufb %xmm10, %xmm8, %xmm0 12047; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 12048; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm14[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 12049; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] 12050; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm2 12051; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] 12052; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm0 & (zmm2 ^ zmm3)) 12053; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[u,u,3,10],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[u,u,u,u,u] 12054; AVX512-NEXT: vextracti128 $1, %ymm15, %xmm6 12055; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,11,u,u,u,u,u] 12056; AVX512-NEXT: vpor %xmm3, %xmm6, %xmm3 12057; AVX512-NEXT: vpshufb %xmm5, %xmm12, %xmm6 12058; AVX512-NEXT: vmovdqa %xmm12, %xmm15 12059; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero 12060; AVX512-NEXT: vpor %xmm6, %xmm12, %xmm6 12061; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 12062; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 12063; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (ymm1 & (ymm6 ^ ymm3)) 12064; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm3 12065; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u] 12066; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,4,11],zero,zero,xmm7[0,7,14],zero,zero,xmm7[u,u,u,u,u] 12067; AVX512-NEXT: vpor %xmm3, %xmm7, %xmm3 12068; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,8,15,22,29,20,27,18,25,128,128,128,128,128,128,128,128,128,128] 12069; AVX512-NEXT: vpshufb %xmm7, %xmm15, %xmm12 12070; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = 
xmm13[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero 12071; AVX512-NEXT: vpor %xmm13, %xmm12, %xmm12 12072; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 12073; AVX512-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 12074; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm1 & (ymm12 ^ ymm3)) 12075; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm14[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 12076; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm8[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 12077; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3] 12078; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm3 12079; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm0 & (zmm3 ^ zmm11)) 12080; AVX512-NEXT: vpshufb %xmm10, %xmm14, %xmm6 12081; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 12082; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3] 12083; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm12, %zmm6 12084; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm0 & (zmm6 ^ zmm9)) 12085; AVX512-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm0 12086; AVX512-NEXT: movw $-512, %ax # imm = 0xFE00 12087; AVX512-NEXT: kmovw %eax, %k1 12088; AVX512-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} 12089; AVX512-NEXT: vinserti64x4 $1, %ymm23, %zmm0, %zmm0 12090; AVX512-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} 12091; AVX512-NEXT: vmovdqa64 %ymm28, %ymm1 12092; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm28[2,3,0,1] 12093; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] 12094; AVX512-NEXT: vpshufb %ymm7, %ymm0, %ymm0 12095; AVX512-NEXT: vmovdqa64 %ymm16, %ymm1 12096; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u] 12097; AVX512-NEXT: vextracti32x4 $1, %ymm16, %xmm1 12098; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u] 12099; AVX512-NEXT: vpor %xmm5, %xmm1, %xmm1 12100; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & mem) | ymm0 12101; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm0 12102; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15] 12103; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,0,7,14],zero,zero,xmm4[3,10],zero,zero,zero 12104; AVX512-NEXT: vpor %xmm0, %xmm4, %xmm0 12105; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 12106; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] 12107; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 12108; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 12109; AVX512-NEXT: vmovdqa32 %zmm0, %zmm6 {%k1} 12110; AVX512-NEXT: vmovdqa64 %zmm20, (%rsi) 12111; AVX512-NEXT: vmovdqa64 %zmm22, (%rdx) 12112; AVX512-NEXT: vmovdqa64 %zmm24, (%rcx) 12113; AVX512-NEXT: vmovdqa64 %zmm25, (%r8) 12114; AVX512-NEXT: vmovdqa64 %zmm2, (%r9) 12115; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 12116; AVX512-NEXT: vmovdqa64 %zmm3, (%rax) 12117; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 12118; AVX512-NEXT: vmovdqa64 %zmm6, (%rax) 12119; AVX512-NEXT: vzeroupper 12120; AVX512-NEXT: retq 12121; 12122; AVX512-FCP-LABEL: load_i8_stride7_vf64: 12123; AVX512-FCP: # %bb.0: 12124; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] 12125; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm20 12126; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm12 12127; AVX512-FCP-NEXT: 
vmovdqa64 64(%rdi), %ymm27 12128; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm0 12129; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm20 ^ ymm12)) 12130; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 12131; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u] 12132; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u] 12133; AVX512-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 12134; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] 12135; AVX512-FCP-NEXT: vmovdqa64 96(%rdi), %ymm31 12136; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm1 12137; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm27 ^ (ymm1 & (ymm31 ^ ymm27)) 12138; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm6 12139; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm6[2],ymm1[3,4],ymm6[5],ymm1[6,7,8,9],ymm6[10],ymm1[11,12],ymm6[13],ymm1[14,15] 12140; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 12141; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm0 & mem) 12142; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] 12143; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm28 12144; AVX512-FCP-NEXT: vmovdqa64 160(%rdi), %ymm30 12145; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm0 12146; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm28 ^ (ymm0 & (ymm30 ^ ymm28)) 12147; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 12148; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u] 12149; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u] 12150; AVX512-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0 12151; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 12152; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,2,4,6] 12153; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm3 12154; AVX512-FCP-NEXT: vpermd %ymm3, %ymm2, %ymm2 12155; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] 12156; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] 12157; AVX512-FCP-NEXT: vmovdqa 240(%rdi), %xmm4 12158; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm4[5,12,u,u,u,u,u,u,u,u,u,u,u] 12159; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %xmm5 12160; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] 12161; AVX512-FCP-NEXT: vpor %xmm2, %xmm7, %xmm2 12162; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm8 12163; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 12164; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm21 & (zmm8 ^ zmm1)) 12165; AVX512-FCP-NEXT: vmovdqa64 288(%rdi), %ymm16 12166; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm11 12167; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm0 12168; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm16 ^ (ymm0 & (ymm11 ^ ymm16)) 12169; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[u,u,u,u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u] 12170; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 12171; AVX512-FCP-NEXT: 
vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u] 12172; AVX512-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0 12173; AVX512-FCP-NEXT: vmovdqa 352(%rdi), %ymm14 12174; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %ymm2 12175; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm7 12176; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm14 ^ (ymm7 & (ymm2 ^ ymm14)) 12177; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm7[2,3,0,1] 12178; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm13[2],ymm7[3,4,5],ymm13[6],ymm7[7,8,9],ymm13[10],ymm7[11,12,13],ymm13[14],ymm7[15] 12179; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] 12180; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm26 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] 12181; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm0 & ymm26) 12182; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] 12183; AVX512-FCP-NEXT: vmovdqa64 416(%rdi), %ymm17 12184; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %ymm18 12185; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm7 12186; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm17 ^ (ymm7 & (ymm18 ^ ymm17)) 12187; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm15 12188; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9] 12189; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm7[4,11],zero,zero 12190; AVX512-FCP-NEXT: vpor %xmm7, %xmm15, %xmm7 12191; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm15 12192; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm23 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] 12193; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 ^ (ymm23 & (ymm15 ^ ymm13)) 12194; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm0 12195; AVX512-FCP-NEXT: vpmovsxwd {{.*#+}} zmm29 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0,0,0,0,0] 12196; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm29 & (zmm0 ^ zmm8)) 12197; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 12198; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm8 12199; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm12 ^ (ymm8 & (ymm20 ^ ymm12)) 12200; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm13 12201; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[6,13],zero,zero,xmm13[2,9,u,u,u,u,u,u,u] 12202; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[1,8,15],zero,zero,xmm8[4,11],zero,zero,xmm8[u,u,u,u,u,u,u] 12203; AVX512-FCP-NEXT: vpor %xmm13, %xmm8, %xmm8 12204; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm13 12205; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm31 ^ (ymm13 & (ymm27 ^ ymm31)) 12206; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm6[2],ymm13[3,4,5],ymm6[6],ymm13[7,8,9],ymm6[10],ymm13[11,12,13],ymm6[14],ymm13[15] 12207; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] 12208; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm8 & ~mem) 12209; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm8 12210; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm28 ^ (ymm8 & (ymm30 ^ ymm28)) 12211; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,6,13],zero,zero,xmm8[2,9],zero,zero,zero,xmm8[u,u,u,u] 
12212; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8 12213; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u],zero,zero,xmm8[4,11],zero,zero,xmm8[0,7,14,u,u,u,u] 12214; AVX512-FCP-NEXT: vpor %xmm15, %xmm8, %xmm8 12215; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 12216; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,0,0,0,1,3,4,6] 12217; AVX512-FCP-NEXT: vpermd %ymm3, %ymm15, %ymm15 12218; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] 12219; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm15[7] 12220; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm4[6,13,u,u,u,u,u,u,u,u,u,u,u] 12221; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[1,8,15],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] 12222; AVX512-FCP-NEXT: vpor %xmm7, %xmm15, %xmm7 12223; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm8, %zmm7 12224; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm21 & (zmm7 ^ zmm13)) 12225; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm8 12226; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm11 ^ (ymm8 & (ymm16 ^ ymm11)) 12227; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm13 12228; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[5,12,u,u] 12229; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u] 12230; AVX512-FCP-NEXT: vpor %xmm13, %xmm8, %xmm8 12231; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm13 12232; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm14 ^ (ymm13 & (ymm2 ^ ymm14)) 12233; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1] 12234; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7,8,9,10],ymm15[11],ymm13[12,13],ymm15[14],ymm13[15] 12235; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] 12236; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm8 & ymm26) 12237; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm8 12238; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm17 ^ (ymm8 & (ymm18 ^ ymm17)) 12239; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm8[5,12],zero,zero 12240; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8 12241; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10] 12242; AVX512-FCP-NEXT: vpor %xmm15, %xmm8, %xmm8 12243; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 12244; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 ^ (ymm23 & (ymm8 ^ ymm13)) 12245; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm24 12246; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm24 = zmm24 ^ (zmm29 & (zmm24 ^ zmm7)) 12247; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm7 12248; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm12 ^ (ymm7 & (ymm20 ^ ymm12)) 12249; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u,u,u] 12250; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 12251; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u,u,u] 12252; AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 12253; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm8 12254; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm31 ^ (ymm8 & (ymm27 ^ ymm31)) 12255; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7,8,9,10],ymm6[11],ymm8[12,13],ymm6[14],ymm8[15] 12256; AVX512-FCP-NEXT: vpshufb {{.*#+}} 
ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 12257; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm25 = [18446744073709551615,255,18446744073709486080,18446744073709551615] 12258; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm7 & ymm25) 12259; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm7 12260; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm30 ^ (ymm7 & (ymm28 ^ ymm30)) 12261; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm13 12262; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[1,8,15,u,u,u,u] 12263; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,0,7,14],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[u,u,u,u] 12264; AVX512-FCP-NEXT: vpor %xmm7, %xmm13, %xmm7 12265; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 12266; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,0,0,0,1,3,5,6] 12267; AVX512-FCP-NEXT: vpermd %ymm3, %ymm13, %ymm3 12268; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] 12269; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5,6],ymm3[7] 12270; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[2,9],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] 12271; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u,u,u,u,u] 12272; AVX512-FCP-NEXT: vpor %xmm7, %xmm13, %xmm7 12273; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm3, %zmm3 12274; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm22 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] 12275; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm8 ^ (zmm22 & (zmm3 ^ zmm8)) 12276; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm7 12277; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm11 ^ (ymm7 & (ymm16 ^ ymm11)) 12278; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 12279; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u],zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u] 12280; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,5,12],zero,zero,xmm7[1,8,15],zero,zero,xmm7[u,u] 12281; AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 12282; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm8 12283; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm2 ^ (ymm8 & (ymm14 ^ ymm2)) 12284; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm8[2,3,0,1] 12285; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm13[0],ymm8[1,2],ymm13[3],ymm8[4,5,6],ymm13[7,8],ymm8[9,10],ymm13[11],ymm8[12,13,14],ymm13[15] 12286; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] 12287; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm7 & ymm26) 12288; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm7 12289; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm17 ^ (ymm7 & (ymm18 ^ ymm17)) 12290; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero 12291; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 12292; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11] 12293; AVX512-FCP-NEXT: vpor %xmm7, %xmm13, %xmm7 12294; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 12295; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm23 & (ymm7 ^ ymm8)) 12296; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm21 12297; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm29 & (zmm21 ^ zmm3)) 12298; 
AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm3 12299; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm12 ^ (ymm3 & (ymm20 ^ ymm12)) 12300; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[3,10],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[u,u,u,u,u,u,u] 12301; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 12302; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11,u,u,u,u,u,u,u] 12303; AVX512-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 12304; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm7 12305; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm31 ^ (ymm7 & (ymm27 ^ ymm31)) 12306; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0],ymm7[1,2],ymm6[3],ymm7[4,5,6],ymm6[7,8],ymm7[9,10],ymm6[11],ymm7[12,13,14],ymm6[15] 12307; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 12308; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm3 & ymm25) 12309; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm3 12310; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm30 ^ (ymm3 & (ymm28 ^ ymm30)) 12311; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm8 12312; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[2,9,u,u,u,u,u] 12313; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u,u,u] 12314; AVX512-FCP-NEXT: vpor %xmm3, %xmm8, %xmm8 12315; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] 12316; AVX512-FCP-NEXT: vmovdqa 208(%rdi), %xmm0 12317; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12318; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm0 12319; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %xmm15 12320; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero 12321; AVX512-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0 12322; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm3 12323; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 12324; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm3)) 12325; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[3,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] 12326; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,8,15,u,u,u,u,u,u,u,u,u,u,u] 12327; AVX512-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 12328; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm0 12329; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm7 ^ (zmm22 & (zmm0 ^ zmm7)) 12330; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm3 12331; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm2 ^ (ymm3 & (ymm14 ^ ymm2)) 12332; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] 12333; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3],ymm4[4],ymm3[5,6],ymm4[7,8],ymm3[9,10,11],ymm4[12],ymm3[13,14],ymm4[15] 12334; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm3, %ymm3 12335; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm4 12336; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm11 ^ (ymm4 & (ymm16 ^ ymm11)) 12337; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,6,13],zero,zero,xmm4[2,9],zero,zero,zero,xmm4[u,u] 12338; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 12339; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,xmm4[4,11],zero,zero,xmm4[0,7,14,u,u] 12340; AVX512-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 12341; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = (ymm4 & ymm26) | ymm3 12342; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm3 12343; AVX512-FCP-NEXT: vpternlogq {{.*#+}} 
ymm3 = ymm18 ^ (ymm3 & (ymm17 ^ ymm18)) 12344; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 12345; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[5,12] 12346; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero 12347; AVX512-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3 12348; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 12349; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm23 & (ymm3 ^ ymm4)) 12350; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm22 12351; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm29 & (zmm22 ^ zmm0)) 12352; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm0 12353; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm0 & (ymm14 ^ ymm2)) 12354; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] 12355; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6,7,8],ymm3[9],ymm0[10,11],ymm3[12],ymm0[13,14,15] 12356; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] 12357; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm3 12358; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm16 ^ (ymm3 & (ymm11 ^ ymm16)) 12359; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 12360; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u] 12361; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u] 12362; AVX512-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 12363; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm26) | ymm0 12364; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm0 12365; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm18 ^ (ymm0 & (ymm17 ^ ymm18)) 12366; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4 12367; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13] 12368; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero 12369; AVX512-FCP-NEXT: vpor %xmm4, %xmm0, %xmm0 12370; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm29 12371; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm29 = ymm29 ^ (ymm23 & (ymm29 ^ ymm3)) 12372; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,128,128,128,128,128,128,128,128,128,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] 12373; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm0 12374; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm2 ^ ymm14)) 12375; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] 12376; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7,8],ymm3[9],ymm0[10,11,12],ymm3[13],ymm0[14,15] 12377; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0 12378; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm3 12379; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm16 ^ (ymm3 & (ymm11 ^ ymm16)) 12380; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7 12381; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u] 12382; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u] 12383; AVX512-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 12384; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ~mem) | ymm0 12385; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm0 12386; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm18 ^ (ymm0 & (ymm17 ^ ymm18)) 12387; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = 
xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero 12388; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 12389; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14] 12390; AVX512-FCP-NEXT: vpor %xmm7, %xmm0, %xmm0 12391; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm26 12392; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm26 = ymm26 ^ (ymm23 & (ymm26 ^ ymm3)) 12393; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm13 12394; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm20 ^ (ymm13 & (ymm12 ^ ymm20)) 12395; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm7 12396; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm19 = ymm17 ^ (ymm19 & (ymm18 ^ ymm17)) 12397; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm0 12398; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm20 ^ (ymm0 & (ymm12 ^ ymm20)) 12399; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm8 12400; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm20 ^ (ymm10 & (ymm12 ^ ymm20)) 12401; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm3 12402; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u,u,u] 12403; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[4,11],zero,zero,xmm13[0,7,14],zero,zero,xmm13[u,u,u,u,u,u,u] 12404; AVX512-FCP-NEXT: vpor %xmm3, %xmm12, %xmm3 12405; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm31 ^ (ymm7 & (ymm27 ^ ymm31)) 12406; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm12 12407; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u] 12408; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u] 12409; AVX512-FCP-NEXT: vpor %xmm0, %xmm12, %xmm0 12410; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm12 12411; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm16 ^ (ymm9 & (ymm11 ^ ymm16)) 12412; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm13 12413; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm27 ^ (ymm13 & (ymm31 ^ ymm27)) 12414; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm27 ^ (ymm9 & (ymm31 ^ ymm27)) 12415; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm10[6,13],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u] 12416; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10 12417; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[4,11],zero,zero,xmm10[0,7,14,u,u,u,u,u,u,u] 12418; AVX512-FCP-NEXT: vpor %xmm4, %xmm10, %xmm4 12419; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0],ymm7[1,2,3],ymm6[4],ymm7[5,6],ymm6[7,8],ymm7[9,10,11],ymm6[12],ymm7[13,14],ymm6[15] 12420; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 12421; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm10 | (ymm3 & ymm25) 12422; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0],ymm6[1],ymm13[2,3],ymm6[4],ymm13[5,6,7,8],ymm6[9],ymm13[10,11],ymm6[12],ymm13[13,14,15] 12423; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 12424; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm0 & ymm25) 12425; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm6[1],ymm9[2,3,4],ymm6[5],ymm9[6,7,8],ymm6[9],ymm9[10,11,12],ymm6[13],ymm9[14,15] 12426; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 12427; AVX512-FCP-NEXT: vpternlogq 
{{.*#+}} ymm6 = ymm6 | (ymm4 & ymm25) 12428; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm14 ^ (ymm1 & (ymm2 ^ ymm14)) 12429; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm30 ^ (ymm12 & (ymm28 ^ ymm30)) 12430; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm30 ^ (ymm8 & (ymm28 ^ ymm30)) 12431; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm28 ^ (ymm1 & (ymm30 ^ ymm28)) 12432; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm12[u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u] 12433; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm3 12434; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u,u,u] 12435; AVX512-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0 12436; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 12437; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] 12438; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm12, %xmm3 12439; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm15[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero 12440; AVX512-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 12441; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 12442; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 12443; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm9 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215] 12444; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm9 & (ymm3 ^ ymm0)) 12445; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[u,u,3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u] 12446; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm4 12447; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11,u,u,u,u,u] 12448; AVX512-FCP-NEXT: vpor %xmm0, %xmm4, %xmm0 12449; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm12, %xmm4 12450; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm15[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero 12451; AVX512-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 12452; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 12453; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 12454; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm9 & (ymm4 ^ ymm0)) 12455; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm0 12456; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[5,12,u,u,u,u,u] 12457; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,4,11],zero,zero,xmm1[0,7,14],zero,zero,xmm1[u,u,u,u,u] 12458; AVX512-FCP-NEXT: vpor %xmm0, %xmm1, %xmm1 12459; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,8,15,22,29,20,27,18,25,128,128,128,128,128,128,128,128,128,128] 12460; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm5 12461; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm15[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero 12462; AVX512-FCP-NEXT: vpor %xmm5, %xmm8, %xmm5 12463; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 12464; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 12465; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (ymm9 & (ymm5 ^ ymm1)) 12466; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,2,4,6,0,0,0,0] 12467; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm8 12468; AVX512-FCP-NEXT: vpermd %ymm8, %ymm1, %ymm1 12469; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u] 12470; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm1 12471; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] 12472; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm3 & (zmm1 ^ zmm10)) 12473; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [1,3,4,6,0,0,0,0] 12474; AVX512-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm9 12475; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u] 12476; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm4, %zmm4 12477; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm3 & (zmm4 ^ zmm7)) 12478; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,3,5,6,0,0,0,0] 12479; AVX512-FCP-NEXT: vpermd %ymm8, %ymm7, %ymm7 12480; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u] 12481; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm5, %zmm5 12482; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm3 & (zmm5 ^ zmm6)) 12483; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm3 12484; AVX512-FCP-NEXT: movw $-512, %ax # imm = 0xFE00 12485; AVX512-FCP-NEXT: kmovw %eax, %k1 12486; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1} 12487; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm26, %zmm0, %zmm3 12488; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm4 {%k1} 12489; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] 12490; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] 12491; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0 12492; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm11[u,u,u,u,2,9],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[u,u,u] 12493; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm3 12494; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u] 12495; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 12496; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & mem) | ymm0 12497; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm3 12498; AVX512-FCP-NEXT: vextracti32x4 $1, %ymm19, %xmm0 12499; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15] 12500; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero 12501; AVX512-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0 12502; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 12503; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] 12504; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] 12505; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 12506; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1} 12507; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 12508; AVX512-FCP-NEXT: vmovaps %zmm0, (%rsi) 12509; AVX512-FCP-NEXT: vmovdqa64 %zmm24, (%rdx) 12510; AVX512-FCP-NEXT: vmovdqa64 %zmm21, (%rcx) 12511; AVX512-FCP-NEXT: vmovdqa64 %zmm22, (%r8) 12512; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%r9) 12513; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 12514; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rax) 12515; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 12516; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rax) 12517; AVX512-FCP-NEXT: vzeroupper 12518; AVX512-FCP-NEXT: retq 12519; 12520; AVX512DQ-LABEL: load_i8_stride7_vf64: 12521; AVX512DQ: # %bb.0: 12522; AVX512DQ-NEXT: subq $24, %rsp 12523; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] 12524; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm12 12525; AVX512DQ-NEXT: 
vmovdqa 32(%rdi), %ymm13 12526; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm31 12527; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1 12528; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm23 12529; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm13 ^ (ymm1 & (ymm12 ^ ymm13)) 12530; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2 12531; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15,u,u,u,u,u,u] 12532; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u] 12533; AVX512DQ-NEXT: vpor %xmm2, %xmm1, %xmm1 12534; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] 12535; AVX512DQ-NEXT: vmovdqa64 96(%rdi), %ymm28 12536; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm2 12537; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm31 ^ (ymm2 & (ymm28 ^ ymm31)) 12538; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm11 12539; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm11[2],ymm2[3,4],ymm11[5],ymm2[6,7,8,9],ymm11[10],ymm2[11,12],ymm11[13],ymm2[14,15] 12540; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 12541; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm1 & mem) 12542; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] 12543; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %ymm25 12544; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm4 12545; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm1 12546; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm25 ^ (ymm1 & (ymm4 ^ ymm25)) 12547; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm3 12548; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u] 12549; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] 12550; AVX512DQ-NEXT: vpor %xmm3, %xmm1, %xmm1 12551; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 12552; AVX512DQ-NEXT: vmovdqa 192(%rdi), %xmm0 12553; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11] 12554; AVX512DQ-NEXT: vpshufb %xmm5, %xmm0, %xmm3 12555; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm30 12556; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm20 12557; AVX512DQ-NEXT: vmovdqa 208(%rdi), %xmm10 12558; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] 12559; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] 12560; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 12561; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] 12562; AVX512DQ-NEXT: vmovdqa 240(%rdi), %xmm3 12563; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm3[5,12,u,u,u,u,u,u,u,u,u,u,u] 12564; AVX512DQ-NEXT: vmovdqa 224(%rdi), %xmm5 12565; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] 12566; AVX512DQ-NEXT: vpor %xmm6, %xmm7, %xmm6 12567; AVX512DQ-NEXT: vinserti32x4 $2, %xmm6, %zmm1, %zmm22 12568; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm19 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 12569; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm19 & (zmm22 ^ zmm2)) 12570; AVX512DQ-NEXT: vmovdqa64 288(%rdi), %ymm18 12571; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %ymm17 12572; 
AVX512DQ-NEXT: vmovdqa %ymm9, %ymm2 12573; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm18 ^ (ymm2 & (ymm17 ^ ymm18)) 12574; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[u,u,u,u,u,3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u] 12575; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm2 12576; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u] 12577; AVX512DQ-NEXT: vpor %xmm6, %xmm2, %xmm2 12578; AVX512DQ-NEXT: vmovdqa64 352(%rdi), %ymm21 12579; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %ymm16 12580; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm7 12581; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ymm21 ^ (ymm7 & (ymm16 ^ ymm21)) 12582; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1] 12583; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4,5],ymm8[6],ymm7[7,8,9],ymm8[10],ymm7[11,12,13],ymm8[14],ymm7[15] 12584; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] 12585; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm24 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] 12586; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm2 & ymm24) 12587; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] 12588; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm2 12589; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm13 ^ (ymm2 & (ymm12 ^ ymm13)) 12590; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm15 12591; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u,u,u] 12592; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u,u,u] 12593; AVX512DQ-NEXT: vpor %xmm2, %xmm15, %xmm2 12594; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm15 12595; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm29 12596; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm15 = ymm28 ^ (ymm15 & (ymm31 ^ ymm28)) 12597; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0,1],ymm11[2],ymm15[3,4,5],ymm11[6],ymm15[7,8,9],ymm11[10],ymm15[11,12,13],ymm11[14],ymm15[15] 12598; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] 12599; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm2 & ~mem) 12600; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm2 12601; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm6 12602; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm4 ^ ymm25)) 12603; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u] 12604; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm2 12605; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u] 12606; AVX512DQ-NEXT: vpor %xmm0, %xmm2, %xmm0 12607; AVX512DQ-NEXT: vmovdqa %xmm3, %xmm4 12608; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm3[6,13,u,u,u,u,u,u,u,u,u,u,u] 12609; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[1,8,15],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] 12610; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm26 12611; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2 12612; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] 12613; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm1 12614; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] 12615; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] 12616; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 
12617; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 12618; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] 12619; AVX512DQ-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm23 12620; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm19 & (zmm23 ^ zmm15)) 12621; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm0 12622; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm13 ^ (ymm0 & (ymm12 ^ ymm13)) 12623; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u,u,u,u,u] 12624; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 12625; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[0,7,14],zero,zero,xmm0[3,10,u,u,u,u,u,u,u] 12626; AVX512DQ-NEXT: vpor %xmm2, %xmm0, %xmm0 12627; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm2 12628; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm28 ^ (ymm2 & (ymm31 ^ ymm28)) 12629; AVX512DQ-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12630; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm11[3],ymm2[4,5],ymm11[6],ymm2[7,8,9,10],ymm11[11],ymm2[12,13],ymm11[14],ymm2[15] 12631; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 12632; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm27 = [18446744073709551615,255,18446744073709486080,18446744073709551615] 12633; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm0 & ymm27) 12634; AVX512DQ-NEXT: vmovdqa64 %ymm29, %ymm0 12635; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm15 12636; AVX512DQ-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12637; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm6 ^ (ymm0 & (ymm25 ^ ymm6)) 12638; AVX512DQ-NEXT: vmovdqa64 %ymm6, %ymm19 12639; AVX512DQ-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 12640; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm3 12641; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u,u,u] 12642; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u] 12643; AVX512DQ-NEXT: vpor %xmm3, %xmm0, %xmm0 12644; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 12645; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm3 12646; AVX512DQ-NEXT: vpshufb %xmm3, %xmm10, %xmm3 12647; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] 12648; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm6 12649; AVX512DQ-NEXT: vmovdqa64 %xmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12650; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] 12651; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 12652; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] 12653; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm1 12654; AVX512DQ-NEXT: vmovdqa64 %xmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12655; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] 12656; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u,u,u,u,u] 12657; AVX512DQ-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 12658; AVX512DQ-NEXT: vpor %xmm3, %xmm5, %xmm3 12659; AVX512DQ-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm25 12660; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} zmm20 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] 12661; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm25 = zmm2 ^ (zmm20 & (zmm25 ^ zmm2)) 12662; AVX512DQ-NEXT: vmovdqa 
%ymm9, %ymm0 12663; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm13 ^ (ymm0 & (ymm12 ^ ymm13)) 12664; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u] 12665; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 12666; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u,u,u] 12667; AVX512DQ-NEXT: vpor %xmm2, %xmm0, %xmm0 12668; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm2 12669; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm28 ^ (ymm2 & (ymm31 ^ ymm28)) 12670; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0],ymm2[1,2],ymm11[3],ymm2[4,5,6],ymm11[7,8],ymm2[9,10],ymm11[11],ymm2[12,13,14],ymm11[15] 12671; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 12672; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm0 & ymm27) 12673; AVX512DQ-NEXT: vmovdqa64 %ymm27, %ymm11 12674; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm0 12675; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm19 ^ (ymm0 & (ymm15 ^ ymm19)) 12676; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm2 12677; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[2,9,u,u,u,u,u] 12678; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u] 12679; AVX512DQ-NEXT: vpor %xmm2, %xmm0, %xmm0 12680; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 12681; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] 12682; AVX512DQ-NEXT: vpshufb %xmm2, %xmm10, %xmm5 12683; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero 12684; AVX512DQ-NEXT: vpor %xmm5, %xmm15, %xmm5 12685; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 12686; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm0)) 12687; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] 12688; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm4[1,8,15,u,u,u,u,u,u,u,u,u,u,u] 12689; AVX512DQ-NEXT: vpor %xmm0, %xmm15, %xmm0 12690; AVX512DQ-NEXT: vmovdqa64 416(%rdi), %ymm26 12691; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm30 12692; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %ymm27 12693; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm30 = zmm3 ^ (zmm20 & (zmm30 ^ zmm3)) 12694; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm0 12695; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm26 ^ (ymm0 & (ymm27 ^ ymm26)) 12696; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm3 12697; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9] 12698; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero 12699; AVX512DQ-NEXT: vpor %xmm3, %xmm0, %xmm0 12700; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 12701; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm29 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] 12702; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm29 & (ymm0 ^ ymm8)) 12703; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm20 12704; AVX512DQ-NEXT: vpmovsxwd {{.*#+}} zmm8 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0,0,0,0,0] 12705; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm8 & (zmm20 ^ zmm22)) 12706; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm0 12707; 
AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm17 ^ (ymm0 & (ymm18 ^ ymm17)) 12708; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm3 12709; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u] 12710; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u] 12711; AVX512DQ-NEXT: vpor %xmm3, %xmm0, %xmm0 12712; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm3 12713; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm21 ^ (ymm3 & (ymm16 ^ ymm21)) 12714; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm3[2,3,0,1] 12715; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7,8,9,10],ymm5[11],ymm3[12,13],ymm5[14],ymm3[15] 12716; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] 12717; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm0 & ymm24) 12718; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm0 12719; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm26 ^ (ymm0 & (ymm27 ^ ymm26)) 12720; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm0[5,12],zero,zero 12721; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 12722; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero,xmm0[3,10] 12723; AVX512DQ-NEXT: vpor %xmm5, %xmm0, %xmm0 12724; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 12725; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm29 & (ymm0 ^ ymm3)) 12726; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm22 12727; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm8 & (zmm22 ^ zmm23)) 12728; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm0 12729; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm17 ^ (ymm0 & (ymm18 ^ ymm17)) 12730; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm3 12731; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u] 12732; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u] 12733; AVX512DQ-NEXT: vpor %xmm3, %xmm0, %xmm0 12734; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm3 12735; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm16 ^ (ymm3 & (ymm21 ^ ymm16)) 12736; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm3[2,3,0,1] 12737; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1,2],ymm5[3],ymm3[4,5,6],ymm5[7,8],ymm3[9,10],ymm5[11],ymm3[12,13,14],ymm5[15] 12738; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] 12739; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm0 & ymm24) 12740; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm0 12741; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm26 ^ (ymm0 & (ymm27 ^ ymm26)) 12742; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero 12743; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 12744; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11] 12745; AVX512DQ-NEXT: vpor %xmm5, %xmm0, %xmm0 12746; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 12747; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm29 & (ymm0 ^ ymm3)) 12748; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm23 12749; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm8 & (zmm23 ^ zmm25)) 12750; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm0 12751; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm16 ^ (ymm0 & (ymm21 ^ ymm16)) 12752; 
AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] 12753; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6],ymm3[7,8],ymm0[9,10,11],ymm3[12],ymm0[13,14],ymm3[15] 12754; AVX512DQ-NEXT: vpshufb %ymm2, %ymm0, %ymm0 12755; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm2 12756; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm17 ^ (ymm2 & (ymm18 ^ ymm17)) 12757; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u] 12758; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm2 12759; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u] 12760; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 12761; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm24) | ymm0 12762; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm0 12763; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm27 ^ (ymm0 & (ymm26 ^ ymm27)) 12764; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm3 12765; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12] 12766; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,4,11],zero,zero,xmm0[0,7,14],zero,zero 12767; AVX512DQ-NEXT: vpor %xmm3, %xmm0, %xmm0 12768; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 12769; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm29 & (ymm0 ^ ymm2)) 12770; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm25 12771; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm8 & (zmm25 ^ zmm30)) 12772; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm0 12773; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm16 ^ (ymm0 & (ymm21 ^ ymm16)) 12774; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] 12775; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14,15] 12776; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm19 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] 12777; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm2 12778; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm18 ^ (ymm2 & (ymm17 ^ ymm18)) 12779; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 12780; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u] 12781; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u] 12782; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 12783; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] 12784; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm24) | ymm0 12785; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm0 12786; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm27 ^ (ymm0 & (ymm26 ^ ymm27)) 12787; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm3 12788; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13] 12789; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero 12790; AVX512DQ-NEXT: vpor %xmm3, %xmm0, %xmm0 12791; AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm24 12792; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm24 = ymm24 ^ (ymm29 & (ymm24 ^ ymm2)) 12793; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm0 12794; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm16 ^ ymm21)) 12795; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] 12796; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] 12797; AVX512DQ-NEXT: 
vmovdqa %ymm7, %ymm2 12798; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm18 ^ (ymm2 & (ymm17 ^ ymm18)) 12799; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 12800; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u] 12801; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u] 12802; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 12803; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,128,128,128,128,128,128,128,128,128,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] 12804; AVX512DQ-NEXT: vpshufb %ymm8, %ymm0, %ymm0 12805; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ~mem) | ymm0 12806; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm0 12807; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm27 ^ (ymm0 & (ymm26 ^ ymm27)) 12808; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero 12809; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 12810; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14] 12811; AVX512DQ-NEXT: vpor %xmm3, %xmm0, %xmm0 12812; AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm30 12813; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm30 = ymm30 ^ (ymm29 & (ymm30 ^ ymm2)) 12814; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm0 12815; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm13 ^ ymm12)) 12816; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm2 12817; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u] 12818; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u] 12819; AVX512DQ-NEXT: vporq %xmm2, %xmm0, %xmm29 12820; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm19 = ymm26 ^ (ymm19 & (ymm27 ^ ymm26)) 12821; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm0 12822; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm13 ^ ymm12)) 12823; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm2 12824; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u,u,u,u] 12825; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u] 12826; AVX512DQ-NEXT: vpor %xmm2, %xmm0, %xmm0 12827; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm2 12828; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm15 12829; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm12 ^ (ymm9 & (ymm13 ^ ymm12)) 12830; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm28 ^ (ymm2 & (ymm31 ^ ymm28)) 12831; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm9[6,13],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u] 12832; AVX512DQ-NEXT: vextracti128 $1, %ymm9, %xmm5 12833; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[4,11],zero,zero,xmm5[0,7,14,u,u,u,u,u,u,u] 12834; AVX512DQ-NEXT: vpor %xmm3, %xmm5, %xmm3 12835; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm5 12836; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm17 = ymm18 ^ (ymm14 & (ymm17 ^ ymm18)) 12837; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm9 12838; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm31 ^ (ymm9 & (ymm28 ^ ymm31)) 12839; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = ymm31 ^ (ymm14 & (ymm28 ^ ymm31)) 12840; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 12841; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2,3],ymm4[4],ymm2[5,6],ymm4[7,8],ymm2[9,10,11],ymm4[12],ymm2[13,14],ymm4[15] 12842; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 12843; 
AVX512DQ-NEXT: vmovdqa %ymm11, %ymm1 12844; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 | (ymm29 & ymm11) 12845; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm4[1],ymm9[2,3],ymm4[4],ymm9[5,6,7,8],ymm4[9],ymm9[10,11],ymm4[12],ymm9[13,14,15] 12846; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 12847; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm0 & ymm1) 12848; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm4[1],ymm14[2,3,4],ymm4[5],ymm14[6,7,8],ymm4[9],ymm14[10,11,12],ymm4[13],ymm14[14,15] 12849; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 12850; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm3 & ymm1) 12851; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload 12852; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 12853; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm6 ^ (ymm5 & (ymm13 ^ ymm6)) 12854; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[u,u,2,9],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[u,u,u,u,u] 12855; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm2 12856; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u] 12857; AVX512DQ-NEXT: vpor %xmm0, %xmm2, %xmm0 12858; AVX512DQ-NEXT: vmovdqa %xmm10, %xmm1 12859; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] 12860; AVX512DQ-NEXT: vpshufb %xmm2, %xmm10, %xmm2 12861; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 12862; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero 12863; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2 12864; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 12865; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 12866; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm18 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215] 12867; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm18 & (ymm2 ^ ymm0)) 12868; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm16 = ymm21 ^ (ymm7 & (ymm16 ^ ymm21)) 12869; AVX512DQ-NEXT: vmovd {{.*#+}} xmm10 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] 12870; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm15 = ymm6 ^ (ymm15 & (ymm13 ^ ymm6)) 12871; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ymm13 ^ (ymm7 & (ymm6 ^ ymm13)) 12872; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 12873; AVX512DQ-NEXT: vpshufb %xmm10, %xmm14, %xmm0 12874; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 12875; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 12876; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] 12877; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm2 12878; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] 12879; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm0 & (zmm2 ^ zmm12)) 12880; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[u,u,3,10],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[u,u,u,u,u] 12881; AVX512DQ-NEXT: vextracti128 $1, %ymm15, %xmm5 12882; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = 
xmm5[u,u],zero,zero,xmm5[1,8,15],zero,zero,xmm5[4,11,u,u,u,u,u] 12883; AVX512DQ-NEXT: vpor %xmm3, %xmm5, %xmm3 12884; AVX512DQ-NEXT: vpshufb %xmm8, %xmm1, %xmm5 12885; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero 12886; AVX512DQ-NEXT: vpor %xmm5, %xmm8, %xmm5 12887; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 12888; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 12889; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (ymm18 & (ymm5 ^ ymm3)) 12890; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm3 12891; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u] 12892; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,4,11],zero,zero,xmm7[0,7,14],zero,zero,xmm7[u,u,u,u,u] 12893; AVX512DQ-NEXT: vpor %xmm3, %xmm7, %xmm3 12894; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,8,15,22,29,20,27,18,25,128,128,128,128,128,128,128,128,128,128] 12895; AVX512DQ-NEXT: vpshufb %xmm7, %xmm1, %xmm8 12896; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero 12897; AVX512DQ-NEXT: vpor %xmm12, %xmm8, %xmm8 12898; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 12899; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 12900; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 ^ (ymm18 & (ymm8 ^ ymm3)) 12901; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 12902; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm14[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 12903; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3] 12904; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3 12905; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm0 & (zmm3 ^ zmm11)) 12906; AVX512DQ-NEXT: vpshufb %xmm10, %xmm13, %xmm5 12907; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm14[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 12908; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] 12909; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm8, %zmm5 12910; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm0 & (zmm5 ^ zmm9)) 12911; AVX512DQ-NEXT: movw $-512, %ax # imm = 0xFE00 12912; AVX512DQ-NEXT: kmovw %eax, %k1 12913; AVX512DQ-NEXT: vinserti32x8 $1, %ymm24, %zmm0, %zmm2 {%k1} 12914; AVX512DQ-NEXT: vinserti32x8 $1, %ymm30, %zmm0, %zmm3 {%k1} 12915; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm1 12916; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm16[2,3,0,1] 12917; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] 12918; AVX512DQ-NEXT: vpshufb %ymm7, %ymm0, %ymm0 12919; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm1 12920; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u] 12921; AVX512DQ-NEXT: vextracti32x4 $1, %ymm17, %xmm1 12922; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u] 12923; AVX512DQ-NEXT: vpor %xmm6, %xmm1, %xmm1 12924; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & mem) | ymm0 12925; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm4 12926; AVX512DQ-NEXT: vextracti32x4 $1, %ymm19, %xmm0 12927; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15] 12928; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,0,7,14],zero,zero,xmm4[3,10],zero,zero,zero 12929; AVX512DQ-NEXT: vpor %xmm0, %xmm4, %xmm0 12930; AVX512DQ-NEXT: vinserti128 
$1, %xmm0, %ymm0, %ymm0 12931; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] 12932; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 12933; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm5 {%k1} 12934; AVX512DQ-NEXT: vmovdqa64 %zmm20, (%rsi) 12935; AVX512DQ-NEXT: vmovdqa64 %zmm22, (%rdx) 12936; AVX512DQ-NEXT: vmovdqa64 %zmm23, (%rcx) 12937; AVX512DQ-NEXT: vmovdqa64 %zmm25, (%r8) 12938; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%r9) 12939; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 12940; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%rax) 12941; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 12942; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%rax) 12943; AVX512DQ-NEXT: addq $24, %rsp 12944; AVX512DQ-NEXT: vzeroupper 12945; AVX512DQ-NEXT: retq 12946; 12947; AVX512DQ-FCP-LABEL: load_i8_stride7_vf64: 12948; AVX512DQ-FCP: # %bb.0: 12949; AVX512DQ-FCP-NEXT: pushq %rax 12950; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] 12951; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm11 12952; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm12 12953; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm27 12954; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm0 12955; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm11 ^ ymm12)) 12956; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 12957; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u] 12958; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u] 12959; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm1 12960; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] 12961; AVX512DQ-FCP-NEXT: vmovdqa64 96(%rdi), %ymm31 12962; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm2 12963; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm27 ^ (ymm2 & (ymm31 ^ ymm27)) 12964; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm3 12965; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] 12966; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm20 12967; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 12968; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm1 & mem) 12969; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] 12970; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm28 12971; AVX512DQ-FCP-NEXT: vmovdqa64 160(%rdi), %ymm30 12972; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm1 12973; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm28 ^ (ymm1 & (ymm30 ^ ymm28)) 12974; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 12975; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u] 12976; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] 12977; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm1, %xmm1 12978; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 12979; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,2,4,6] 12980; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm22 12981; AVX512DQ-FCP-NEXT: vpermd %ymm22, %ymm4, %ymm4 12982; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = 
ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] 12983; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] 12984; AVX512DQ-FCP-NEXT: vmovdqa 240(%rdi), %xmm4 12985; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm4[5,12,u,u,u,u,u,u,u,u,u,u,u] 12986; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %xmm5 12987; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] 12988; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 12989; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm1, %zmm7 12990; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 12991; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm0 & (zmm7 ^ zmm2)) 12992; AVX512DQ-FCP-NEXT: vmovdqa64 288(%rdi), %ymm16 12993; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm1 12994; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm2 12995; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm16 ^ (ymm2 & (ymm1 ^ ymm16)) 12996; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u,3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u] 12997; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 12998; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u] 12999; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm10, %xmm10 13000; AVX512DQ-FCP-NEXT: vmovdqa 352(%rdi), %ymm14 13001; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %ymm2 13002; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm13 13003; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm14 ^ (ymm13 & (ymm2 ^ ymm14)) 13004; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm13[2,3,0,1] 13005; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm13[0,1],ymm8[2],ymm13[3,4,5],ymm8[6],ymm13[7,8,9],ymm8[10],ymm13[11,12,13],ymm8[14],ymm13[15] 13006; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] 13007; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm26 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] 13008; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm10 & ymm26) 13009; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] 13010; AVX512DQ-FCP-NEXT: vmovdqa64 416(%rdi), %ymm17 13011; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %ymm18 13012; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm13 13013; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm17 ^ (ymm13 & (ymm18 ^ ymm17)) 13014; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15 13015; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9] 13016; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero 13017; AVX512DQ-FCP-NEXT: vpor %xmm15, %xmm13, %xmm13 13018; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 13019; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm21 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] 13020; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm21 & (ymm13 ^ ymm8)) 13021; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm3 13022; AVX512DQ-FCP-NEXT: vpmovsxwd {{.*#+}} zmm29 = 
[4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0,0,0,0,0] 13023; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm29 & (zmm3 ^ zmm7)) 13024; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 13025; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm7 13026; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm12 ^ (ymm7 & (ymm11 ^ ymm12)) 13027; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 13028; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[6,13],zero,zero,xmm8[2,9,u,u,u,u,u,u,u] 13029; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[1,8,15],zero,zero,xmm7[4,11],zero,zero,xmm7[u,u,u,u,u,u,u] 13030; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 13031; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm8 13032; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm31 ^ (ymm8 & (ymm27 ^ ymm31)) 13033; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm3 13034; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm3[2],ymm8[3,4,5],ymm3[6],ymm8[7,8,9],ymm3[10],ymm8[11,12,13],ymm3[14],ymm8[15] 13035; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] 13036; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm7 & ~mem) 13037; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm7 13038; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm28 ^ (ymm7 & (ymm30 ^ ymm28)) 13039; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[u,u,u,6,13],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[u,u,u,u] 13040; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 13041; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u],zero,zero,xmm7[4,11],zero,zero,xmm7[0,7,14,u,u,u,u] 13042; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm13, %xmm7 13043; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 13044; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,0,0,0,1,3,4,6] 13045; AVX512DQ-FCP-NEXT: vpermd %ymm22, %ymm13, %ymm13 13046; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] 13047; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm13[7] 13048; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm4[6,13,u,u,u,u,u,u,u,u,u,u,u] 13049; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[1,8,15],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] 13050; AVX512DQ-FCP-NEXT: vpor %xmm13, %xmm15, %xmm13 13051; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm13, %zmm7, %zmm7 13052; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm0 & (zmm7 ^ zmm8)) 13053; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm8 13054; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm1 ^ (ymm8 & (ymm16 ^ ymm1)) 13055; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm13 13056; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[5,12,u,u] 13057; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u] 13058; AVX512DQ-FCP-NEXT: vpor %xmm13, %xmm8, %xmm8 13059; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm13 13060; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm14 ^ (ymm13 & (ymm2 ^ ymm14)) 13061; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1] 13062; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7,8,9,10],ymm15[11],ymm13[12,13],ymm15[14],ymm13[15] 13063; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] 13064; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm8 & ymm26) 13065; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm8 13066; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm17 ^ (ymm8 & (ymm18 ^ ymm17)) 13067; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm8[5,12],zero,zero 13068; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8 13069; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10] 13070; AVX512DQ-FCP-NEXT: vpor %xmm15, %xmm8, %xmm8 13071; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 13072; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 ^ (ymm21 & (ymm8 ^ ymm13)) 13073; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm24 13074; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm24 = zmm24 ^ (zmm29 & (zmm24 ^ zmm7)) 13075; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm7 13076; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm12 ^ (ymm7 & (ymm11 ^ ymm12)) 13077; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u,u,u] 13078; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 13079; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u,u,u] 13080; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 13081; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm8 13082; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm31 ^ (ymm8 & (ymm27 ^ ymm31)) 13083; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm3[3],ymm8[4,5],ymm3[6],ymm8[7,8,9,10],ymm3[11],ymm8[12,13],ymm3[14],ymm8[15] 13084; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm0 13085; AVX512DQ-FCP-NEXT: vmovdqu64 %ymm20, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 13086; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 13087; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm23 = [18446744073709551615,255,18446744073709486080,18446744073709551615] 13088; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm7 & ymm23) 13089; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm7 13090; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm30 ^ (ymm7 & (ymm28 ^ ymm30)) 13091; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 13092; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[1,8,15,u,u,u,u] 13093; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,0,7,14],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[u,u,u,u] 13094; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 13095; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 13096; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,1,3,5,6] 13097; AVX512DQ-FCP-NEXT: vpermd %ymm22, %ymm8, %ymm3 13098; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] 13099; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5,6],ymm3[7] 13100; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[2,9],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] 13101; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u,u,u,u,u] 13102; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 13103; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm3, %zmm3 13104; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm22 = 
[0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] 13105; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm13 ^ (zmm22 & (zmm3 ^ zmm13)) 13106; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm7 13107; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm1 ^ (ymm7 & (ymm16 ^ ymm1)) 13108; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm13 13109; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u],zero,zero,xmm13[3,10],zero,zero,zero,xmm13[6,13,u,u] 13110; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,5,12],zero,zero,xmm7[1,8,15],zero,zero,xmm7[u,u] 13111; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm13, %xmm7 13112; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm13 13113; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm2 ^ (ymm13 & (ymm14 ^ ymm2)) 13114; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1] 13115; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0],ymm13[1,2],ymm15[3],ymm13[4,5,6],ymm15[7,8],ymm13[9,10],ymm15[11],ymm13[12,13,14],ymm15[15] 13116; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] 13117; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm7 & ymm26) 13118; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm7 13119; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm17 ^ (ymm7 & (ymm18 ^ ymm17)) 13120; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero 13121; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 13122; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11] 13123; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm15, %xmm7 13124; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 13125; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm21 & (ymm7 ^ ymm13)) 13126; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm20 13127; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm29 & (zmm20 ^ zmm3)) 13128; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm3 13129; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm12 ^ (ymm3 & (ymm11 ^ ymm12)) 13130; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[3,10],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[u,u,u,u,u,u,u] 13131; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 13132; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11,u,u,u,u,u,u,u] 13133; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 13134; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm7 13135; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm31 ^ (ymm7 & (ymm27 ^ ymm31)) 13136; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm7[1,2],ymm0[3],ymm7[4,5,6],ymm0[7,8],ymm7[9,10],ymm0[11],ymm7[12,13,14],ymm0[15] 13137; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 13138; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm3 & ymm23) 13139; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm3 13140; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm30 ^ (ymm3 & (ymm28 ^ ymm30)) 13141; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm13 13142; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[2,9,u,u,u,u,u] 13143; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u,u,u] 13144; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm13, %xmm13 13145; 
AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] 13146; AVX512DQ-FCP-NEXT: vmovdqa 208(%rdi), %xmm0 13147; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13148; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm0, %xmm8 13149; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %xmm0 13150; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 13151; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero 13152; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm8, %xmm3 13153; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm8 13154; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 13155; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm25 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215] 13156; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm25 & (ymm3 ^ ymm8)) 13157; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] 13158; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,8,15,u,u,u,u,u,u,u,u,u,u,u] 13159; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 13160; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm4, %zmm3, %zmm3 13161; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm7 ^ (zmm22 & (zmm3 ^ zmm7)) 13162; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm4 13163; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm2 ^ (ymm4 & (ymm14 ^ ymm2)) 13164; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] 13165; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3],ymm5[4],ymm4[5,6],ymm5[7,8],ymm4[9,10,11],ymm5[12],ymm4[13,14],ymm5[15] 13166; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm4, %ymm4 13167; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm5 13168; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm1 ^ (ymm5 & (ymm16 ^ ymm1)) 13169; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[u,u,u,u,u,6,13],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[u,u] 13170; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 13171; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u],zero,zero,xmm5[4,11],zero,zero,xmm5[0,7,14,u,u] 13172; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm5, %xmm5 13173; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (ymm5 & ymm26) | ymm4 13174; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm4 13175; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm18 ^ (ymm4 & (ymm17 ^ ymm18)) 13176; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm7 13177; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12] 13178; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,4,11],zero,zero,xmm4[0,7,14],zero,zero 13179; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm4, %xmm4 13180; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 13181; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm21 & (ymm4 ^ ymm5)) 13182; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm22 13183; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm29 & (zmm22 ^ zmm3)) 13184; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm3 13185; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm2 ^ (ymm3 & (ymm14 ^ ymm2)) 13186; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] 13187; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6,7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14,15] 13188; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] 
13189; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm4 13190; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm16 ^ (ymm4 & (ymm1 ^ ymm16)) 13191; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 13192; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u] 13193; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,0,7,14],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[u,u] 13194; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 13195; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = (ymm4 & ymm26) | ymm3 13196; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm3 13197; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm18 ^ (ymm3 & (ymm17 ^ ymm18)) 13198; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 13199; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13] 13200; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero 13201; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3 13202; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm26 13203; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm26 = ymm26 ^ (ymm21 & (ymm26 ^ ymm4)) 13204; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm3 13205; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm14 ^ (ymm3 & (ymm2 ^ ymm14)) 13206; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] 13207; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] 13208; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,128,128,128,128,128,128,128,128,128,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] 13209; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3 13210; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm4 13211; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm16 ^ (ymm4 & (ymm1 ^ ymm16)) 13212; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm7 13213; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u] 13214; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,1,8,15],zero,zero,xmm4[4,11],zero,zero,xmm4[u,u,u] 13215; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm4, %xmm7 13216; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = (ymm7 & ~mem) | ymm3 13217; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm3 13218; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm18 ^ (ymm3 & (ymm17 ^ ymm18)) 13219; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero 13220; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 13221; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14] 13222; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 13223; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm29 13224; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm29 = ymm29 ^ (ymm21 & (ymm29 ^ ymm7)) 13225; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm13 13226; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm11 ^ (ymm13 & (ymm12 ^ ymm11)) 13227; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm8 13228; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm19 = ymm17 ^ (ymm19 & (ymm18 ^ ymm17)) 13229; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm3 13230; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm11 ^ (ymm3 & (ymm12 ^ ymm11)) 13231; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm7 13232; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm11 ^ (ymm6 & (ymm12 ^ ymm11)) 13233; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm11 13234; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = 
zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u] 13235; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[4,11],zero,zero,xmm13[0,7,14],zero,zero,xmm13[u,u,u,u,u,u,u] 13236; AVX512DQ-FCP-NEXT: vpor %xmm11, %xmm12, %xmm13 13237; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm31 ^ (ymm8 & (ymm27 ^ ymm31)) 13238; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm11 13239; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u] 13240; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u,u,u,u,u,u] 13241; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm11, %xmm3 13242; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm12 13243; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm16 ^ (ymm9 & (ymm1 ^ ymm16)) 13244; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm15 13245; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm27 ^ (ymm15 & (ymm31 ^ ymm27)) 13246; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm27 ^ (ymm9 & (ymm31 ^ ymm27)) 13247; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm6[6,13],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[u,u,u,u,u,u,u] 13248; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm0 13249; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u] 13250; AVX512DQ-FCP-NEXT: vporq %xmm11, %xmm0, %xmm16 13251; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 13252; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm8[1,2,3],ymm4[4],ymm8[5,6],ymm4[7,8],ymm8[9,10,11],ymm4[12],ymm8[13,14],ymm4[15] 13253; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 13254; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm13 & ymm23) 13255; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0],ymm4[1],ymm15[2,3],ymm4[4],ymm15[5,6,7,8],ymm4[9],ymm15[10,11],ymm4[12],ymm15[13,14,15] 13256; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 13257; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm3 & ymm23) 13258; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm4[1],ymm9[2,3,4],ymm4[5],ymm9[6,7,8],ymm4[9],ymm9[10,11,12],ymm4[13],ymm9[14,15] 13259; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 13260; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm16 & ymm23) 13261; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm14 ^ (ymm10 & (ymm2 ^ ymm14)) 13262; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm30 ^ (ymm12 & (ymm28 ^ ymm30)) 13263; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm30 ^ (ymm7 & (ymm28 ^ ymm30)) 13264; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm28 ^ (ymm10 & (ymm30 ^ ymm28)) 13265; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u] 13266; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm9 13267; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u],zero,zero,xmm9[0,7,14],zero,zero,xmm9[3,10,u,u,u,u,u] 13268; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm9, %xmm3 13269; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 13270; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = 
[128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] 13271; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm9 13272; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 13273; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm6[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero 13274; AVX512DQ-FCP-NEXT: vpor %xmm9, %xmm12, %xmm9 13275; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 13276; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 13277; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 ^ (ymm25 & (ymm9 ^ ymm3)) 13278; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[u,u,u,u,u] 13279; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 13280; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11,u,u,u,u,u] 13281; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm7, %xmm3 13282; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm5 13283; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero 13284; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm5, %xmm5 13285; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 13286; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm7 13287; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm25 & (ymm7 ^ ymm3)) 13288; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm3 13289; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u] 13290; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm10[u,u,4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u] 13291; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm5, %xmm3 13292; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,8,15,22,29,20,27,18,25,128,128,128,128,128,128,128,128,128,128] 13293; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm10 13294; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm6[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero 13295; AVX512DQ-FCP-NEXT: vpor %xmm12, %xmm10, %xmm10 13296; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 13297; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 13298; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm10 ^ (ymm25 & (ymm10 ^ ymm3)) 13299; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,2,4,6,0,0,0,0] 13300; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm12 13301; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm3, %ymm3 13302; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u] 13303; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm9, %zmm3 13304; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] 13305; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm9 & (zmm3 ^ zmm11)) 13306; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,3,4,6,0,0,0,0] 13307; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm11, %ymm11 13308; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u] 13309; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm7, %zmm7 13310; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm9 & (zmm7 ^ zmm8)) 13311; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,3,5,6,0,0,0,0] 13312; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm8, %ymm8 13313; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u] 13314; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm10, %zmm8 13315; 
AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm9 & (zmm8 ^ zmm0)) 13316; AVX512DQ-FCP-NEXT: movw $-512, %ax # imm = 0xFE00 13317; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 13318; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm26, %zmm0, %zmm3 {%k1} 13319; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm29, %zmm0, %zmm7 {%k1} 13320; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm2[2,3,0,1] 13321; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15] 13322; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0 13323; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u] 13324; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 13325; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u] 13326; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 13327; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & mem) | ymm0 13328; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm2 13329; AVX512DQ-FCP-NEXT: vextracti32x4 $1, %ymm19, %xmm0 13330; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15] 13331; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero 13332; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0 13333; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 13334; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] 13335; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 13336; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm8 {%k1} 13337; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 13338; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%rsi) 13339; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, (%rdx) 13340; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, (%rcx) 13341; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, (%r8) 13342; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%r9) 13343; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 13344; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rax) 13345; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 13346; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%rax) 13347; AVX512DQ-FCP-NEXT: popq %rax 13348; AVX512DQ-FCP-NEXT: vzeroupper 13349; AVX512DQ-FCP-NEXT: retq 13350; 13351; AVX512BW-LABEL: load_i8_stride7_vf64: 13352; AVX512BW: # %bb.0: 13353; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm25 13354; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [8,1,18,11,4,5,22,15,0,25,10,0,12,29,14,0] 13355; AVX512BW-NEXT: vpermw %zmm25, %zmm0, %zmm18 13356; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,1,18,11,4,21,14,7,8,25,10,0,28,13,0,15] 13357; AVX512BW-NEXT: vpermw %zmm25, %zmm0, %zmm24 13358; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,17,10,3,4,21,14,7,24,9,0,11,28,13,0,31] 13359; AVX512BW-NEXT: vpermw %zmm25, %zmm0, %zmm9 13360; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [16,17,10,3,20,13,6,23,24,25,0,27,28,0,30,31] 13361; AVX512BW-NEXT: vpermw %zmm25, %zmm0, %zmm0 13362; AVX512BW-NEXT: vmovdqa (%rdi), %ymm10 13363; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1 13364; AVX512BW-NEXT: movw $-28382, %ax # imm = 0x9122 13365; AVX512BW-NEXT: kmovd %eax, %k1 13366; AVX512BW-NEXT: vpblendmw %ymm1, %ymm10, %ymm3 {%k1} 13367; AVX512BW-NEXT: kmovq %k1, %k2 13368; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 13369; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm4 13370; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u,u,u,u,u] 13371; 
AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u,u,u,u,u] 13372; AVX512BW-NEXT: vporq %xmm4, %xmm3, %xmm16 13373; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] 13374; AVX512BW-NEXT: movw $992, %ax # imm = 0x3E0 13375; AVX512BW-NEXT: kmovd %eax, %k1 13376; AVX512BW-NEXT: vmovdqu16 %ymm0, %ymm16 {%k1} 13377; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm11 13378; AVX512BW-NEXT: vmovdqa 160(%rdi), %ymm6 13379; AVX512BW-NEXT: movw $8772, %ax # imm = 0x2244 13380; AVX512BW-NEXT: kmovd %eax, %k6 13381; AVX512BW-NEXT: vpblendmw %ymm11, %ymm6, %ymm0 {%k6} 13382; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm3 13383; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u] 13384; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u] 13385; AVX512BW-NEXT: vpor %xmm3, %xmm0, %xmm0 13386; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 13387; AVX512BW-NEXT: vmovdqa 192(%rdi), %xmm7 13388; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm21 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] 13389; AVX512BW-NEXT: vpshufb %xmm21, %xmm7, %xmm3 13390; AVX512BW-NEXT: vmovdqa 208(%rdi), %xmm8 13391; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] 13392; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] 13393; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 13394; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] 13395; AVX512BW-NEXT: vmovdqa64 240(%rdi), %xmm26 13396; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm26[5,12,u,u,u,u,u,u,u,u,u,u,u] 13397; AVX512BW-NEXT: vmovdqa 224(%rdi), %xmm4 13398; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[0,7,14],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] 13399; AVX512BW-NEXT: vpor %xmm5, %xmm12, %xmm5 13400; AVX512BW-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm0 13401; AVX512BW-NEXT: movabsq $137438429184, %rax # imm = 0x1FFFF80000 13402; AVX512BW-NEXT: kmovq %rax, %k5 13403; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm16 {%k5} 13404; AVX512BW-NEXT: vmovdqa 288(%rdi), %ymm13 13405; AVX512BW-NEXT: vmovdqa 256(%rdi), %ymm12 13406; AVX512BW-NEXT: movw $9288, %ax # imm = 0x2448 13407; AVX512BW-NEXT: kmovd %eax, %k3 13408; AVX512BW-NEXT: vpblendmw %ymm13, %ymm12, %ymm0 {%k3} 13409; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u] 13410; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0 13411; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u] 13412; AVX512BW-NEXT: vporq %xmm5, %xmm0, %xmm19 13413; AVX512BW-NEXT: vmovdqa64 352(%rdi), %ymm17 13414; AVX512BW-NEXT: vmovdqa 320(%rdi), %ymm0 13415; AVX512BW-NEXT: vpblendmw %ymm17, %ymm0, %ymm5 {%k6} 13416; AVX512BW-NEXT: vpermq {{.*#+}} ymm15 = ymm5[2,3,0,1] 13417; AVX512BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm15[2],ymm5[3,4,5],ymm15[6],ymm5[7,8,9],ymm15[10],ymm5[11,12,13],ymm15[14],ymm5[15] 13418; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] 13419; AVX512BW-NEXT: movw $3968, %ax # imm = 0xF80 13420; AVX512BW-NEXT: kmovd %eax, %k7 13421; AVX512BW-NEXT: vmovdqu16 %ymm5, %ymm19 {%k7} 13422; AVX512BW-NEXT: vmovdqa 416(%rdi), %ymm15 13423; AVX512BW-NEXT: vmovdqa 384(%rdi), %ymm5 13424; AVX512BW-NEXT: movw $4644, %ax # imm = 0x1224 13425; AVX512BW-NEXT: kmovd %eax, %k4 
13426; AVX512BW-NEXT: vpblendmw %ymm15, %ymm5, %ymm20 {%k4} 13427; AVX512BW-NEXT: vextracti32x4 $1, %ymm20, %xmm22 13428; AVX512BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9] 13429; AVX512BW-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero 13430; AVX512BW-NEXT: vporq %xmm22, %xmm20, %xmm20 13431; AVX512BW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 13432; AVX512BW-NEXT: movl $-8388608, %eax # imm = 0xFF800000 13433; AVX512BW-NEXT: vpblendmw %ymm1, %ymm10, %ymm22 {%k4} 13434; AVX512BW-NEXT: vextracti32x4 $1, %ymm22, %xmm23 13435; AVX512BW-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u] 13436; AVX512BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[1,8,15],zero,zero,xmm22[4,11],zero,zero,xmm22[u,u,u,u,u,u,u] 13437; AVX512BW-NEXT: vporq %xmm23, %xmm22, %xmm22 13438; AVX512BW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] 13439; AVX512BW-NEXT: movl $511, %edi # imm = 0x1FF 13440; AVX512BW-NEXT: kmovd %edi, %k1 13441; AVX512BW-NEXT: vmovdqu8 %ymm22, %ymm9 {%k1} 13442; AVX512BW-NEXT: vpblendmw %ymm11, %ymm6, %ymm22 {%k3} 13443; AVX512BW-NEXT: vpshufb {{.*#+}} xmm23 = xmm22[u,u,u,6,13],zero,zero,xmm22[2,9],zero,zero,zero,xmm22[u,u,u,u] 13444; AVX512BW-NEXT: vextracti32x4 $1, %ymm22, %xmm22 13445; AVX512BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u],zero,zero,xmm22[4,11],zero,zero,xmm22[0,7,14,u,u,u,u] 13446; AVX512BW-NEXT: vporq %xmm23, %xmm22, %xmm22 13447; AVX512BW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm14 13448; AVX512BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm8[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] 13449; AVX512BW-NEXT: vpshufb {{.*#+}} xmm23 = xmm7[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] 13450; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm22 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3] 13451; AVX512BW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm2 13452; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,6],ymm2[7] 13453; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm26[6,13,u,u,u,u,u,u,u,u,u,u,u] 13454; AVX512BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm4[1,8,15],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] 13455; AVX512BW-NEXT: vporq %xmm14, %xmm22, %xmm14 13456; AVX512BW-NEXT: vinserti32x4 $2, %xmm14, %zmm2, %zmm2 13457; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm9 {%k5} 13458; AVX512BW-NEXT: vpblendmw %ymm1, %ymm10, %ymm2 {%k6} 13459; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u] 13460; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2 13461; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u,u,u] 13462; AVX512BW-NEXT: vpor %xmm2, %xmm14, %xmm2 13463; AVX512BW-NEXT: movl $261632, %edi # imm = 0x3FE00 13464; AVX512BW-NEXT: kmovd %edi, %k5 13465; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm24[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 13466; AVX512BW-NEXT: vpblendmw %ymm6, %ymm11, %ymm14 {%k2} 13467; AVX512BW-NEXT: vextracti32x4 $1, %ymm14, %xmm22 13468; AVX512BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u],zero,zero,zero,xmm22[5,12],zero,zero,xmm22[1,8,15,u,u,u,u] 13469; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u] 13470; AVX512BW-NEXT: vporq %xmm22, %xmm14, %xmm14 13471; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 13472; AVX512BW-NEXT: vpshufb %xmm21, %xmm8, %xmm21 13473; AVX512BW-NEXT: vpshufb 
{{.*#+}} xmm22 = xmm7[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] 13474; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm21 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3] 13475; AVX512BW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm3 13476; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5,6],ymm3[7] 13477; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[2,9],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] 13478; AVX512BW-NEXT: vpshufb {{.*#+}} xmm21 = zero,zero,xmm26[0,7,14,u,u,u,u,u,u,u,u,u,u,u] 13479; AVX512BW-NEXT: vporq %xmm14, %xmm21, %xmm14 13480; AVX512BW-NEXT: vinserti32x4 $2, %xmm14, %zmm3, %zmm22 13481; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm22 {%k1} 13482; AVX512BW-NEXT: vpblendmw %ymm1, %ymm10, %ymm2 {%k3} 13483; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u] 13484; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2 13485; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u] 13486; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 13487; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 13488; AVX512BW-NEXT: vpblendmw %ymm6, %ymm11, %ymm3 {%k4} 13489; AVX512BW-NEXT: vextracti32x4 $1, %ymm3, %xmm18 13490; AVX512BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u],zero,zero,zero,xmm18[6,13],zero,zero,xmm18[2,9,u,u,u,u,u] 13491; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u,u,u] 13492; AVX512BW-NEXT: vporq %xmm18, %xmm3, %xmm3 13493; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 13494; AVX512BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[5,12] 13495; AVX512BW-NEXT: vpshufb {{.*#+}} xmm21 = xmm7[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero 13496; AVX512BW-NEXT: vporq %xmm18, %xmm21, %xmm18 13497; AVX512BW-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 13498; AVX512BW-NEXT: movl $-134217728, %edi # imm = 0xF8000000 13499; AVX512BW-NEXT: kmovd %edi, %k2 13500; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 13501; AVX512BW-NEXT: vmovdqu8 %ymm18, %ymm3 {%k2} 13502; AVX512BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm4[3,10],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] 13503; AVX512BW-NEXT: vpshufb {{.*#+}} xmm21 = zero,zero,xmm26[1,8,15,u,u,u,u,u,u,u,u,u,u,u] 13504; AVX512BW-NEXT: vporq %xmm18, %xmm21, %xmm18 13505; AVX512BW-NEXT: vinserti32x4 $2, %xmm18, %zmm3, %zmm18 13506; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm18 {%k1} 13507; AVX512BW-NEXT: kmovd %eax, %k2 13508; AVX512BW-NEXT: vmovdqu8 %ymm20, %ymm19 {%k2} 13509; AVX512BW-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm2 13510; AVX512BW-NEXT: movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000 13511; AVX512BW-NEXT: kmovq %rax, %k1 13512; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm16 {%k1} 13513; AVX512BW-NEXT: vpblendmw %ymm12, %ymm13, %ymm2 {%k4} 13514; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 13515; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u] 13516; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u] 13517; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 13518; AVX512BW-NEXT: vpblendmw %ymm17, %ymm0, %ymm3 {%k3} 13519; AVX512BW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1] 13520; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7,8,9,10],ymm14[11],ymm3[12,13],ymm14[14],ymm3[15] 13521; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = 
ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] 13522; AVX512BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k7} 13523; AVX512BW-NEXT: vpblendmw %ymm15, %ymm5, %ymm3 {%k6} 13524; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero 13525; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm3 13526; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10] 13527; AVX512BW-NEXT: vpor %xmm3, %xmm14, %xmm3 13528; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 13529; AVX512BW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k2} 13530; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 13531; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm9 {%k1} 13532; AVX512BW-NEXT: vpblendmw %ymm12, %ymm13, %ymm2 {%k6} 13533; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 13534; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u] 13535; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero,xmm2[u,u] 13536; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 13537; AVX512BW-NEXT: vpblendmw %ymm0, %ymm17, %ymm3 {%k4} 13538; AVX512BW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1] 13539; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm3[1,2],ymm14[3],ymm3[4,5,6],ymm14[7,8],ymm3[9,10],ymm14[11],ymm3[12,13,14],ymm14[15] 13540; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] 13541; AVX512BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k7} 13542; AVX512BW-NEXT: vpblendmw %ymm15, %ymm5, %ymm3 {%k3} 13543; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero 13544; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm3 13545; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11] 13546; AVX512BW-NEXT: vpor %xmm3, %xmm14, %xmm3 13547; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 13548; AVX512BW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k2} 13549; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 13550; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm22 {%k1} 13551; AVX512BW-NEXT: vpblendmw %ymm12, %ymm13, %ymm2 {%k3} 13552; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u] 13553; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2 13554; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u] 13555; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 13556; AVX512BW-NEXT: vpblendmw %ymm0, %ymm17, %ymm3 {%k6} 13557; AVX512BW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1] 13558; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm3[1,2,3],ymm14[4],ymm3[5,6],ymm14[7,8],ymm3[9,10,11],ymm14[12],ymm3[13,14],ymm14[15] 13559; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] 13560; AVX512BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k7} 13561; AVX512BW-NEXT: vpblendmw %ymm5, %ymm15, %ymm3 {%k4} 13562; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm14 13563; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12] 13564; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero 13565; AVX512BW-NEXT: vpor %xmm3, %xmm14, %xmm3 13566; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 13567; AVX512BW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k2} 13568; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 13569; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm18 {%k1} 
13570; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 13571; AVX512BW-NEXT: vpblendmw %ymm13, %ymm12, %ymm2 {%k1} 13572; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 13573; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u] 13574; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u] 13575; AVX512BW-NEXT: vporq %xmm3, %xmm2, %xmm19 13576; AVX512BW-NEXT: vpblendmw %ymm0, %ymm17, %ymm2 {%k3} 13577; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] 13578; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14,15] 13579; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] 13580; AVX512BW-NEXT: vmovdqu16 %ymm2, %ymm19 {%k7} 13581; AVX512BW-NEXT: vpblendmw %ymm13, %ymm12, %ymm2 {%k4} 13582; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 13583; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u] 13584; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u] 13585; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 13586; AVX512BW-NEXT: vpblendmw %ymm17, %ymm0, %ymm3 {%k1} 13587; AVX512BW-NEXT: kmovq %k1, %k7 13588; AVX512BW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1] 13589; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm14[1],ymm3[2,3,4],ymm14[5],ymm3[6,7,8],ymm14[9],ymm3[10,11,12],ymm14[13],ymm3[14,15] 13590; AVX512BW-NEXT: vpshufb {{.*#+}} ymm20 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] 13591; AVX512BW-NEXT: movl $8176, %eax # imm = 0x1FF0 13592; AVX512BW-NEXT: kmovd %eax, %k1 13593; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm20 {%k1} 13594; AVX512BW-NEXT: vpblendmw %ymm5, %ymm15, %ymm2 {%k6} 13595; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 13596; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13] 13597; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero 13598; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 13599; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 13600; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm19 {%k2} 13601; AVX512BW-NEXT: vpblendmw %ymm5, %ymm15, %ymm2 {%k3} 13602; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero 13603; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2 13604; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14] 13605; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 13606; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 13607; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm20 {%k2} 13608; AVX512BW-NEXT: vpblendmw %ymm6, %ymm11, %ymm2 {%k6} 13609; AVX512BW-NEXT: vpblendmw %ymm10, %ymm1, %ymm21 {%k6} 13610; AVX512BW-NEXT: vmovdqu16 %ymm17, %ymm0 {%k4} 13611; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] 13612; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15] 13613; AVX512BW-NEXT: vmovdqu16 %ymm13, %ymm12 {%k6} 13614; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm12[u,u,u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u] 13615; AVX512BW-NEXT: vextracti128 $1, %ymm12, %xmm12 13616; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u],zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u] 13617; AVX512BW-NEXT: vpor %xmm0, 
%xmm12, %xmm0 13618; AVX512BW-NEXT: movl $4186112, %eax # imm = 0x3FE000 13619; AVX512BW-NEXT: kmovd %eax, %k1 13620; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] 13621; AVX512BW-NEXT: vmovdqu16 %ymm15, %ymm5 {%k7} 13622; AVX512BW-NEXT: vpblendmw %ymm10, %ymm1, %ymm12 {%k4} 13623; AVX512BW-NEXT: vpblendmw %ymm6, %ymm11, %ymm3 {%k3} 13624; AVX512BW-NEXT: vmovdqu16 %ymm10, %ymm1 {%k3} 13625; AVX512BW-NEXT: vmovdqu16 %ymm11, %ymm6 {%k4} 13626; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u] 13627; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2 13628; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u] 13629; AVX512BW-NEXT: vpor %xmm2, %xmm10, %xmm2 13630; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 13631; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[6,13] 13632; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero 13633; AVX512BW-NEXT: vpor %xmm10, %xmm11, %xmm10 13634; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 13635; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload 13636; AVX512BW-NEXT: vmovdqu8 %ymm10, %ymm2 {%k1} 13637; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[u,u,u,u,u] 13638; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm3 13639; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11,u,u,u,u,u] 13640; AVX512BW-NEXT: vpor %xmm3, %xmm10, %xmm3 13641; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 13642; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero 13643; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14] 13644; AVX512BW-NEXT: vpor %xmm10, %xmm11, %xmm10 13645; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 13646; AVX512BW-NEXT: vmovdqu8 %ymm10, %ymm3 {%k1} 13647; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm10 13648; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[5,12,u,u,u,u,u] 13649; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u] 13650; AVX512BW-NEXT: vpor %xmm6, %xmm10, %xmm6 13651; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 13652; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero 13653; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[1,8,15] 13654; AVX512BW-NEXT: vpor %xmm7, %xmm8, %xmm7 13655; AVX512BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 13656; AVX512BW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k1} 13657; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [16,9,2,19,20,13,6,23,24,0,26,27,28,0,30,31] 13658; AVX512BW-NEXT: vpermw %zmm25, %zmm7, %zmm7 13659; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [16,9,2,19,12,5,22,23,24,0,26,27,0,29,30,31] 13660; AVX512BW-NEXT: vpermw %zmm25, %zmm8, %zmm8 13661; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm10 = [8,1,2,19,12,5,22,15,0,9,26,11,0,29,14,0] 13662; AVX512BW-NEXT: vpermw %zmm25, %zmm10, %zmm10 13663; AVX512BW-NEXT: vextracti128 $1, %ymm12, %xmm11 13664; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u] 13665; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,11],zero,zero,xmm12[0,7,14],zero,zero,xmm12[u,u,u,u,u,u,u] 13666; AVX512BW-NEXT: vpor %xmm11, %xmm12, %xmm11 13667; 
AVX512BW-NEXT: vpshufb {{.*#+}} ymm11 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 13668; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] 13669; AVX512BW-NEXT: vpshufb %xmm10, %xmm4, %xmm12 13670; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm26[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 13671; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] 13672; AVX512BW-NEXT: vinserti64x4 $1, %ymm12, %zmm2, %zmm2 13673; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm11 {%k5} 13674; AVX512BW-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm2 13675; AVX512BW-NEXT: movw $-512, %ax # imm = 0xFE00 13676; AVX512BW-NEXT: vextracti32x4 $1, %ymm21, %xmm12 13677; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u] 13678; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm21[5,12],zero,zero,xmm21[1,8,15],zero,zero,xmm21[u,u,u,u,u,u,u] 13679; AVX512BW-NEXT: vpor %xmm12, %xmm13, %xmm12 13680; AVX512BW-NEXT: vpshufb {{.*#+}} ymm12 {%k5} = ymm8[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 13681; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm26[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 13682; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm4[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 13683; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3] 13684; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm3, %zmm3 13685; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm12 {%k5} 13686; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u] 13687; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1 13688; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u] 13689; AVX512BW-NEXT: vpor %xmm3, %xmm1, %xmm1 13690; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 {%k5} = ymm7[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 13691; AVX512BW-NEXT: vpshufb %xmm10, %xmm26, %xmm3 13692; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 13693; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] 13694; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm3 13695; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm1 {%k5} 13696; AVX512BW-NEXT: kmovd %eax, %k1 13697; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm11 {%k1} 13698; AVX512BW-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm2 13699; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm12 {%k1} 13700; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm2 13701; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15] 13702; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u,u,0,7,14],zero,zero,xmm5[3,10],zero,zero,zero 13703; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2 13704; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 13705; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7],ymm0[8,9,10],ymm2[11,12,13,14,15] 13706; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] 13707; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 13708; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} 13709; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 13710; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rdi 13711; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rsi) 13712; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rdx) 13713; AVX512BW-NEXT: vmovdqa64 %zmm22, (%rcx) 13714; AVX512BW-NEXT: vmovdqa64 %zmm18, (%r8) 13715; AVX512BW-NEXT: vmovdqa64 
%zmm11, (%r9) 13716; AVX512BW-NEXT: vmovdqa64 %zmm12, (%rdi) 13717; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax) 13718; AVX512BW-NEXT: vzeroupper 13719; AVX512BW-NEXT: retq 13720; 13721; AVX512BW-FCP-LABEL: load_i8_stride7_vf64: 13722; AVX512BW-FCP: # %bb.0: 13723; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0 13724; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 13725; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [24,17,2,19,28,21,6,31,16,9,26,27,20,13,30,23] 13726; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm24 13727; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7] 13728; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm13 13729; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23] 13730; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm25 13731; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15] 13732; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm12 13733; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,11,4,5,14,7,8,9,26,19,12,29,22,15] 13734; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm16 13735; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31] 13736; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm8 13737; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,10,3,4,5,14,7,8,25,18,11,12,29,22,15] 13738; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm4 13739; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31] 13740; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm5 13741; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm10 13742; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 13743; AVX512BW-FCP-NEXT: movw $-28382, %ax # imm = 0x9122 13744; AVX512BW-FCP-NEXT: kmovd %eax, %k1 13745; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm1 {%k1} 13746; AVX512BW-FCP-NEXT: kmovq %k1, %k2 13747; AVX512BW-FCP-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 13748; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6 13749; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u,u,u] 13750; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u] 13751; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm1, %xmm1 13752; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] 13753; AVX512BW-FCP-NEXT: movw $992, %ax # imm = 0x3E0 13754; AVX512BW-FCP-NEXT: kmovd %eax, %k1 13755; AVX512BW-FCP-NEXT: vmovdqu16 %ymm5, %ymm1 {%k1} 13756; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm11 13757; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm9 13758; AVX512BW-FCP-NEXT: movw $8772, %ax # imm = 0x2244 13759; AVX512BW-FCP-NEXT: kmovd %eax, %k1 13760; AVX512BW-FCP-NEXT: vpblendmw %ymm11, %ymm9, %ymm5 {%k1} 13761; AVX512BW-FCP-NEXT: kmovq %k1, %k3 13762; AVX512BW-FCP-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 13763; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 13764; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,xmm6[3,10],zero,zero,zero,xmm6[6,13,u,u,u,u] 13765; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,5,12],zero,zero,xmm5[1,8,15],zero,zero,xmm5[u,u,u,u] 13766; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 13767; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 13768; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,1,2,4,6] 13769; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm17 13770; AVX512BW-FCP-NEXT: vpermd %ymm17, %ymm6, %ymm6 13771; AVX512BW-FCP-NEXT: vpshufb 
{{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] 13772; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] 13773; AVX512BW-FCP-NEXT: vmovdqa64 240(%rdi), %xmm19 13774; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm19[5,12,u,u,u,u,u,u,u,u,u,u,u] 13775; AVX512BW-FCP-NEXT: vmovdqa64 224(%rdi), %xmm20 13776; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm20[0,7,14],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] 13777; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 13778; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm5 13779; AVX512BW-FCP-NEXT: movabsq $137438429184, %rax # imm = 0x1FFFF80000 13780; AVX512BW-FCP-NEXT: kmovq %rax, %k5 13781; AVX512BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm1 {%k5} 13782; AVX512BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm6 13783; AVX512BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm5 13784; AVX512BW-FCP-NEXT: movw $9288, %ax # imm = 0x2448 13785; AVX512BW-FCP-NEXT: kmovd %eax, %k6 13786; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm7 {%k6} 13787; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm7[u,u,u,u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[u,u] 13788; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 13789; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11,u,u] 13790; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm7, %xmm21 13791; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] 13792; AVX512BW-FCP-NEXT: movw $3968, %ax # imm = 0xF80 13793; AVX512BW-FCP-NEXT: kmovd %eax, %k7 13794; AVX512BW-FCP-NEXT: vmovdqu16 %ymm4, %ymm21 {%k7} 13795; AVX512BW-FCP-NEXT: vmovdqa 416(%rdi), %ymm7 13796; AVX512BW-FCP-NEXT: vmovdqa 384(%rdi), %ymm4 13797; AVX512BW-FCP-NEXT: movw $4644, %ax # imm = 0x1224 13798; AVX512BW-FCP-NEXT: kmovd %eax, %k4 13799; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm4, %ymm18 {%k4} 13800; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm22 13801; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9] 13802; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm18[4,11],zero,zero 13803; AVX512BW-FCP-NEXT: vporq %xmm22, %xmm18, %xmm18 13804; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm22 13805; AVX512BW-FCP-NEXT: movl $-8388608, %eax # imm = 0xFF800000 13806; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm18 {%k4} 13807; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm23 13808; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u] 13809; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[1,8,15],zero,zero,xmm18[4,11],zero,zero,xmm18[u,u,u,u,u,u,u] 13810; AVX512BW-FCP-NEXT: vporq %xmm23, %xmm18, %xmm18 13811; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] 13812; AVX512BW-FCP-NEXT: movl $511, %r10d # imm = 0x1FF 13813; AVX512BW-FCP-NEXT: kmovd %r10d, %k1 13814; AVX512BW-FCP-NEXT: vmovdqu8 %ymm18, %ymm8 {%k1} 13815; AVX512BW-FCP-NEXT: vpblendmw %ymm11, %ymm9, %ymm18 {%k6} 13816; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm18[u,u,u,6,13],zero,zero,xmm18[2,9],zero,zero,zero,xmm18[u,u,u,u] 13817; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm18 13818; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u],zero,zero,xmm18[4,11],zero,zero,xmm18[0,7,14,u,u,u,u] 13819; AVX512BW-FCP-NEXT: vporq %xmm23, %xmm18, %xmm18 13820; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, 
%ymm14 13821; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [0,0,0,0,1,3,4,6] 13822; AVX512BW-FCP-NEXT: vpermd %ymm17, %ymm18, %ymm18 13823; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] 13824; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7] 13825; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm19[6,13,u,u,u,u,u,u,u,u,u,u,u] 13826; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm20[1,8,15],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] 13827; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm18, %xmm15 13828; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm14, %zmm14 13829; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm8 {%k5} 13830; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm14 {%k3} 13831; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u,u,u,u,u] 13832; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 13833; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[0,7,14],zero,zero,xmm14[3,10,u,u,u,u,u,u,u] 13834; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 13835; AVX512BW-FCP-NEXT: movl $261632, %r10d # imm = 0x3FE00 13836; AVX512BW-FCP-NEXT: kmovd %r10d, %k5 13837; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 13838; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm12 {%k2} 13839; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm15 13840; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[5,12],zero,zero,xmm15[1,8,15,u,u,u,u] 13841; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u] 13842; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm12, %xmm12 13843; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 13844; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,0,0,0,1,3,5,6] 13845; AVX512BW-FCP-NEXT: vpermd %ymm17, %ymm15, %ymm15 13846; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] 13847; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm15[7] 13848; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] 13849; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,xmm19[0,7,14,u,u,u,u,u,u,u,u,u,u,u] 13850; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm17, %xmm15 13851; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm12, %zmm12 13852; AVX512BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm12 {%k1} 13853; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm14 {%k6} 13854; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u] 13855; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 13856; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u] 13857; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 13858; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm13[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 13859; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm13 {%k4} 13860; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15 13861; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u] 13862; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u] 13863; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm13, %xmm13 13864; AVX512BW-FCP-NEXT: 
vinserti128 $1, %xmm13, %ymm0, %ymm13 13865; AVX512BW-FCP-NEXT: vmovdqa64 208(%rdi), %xmm17 13866; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm17[5,12] 13867; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %xmm18 13868; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm18[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero 13869; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm23, %xmm15 13870; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 13871; AVX512BW-FCP-NEXT: movl $-134217728, %r10d # imm = 0xF8000000 13872; AVX512BW-FCP-NEXT: kmovd %r10d, %k2 13873; AVX512BW-FCP-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 13874; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm13 {%k2} 13875; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm20[3,10],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] 13876; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm19[1,8,15,u,u,u,u,u,u,u,u,u,u,u] 13877; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm19, %xmm15 13878; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm13, %zmm13 13879; AVX512BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm13 {%k1} 13880; AVX512BW-FCP-NEXT: kmovd %eax, %k3 13881; AVX512BW-FCP-NEXT: vmovdqu8 %ymm22, %ymm21 {%k3} 13882; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm14 13883; AVX512BW-FCP-NEXT: movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000 13884; AVX512BW-FCP-NEXT: kmovq %rax, %k2 13885; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm1 {%k2} 13886; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm14 {%k4} 13887; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 13888; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u] 13889; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u] 13890; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 13891; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] 13892; AVX512BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7} 13893; AVX512BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 13894; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm4, %ymm15 {%k1} 13895; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm15[5,12],zero,zero 13896; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 13897; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[0,7,14],zero,zero,xmm15[3,10] 13898; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 13899; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 13900; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k3} 13901; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 13902; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm8 {%k2} 13903; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm14 {%k1} 13904; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 13905; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u] 13906; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u] 13907; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 13908; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm25[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] 13909; AVX512BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7} 13910; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm4, %ymm15 {%k6} 13911; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm15[6,13],zero,zero 13912; 
AVX512BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 13913; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[1,8,15],zero,zero,xmm15[4,11] 13914; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 13915; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 13916; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k3} 13917; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 13918; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm12 {%k2} 13919; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm14 {%k6} 13920; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,6,13],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[u,u] 13921; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 13922; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u],zero,zero,xmm14[4,11],zero,zero,xmm14[0,7,14,u,u] 13923; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 13924; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm24[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] 13925; AVX512BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7} 13926; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm7, %ymm15 {%k4} 13927; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm16 13928; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[2,9],zero,zero,zero,xmm16[5,12] 13929; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u,4,11],zero,zero,xmm15[0,7,14],zero,zero 13930; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 13931; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 13932; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k3} 13933; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 13934; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm13 {%k2} 13935; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31] 13936; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm14, %zmm15 13937; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31] 13938; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm14, %zmm16 13939; AVX512BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 13940; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm14 {%k2} 13941; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm14, %xmm19 13942; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[5,12],zero,zero,xmm19[1,8,15,u,u] 13943; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u] 13944; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm14, %xmm14 13945; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] 13946; AVX512BW-FCP-NEXT: vmovdqu16 %ymm16, %ymm14 {%k7} 13947; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm16 {%k4} 13948; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm19 13949; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9,u,u,u] 13950; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,1,8,15],zero,zero,xmm16[4,11],zero,zero,xmm16[u,u,u] 13951; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16 13952; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] 13953; AVX512BW-FCP-NEXT: movl $8176, %eax # imm = 0x1FF0 13954; AVX512BW-FCP-NEXT: kmovd %eax, %k1 13955; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k1} 13956; AVX512BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 13957; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm7, %ymm16 {%k1} 13958; 
AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm19 13959; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u],zero,zero,xmm19[3,10],zero,zero,zero,xmm19[6,13] 13960; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,5,12],zero,zero,xmm16[1,8,15],zero,zero 13961; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16 13962; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 13963; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm14 {%k3} 13964; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm7, %ymm16 {%k6} 13965; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm16[u,u,u,u,u,u,u,6,13],zero,zero,xmm16[2,9],zero,zero,zero 13966; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm16 13967; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[4,11],zero,zero,xmm16[0,7,14] 13968; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16 13969; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 13970; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k3} 13971; AVX512BW-FCP-NEXT: vpblendmw %ymm10, %ymm3, %ymm19 {%k4} 13972; AVX512BW-FCP-NEXT: vpblendmw %ymm10, %ymm3, %ymm16 {%k1} 13973; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm20 {%k6} 13974; AVX512BW-FCP-NEXT: vmovdqu16 %ymm10, %ymm3 {%k6} 13975; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm10 {%k1} 13976; AVX512BW-FCP-NEXT: vmovdqu16 %ymm11, %ymm9 {%k4} 13977; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,u,2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u,u,u] 13978; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10 13979; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[0,7,14],zero,zero,xmm10[3,10,u,u,u,u,u] 13980; AVX512BW-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10 13981; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 13982; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm17[6,13] 13983; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm18[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero 13984; AVX512BW-FCP-NEXT: vporq %xmm11, %xmm21, %xmm11 13985; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 13986; AVX512BW-FCP-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload 13987; AVX512BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm10 {%k3} 13988; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm20[u,u,3,10],zero,zero,zero,xmm20[6,13],zero,zero,xmm20[u,u,u,u,u] 13989; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm20 13990; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,xmm20[1,8,15],zero,zero,xmm20[4,11,u,u,u,u,u] 13991; AVX512BW-FCP-NEXT: vporq %xmm11, %xmm20, %xmm11 13992; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 13993; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm18[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero 13994; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[0,7,14] 13995; AVX512BW-FCP-NEXT: vporq %xmm20, %xmm21, %xmm20 13996; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 13997; AVX512BW-FCP-NEXT: vmovdqu8 %ymm20, %ymm11 {%k3} 13998; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm9, %xmm20 13999; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,xmm20[2,9],zero,zero,zero,xmm20[5,12,u,u,u,u,u] 14000; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,4,11],zero,zero,xmm9[0,7,14],zero,zero,xmm9[u,u,u,u,u] 14001; AVX512BW-FCP-NEXT: vporq %xmm20, %xmm9, %xmm9 14002; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 14003; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero 14004; 
AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15] 14005; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm17, %xmm17 14006; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17 14007; AVX512BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm9 {%k3} 14008; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm17 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] 14009; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm17, %zmm17 14010; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm18 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] 14011; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm18, %zmm18 14012; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm20 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] 14013; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm20, %zmm20 14014; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm19, %xmm2 14015; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u] 14016; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[4,11],zero,zero,xmm19[0,7,14],zero,zero,xmm19[u,u,u,u,u,u,u] 14017; AVX512BW-FCP-NEXT: vporq %xmm2, %xmm19, %xmm2 14018; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm20[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 14019; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [1,2,4,6,0,0,0,0] 14020; AVX512BW-FCP-NEXT: vmovdqa64 224(%rdi), %ymm20 14021; AVX512BW-FCP-NEXT: vpermd %ymm20, %ymm19, %ymm19 14022; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u] 14023; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm19, %zmm10, %zmm10 14024; AVX512BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm2 {%k5} 14025; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm10 14026; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[3,10],zero,zero,zero,xmm10[6,13,u,u,u,u,u,u,u] 14027; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[5,12],zero,zero,xmm16[1,8,15],zero,zero,xmm16[u,u,u,u,u,u,u] 14028; AVX512BW-FCP-NEXT: vporq %xmm10, %xmm16, %xmm10 14029; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 14030; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,3,4,6,0,0,0,0] 14031; AVX512BW-FCP-NEXT: vpermd %ymm20, %ymm16, %ymm16 14032; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u] 14033; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm16, %zmm11, %zmm11 14034; AVX512BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm10 {%k5} 14035; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u] 14036; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 14037; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u,u,u,u] 14038; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm11, %xmm3 14039; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 {%k5} = ymm17[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 14040; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,3,5,6,0,0,0,0] 14041; AVX512BW-FCP-NEXT: vpermd %ymm20, %ymm11, %ymm11 14042; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u] 14043; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm9, %zmm9 14044; AVX512BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm3 {%k5} 14045; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15] 14046; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm9, %zmm0 14047; AVX512BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm5 {%k1} 14048; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,2,9],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[u,u,u] 14049; 
AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 14050; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,10,u,u,u] 14051; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 14052; AVX512BW-FCP-NEXT: movl $4186112, %eax # imm = 0x3FE000 14053; AVX512BW-FCP-NEXT: kmovd %eax, %k1 14054; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 {%k1} = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] 14055; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm0 14056; AVX512BW-FCP-NEXT: movw $-512, %ax # imm = 0xFE00 14057; AVX512BW-FCP-NEXT: vmovdqu16 %ymm7, %ymm4 {%k2} 14058; AVX512BW-FCP-NEXT: kmovd %eax, %k1 14059; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} 14060; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm0 14061; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm10 {%k1} 14062; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm0 14063; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15] 14064; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,0,7,14],zero,zero,xmm4[3,10],zero,zero,zero 14065; AVX512BW-FCP-NEXT: vpor %xmm0, %xmm4, %xmm0 14066; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 14067; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7],ymm5[8,9,10],ymm0[11,12,13,14,15] 14068; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] 14069; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 14070; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} 14071; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 14072; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdi 14073; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, (%rsi) 14074; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rdx) 14075; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, (%rcx) 14076; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, (%r8) 14077; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%r9) 14078; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%rdi) 14079; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) 14080; AVX512BW-FCP-NEXT: vzeroupper 14081; AVX512BW-FCP-NEXT: retq 14082; 14083; AVX512DQ-BW-LABEL: load_i8_stride7_vf64: 14084; AVX512DQ-BW: # %bb.0: 14085; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm25 14086; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [8,1,18,11,4,5,22,15,0,25,10,0,12,29,14,0] 14087; AVX512DQ-BW-NEXT: vpermw %zmm25, %zmm0, %zmm18 14088; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,1,18,11,4,21,14,7,8,25,10,0,28,13,0,15] 14089; AVX512DQ-BW-NEXT: vpermw %zmm25, %zmm0, %zmm24 14090; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,17,10,3,4,21,14,7,24,9,0,11,28,13,0,31] 14091; AVX512DQ-BW-NEXT: vpermw %zmm25, %zmm0, %zmm10 14092; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [16,17,10,3,20,13,6,23,24,25,0,27,28,0,30,31] 14093; AVX512DQ-BW-NEXT: vpermw %zmm25, %zmm0, %zmm0 14094; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm9 14095; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm1 14096; AVX512DQ-BW-NEXT: movw $-28382, %ax # imm = 0x9122 14097; AVX512DQ-BW-NEXT: kmovd %eax, %k1 14098; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm9, %ymm3 {%k1} 14099; AVX512DQ-BW-NEXT: kmovq %k1, %k2 14100; AVX512DQ-BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 14101; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm4 14102; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u,u,u,u,u] 14103; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u,u,u,u,u] 14104; AVX512DQ-BW-NEXT: vporq %xmm4, %xmm3, %xmm16 14105; AVX512DQ-BW-NEXT: vpshufb 
{{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] 14106; AVX512DQ-BW-NEXT: movw $992, %ax # imm = 0x3E0 14107; AVX512DQ-BW-NEXT: kmovd %eax, %k1 14108; AVX512DQ-BW-NEXT: vmovdqu16 %ymm0, %ymm16 {%k1} 14109; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm11 14110; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %ymm6 14111; AVX512DQ-BW-NEXT: movw $8772, %ax # imm = 0x2244 14112; AVX512DQ-BW-NEXT: kmovd %eax, %k6 14113; AVX512DQ-BW-NEXT: vpblendmw %ymm11, %ymm6, %ymm0 {%k6} 14114; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm3 14115; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u] 14116; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u] 14117; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm0, %xmm0 14118; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 14119; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %xmm7 14120; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm21 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] 14121; AVX512DQ-BW-NEXT: vpshufb %xmm21, %xmm7, %xmm3 14122; AVX512DQ-BW-NEXT: vmovdqa 208(%rdi), %xmm8 14123; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] 14124; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] 14125; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 14126; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] 14127; AVX512DQ-BW-NEXT: vmovdqa64 240(%rdi), %xmm26 14128; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm26[5,12,u,u,u,u,u,u,u,u,u,u,u] 14129; AVX512DQ-BW-NEXT: vmovdqa 224(%rdi), %xmm4 14130; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[0,7,14],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] 14131; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm12, %xmm5 14132; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm0 14133; AVX512DQ-BW-NEXT: movabsq $137438429184, %rax # imm = 0x1FFFF80000 14134; AVX512DQ-BW-NEXT: kmovq %rax, %k5 14135; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm16 {%k5} 14136; AVX512DQ-BW-NEXT: vmovdqa 288(%rdi), %ymm13 14137; AVX512DQ-BW-NEXT: vmovdqa 256(%rdi), %ymm12 14138; AVX512DQ-BW-NEXT: movw $9288, %ax # imm = 0x2448 14139; AVX512DQ-BW-NEXT: kmovd %eax, %k3 14140; AVX512DQ-BW-NEXT: vpblendmw %ymm13, %ymm12, %ymm0 {%k3} 14141; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u] 14142; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm0 14143; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u] 14144; AVX512DQ-BW-NEXT: vporq %xmm5, %xmm0, %xmm19 14145; AVX512DQ-BW-NEXT: vmovdqa64 352(%rdi), %ymm17 14146; AVX512DQ-BW-NEXT: vmovdqa 320(%rdi), %ymm0 14147; AVX512DQ-BW-NEXT: vpblendmw %ymm17, %ymm0, %ymm5 {%k6} 14148; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm15 = ymm5[2,3,0,1] 14149; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm15[2],ymm5[3,4,5],ymm15[6],ymm5[7,8,9],ymm15[10],ymm5[11,12,13],ymm15[14],ymm5[15] 14150; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] 14151; AVX512DQ-BW-NEXT: movw $3968, %ax # imm = 0xF80 14152; AVX512DQ-BW-NEXT: kmovd %eax, %k7 14153; AVX512DQ-BW-NEXT: vmovdqu16 %ymm5, %ymm19 {%k7} 14154; AVX512DQ-BW-NEXT: vmovdqa 416(%rdi), %ymm15 14155; AVX512DQ-BW-NEXT: vmovdqa 384(%rdi), %ymm5 14156; AVX512DQ-BW-NEXT: movw $4644, %ax # imm = 0x1224 14157; AVX512DQ-BW-NEXT: kmovd %eax, %k4 14158; AVX512DQ-BW-NEXT: vpblendmw 
%ymm15, %ymm5, %ymm20 {%k4} 14159; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm20, %xmm22 14160; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9] 14161; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero 14162; AVX512DQ-BW-NEXT: vporq %xmm22, %xmm20, %xmm20 14163; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 14164; AVX512DQ-BW-NEXT: movl $-8388608, %eax # imm = 0xFF800000 14165; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm9, %ymm22 {%k4} 14166; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm22, %xmm23 14167; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u] 14168; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[1,8,15],zero,zero,xmm22[4,11],zero,zero,xmm22[u,u,u,u,u,u,u] 14169; AVX512DQ-BW-NEXT: vporq %xmm23, %xmm22, %xmm22 14170; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] 14171; AVX512DQ-BW-NEXT: movl $511, %edi # imm = 0x1FF 14172; AVX512DQ-BW-NEXT: kmovd %edi, %k1 14173; AVX512DQ-BW-NEXT: vmovdqu8 %ymm22, %ymm10 {%k1} 14174; AVX512DQ-BW-NEXT: vpblendmw %ymm11, %ymm6, %ymm22 {%k3} 14175; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm23 = xmm22[u,u,u,6,13],zero,zero,xmm22[2,9],zero,zero,zero,xmm22[u,u,u,u] 14176; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm22, %xmm22 14177; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u],zero,zero,xmm22[4,11],zero,zero,xmm22[0,7,14,u,u,u,u] 14178; AVX512DQ-BW-NEXT: vporq %xmm23, %xmm22, %xmm22 14179; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm14 14180; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm8[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] 14181; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm23 = xmm7[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] 14182; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm22 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3] 14183; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm2 14184; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,6],ymm2[7] 14185; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm26[6,13,u,u,u,u,u,u,u,u,u,u,u] 14186; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm4[1,8,15],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] 14187; AVX512DQ-BW-NEXT: vporq %xmm14, %xmm22, %xmm14 14188; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm14, %zmm2, %zmm2 14189; AVX512DQ-BW-NEXT: vmovdqu8 %zmm2, %zmm10 {%k5} 14190; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm9, %ymm2 {%k6} 14191; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u] 14192; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm2 14193; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u,u,u] 14194; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm14, %xmm2 14195; AVX512DQ-BW-NEXT: movl $261632, %edi # imm = 0x3FE00 14196; AVX512DQ-BW-NEXT: kmovd %edi, %k5 14197; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm24[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 14198; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm11, %ymm14 {%k2} 14199; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm14, %xmm22 14200; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u],zero,zero,zero,xmm22[5,12],zero,zero,xmm22[1,8,15,u,u,u,u] 14201; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u] 14202; AVX512DQ-BW-NEXT: vporq %xmm22, %xmm14, %xmm14 14203; AVX512DQ-BW-NEXT: vinserti128 $1, 
%xmm14, %ymm0, %ymm14 14204; AVX512DQ-BW-NEXT: vpshufb %xmm21, %xmm8, %xmm21 14205; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm7[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] 14206; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm21 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3] 14207; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm3 14208; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5,6],ymm3[7] 14209; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[2,9],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] 14210; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm21 = zero,zero,xmm26[0,7,14,u,u,u,u,u,u,u,u,u,u,u] 14211; AVX512DQ-BW-NEXT: vporq %xmm14, %xmm21, %xmm14 14212; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm14, %zmm3, %zmm23 14213; AVX512DQ-BW-NEXT: vmovdqu16 %zmm2, %zmm23 {%k1} 14214; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm9, %ymm2 {%k3} 14215; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u] 14216; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm2 14217; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u] 14218; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 14219; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 14220; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm11, %ymm3 {%k4} 14221; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm3, %xmm18 14222; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u],zero,zero,zero,xmm18[6,13],zero,zero,xmm18[2,9,u,u,u,u,u] 14223; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u,u,u] 14224; AVX512DQ-BW-NEXT: vporq %xmm18, %xmm3, %xmm3 14225; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 14226; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[5,12] 14227; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm21 = xmm7[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero 14228; AVX512DQ-BW-NEXT: vporq %xmm18, %xmm21, %xmm18 14229; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 14230; AVX512DQ-BW-NEXT: movl $-134217728, %edi # imm = 0xF8000000 14231; AVX512DQ-BW-NEXT: kmovd %edi, %k2 14232; AVX512DQ-BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 14233; AVX512DQ-BW-NEXT: vmovdqu8 %ymm18, %ymm3 {%k2} 14234; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm4[3,10],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] 14235; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm21 = zero,zero,xmm26[1,8,15,u,u,u,u,u,u,u,u,u,u,u] 14236; AVX512DQ-BW-NEXT: vporq %xmm18, %xmm21, %xmm18 14237; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm18, %zmm3, %zmm18 14238; AVX512DQ-BW-NEXT: vmovdqu16 %zmm2, %zmm18 {%k1} 14239; AVX512DQ-BW-NEXT: kmovd %eax, %k2 14240; AVX512DQ-BW-NEXT: vmovdqu8 %ymm20, %ymm19 {%k2} 14241; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm2 14242; AVX512DQ-BW-NEXT: movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000 14243; AVX512DQ-BW-NEXT: kmovq %rax, %k1 14244; AVX512DQ-BW-NEXT: vmovdqu8 %zmm2, %zmm16 {%k1} 14245; AVX512DQ-BW-NEXT: vpblendmw %ymm12, %ymm13, %ymm2 {%k4} 14246; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3 14247; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u] 14248; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u] 14249; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 14250; AVX512DQ-BW-NEXT: vpblendmw %ymm17, %ymm0, %ymm3 {%k3} 14251; AVX512DQ-BW-NEXT: vpermq {{.*#+}} 
ymm14 = ymm3[2,3,0,1] 14252; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7,8,9,10],ymm14[11],ymm3[12,13],ymm14[14],ymm3[15] 14253; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] 14254; AVX512DQ-BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k7} 14255; AVX512DQ-BW-NEXT: vpblendmw %ymm15, %ymm5, %ymm3 {%k6} 14256; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero 14257; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm3 14258; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10] 14259; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm14, %xmm3 14260; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 14261; AVX512DQ-BW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k2} 14262; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 14263; AVX512DQ-BW-NEXT: vmovdqu8 %zmm2, %zmm10 {%k1} 14264; AVX512DQ-BW-NEXT: vpblendmw %ymm12, %ymm13, %ymm2 {%k6} 14265; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3 14266; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u] 14267; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero,xmm2[u,u] 14268; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 14269; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm17, %ymm3 {%k4} 14270; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1] 14271; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm3[1,2],ymm14[3],ymm3[4,5,6],ymm14[7,8],ymm3[9,10],ymm14[11],ymm3[12,13,14],ymm14[15] 14272; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] 14273; AVX512DQ-BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k7} 14274; AVX512DQ-BW-NEXT: vpblendmw %ymm15, %ymm5, %ymm3 {%k3} 14275; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero 14276; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm3 14277; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11] 14278; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm14, %xmm3 14279; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 14280; AVX512DQ-BW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k2} 14281; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 14282; AVX512DQ-BW-NEXT: vmovdqu8 %zmm2, %zmm23 {%k1} 14283; AVX512DQ-BW-NEXT: vpblendmw %ymm12, %ymm13, %ymm2 {%k3} 14284; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u] 14285; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm2 14286; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u] 14287; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 14288; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm17, %ymm3 {%k6} 14289; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1] 14290; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm3[1,2,3],ymm14[4],ymm3[5,6],ymm14[7,8],ymm3[9,10,11],ymm14[12],ymm3[13,14],ymm14[15] 14291; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] 14292; AVX512DQ-BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k7} 14293; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm15, %ymm3 {%k4} 14294; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm14 14295; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12] 14296; AVX512DQ-BW-NEXT: 
vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero 14297; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm14, %xmm3 14298; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 14299; AVX512DQ-BW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k2} 14300; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 14301; AVX512DQ-BW-NEXT: vmovdqu8 %zmm2, %zmm18 {%k1} 14302; AVX512DQ-BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 14303; AVX512DQ-BW-NEXT: vpblendmw %ymm13, %ymm12, %ymm2 {%k1} 14304; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3 14305; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u] 14306; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u] 14307; AVX512DQ-BW-NEXT: vporq %xmm3, %xmm2, %xmm19 14308; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm17, %ymm2 {%k3} 14309; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] 14310; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14,15] 14311; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] 14312; AVX512DQ-BW-NEXT: vmovdqu16 %ymm2, %ymm19 {%k7} 14313; AVX512DQ-BW-NEXT: vpblendmw %ymm13, %ymm12, %ymm2 {%k4} 14314; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3 14315; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u] 14316; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u] 14317; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 14318; AVX512DQ-BW-NEXT: vpblendmw %ymm17, %ymm0, %ymm3 {%k1} 14319; AVX512DQ-BW-NEXT: kmovq %k1, %k7 14320; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1] 14321; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm14[1],ymm3[2,3,4],ymm14[5],ymm3[6,7,8],ymm14[9],ymm3[10,11,12],ymm14[13],ymm3[14,15] 14322; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm20 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] 14323; AVX512DQ-BW-NEXT: movl $8176, %eax # imm = 0x1FF0 14324; AVX512DQ-BW-NEXT: kmovd %eax, %k1 14325; AVX512DQ-BW-NEXT: vmovdqu8 %ymm2, %ymm20 {%k1} 14326; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm15, %ymm2 {%k6} 14327; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3 14328; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13] 14329; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero 14330; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 14331; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 14332; AVX512DQ-BW-NEXT: vmovdqu8 %ymm2, %ymm19 {%k2} 14333; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm15, %ymm2 {%k3} 14334; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero 14335; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm2 14336; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14] 14337; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 14338; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 14339; AVX512DQ-BW-NEXT: vmovdqu8 %ymm2, %ymm20 {%k2} 14340; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm11, %ymm22 {%k6} 14341; AVX512DQ-BW-NEXT: vpblendmw %ymm9, %ymm1, %ymm21 {%k6} 14342; AVX512DQ-BW-NEXT: vmovdqu16 %ymm17, %ymm0 {%k4} 14343; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] 14344; AVX512DQ-BW-NEXT: 
vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15] 14345; AVX512DQ-BW-NEXT: vmovdqu16 %ymm13, %ymm12 {%k6} 14346; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm12[u,u,u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u] 14347; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm12, %xmm3 14348; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u] 14349; AVX512DQ-BW-NEXT: vpor %xmm0, %xmm3, %xmm0 14350; AVX512DQ-BW-NEXT: movl $4186112, %eax # imm = 0x3FE000 14351; AVX512DQ-BW-NEXT: kmovd %eax, %k1 14352; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] 14353; AVX512DQ-BW-NEXT: vmovdqu16 %ymm15, %ymm5 {%k7} 14354; AVX512DQ-BW-NEXT: vpblendmw %ymm9, %ymm1, %ymm2 {%k4} 14355; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm11, %ymm3 {%k3} 14356; AVX512DQ-BW-NEXT: vmovdqu16 %ymm9, %ymm1 {%k3} 14357; AVX512DQ-BW-NEXT: vmovdqu16 %ymm11, %ymm6 {%k4} 14358; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm22[u,u,2,9],zero,zero,zero,xmm22[5,12],zero,zero,xmm22[u,u,u,u,u] 14359; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm22, %xmm11 14360; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[0,7,14],zero,zero,xmm11[3,10,u,u,u,u,u] 14361; AVX512DQ-BW-NEXT: vpor %xmm9, %xmm11, %xmm9 14362; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 14363; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[6,13] 14364; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero 14365; AVX512DQ-BW-NEXT: vpor %xmm11, %xmm12, %xmm11 14366; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 14367; AVX512DQ-BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload 14368; AVX512DQ-BW-NEXT: vmovdqu8 %ymm11, %ymm9 {%k1} 14369; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[u,u,u,u,u] 14370; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm3 14371; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11,u,u,u,u,u] 14372; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm11, %xmm3 14373; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 14374; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero 14375; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14] 14376; AVX512DQ-BW-NEXT: vpor %xmm11, %xmm12, %xmm11 14377; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 14378; AVX512DQ-BW-NEXT: vmovdqu8 %ymm11, %ymm3 {%k1} 14379; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm6, %xmm11 14380; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u] 14381; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u] 14382; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm11, %xmm6 14383; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 14384; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero 14385; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[1,8,15] 14386; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm8, %xmm7 14387; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 14388; AVX512DQ-BW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k1} 14389; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [16,9,2,19,20,13,6,23,24,0,26,27,28,0,30,31] 14390; AVX512DQ-BW-NEXT: vpermw %zmm25, %zmm7, 
%zmm7 14391; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [16,9,2,19,12,5,22,23,24,0,26,27,0,29,30,31] 14392; AVX512DQ-BW-NEXT: vpermw %zmm25, %zmm8, %zmm8 14393; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm11 = [8,1,2,19,12,5,22,15,0,9,26,11,0,29,14,0] 14394; AVX512DQ-BW-NEXT: vpermw %zmm25, %zmm11, %zmm11 14395; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm12 14396; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u] 14397; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u] 14398; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm12, %xmm2 14399; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm11[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 14400; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm11 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] 14401; AVX512DQ-BW-NEXT: vpshufb %xmm11, %xmm4, %xmm12 14402; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm26[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 14403; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] 14404; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm12, %zmm9, %zmm9 14405; AVX512DQ-BW-NEXT: vmovdqu16 %zmm9, %zmm2 {%k5} 14406; AVX512DQ-BW-NEXT: movw $-512, %ax # imm = 0xFE00 14407; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm21, %xmm9 14408; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u,u,u,u,u,u] 14409; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm21[5,12],zero,zero,xmm21[1,8,15],zero,zero,xmm21[u,u,u,u,u,u,u] 14410; AVX512DQ-BW-NEXT: vpor %xmm9, %xmm12, %xmm9 14411; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm9 {%k5} = ymm8[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 14412; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm26[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 14413; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 14414; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3] 14415; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm8, %zmm3, %zmm3 14416; AVX512DQ-BW-NEXT: vmovdqu16 %zmm3, %zmm9 {%k5} 14417; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u] 14418; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm1, %xmm1 14419; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u] 14420; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm1, %xmm1 14421; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 {%k5} = ymm7[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 14422; AVX512DQ-BW-NEXT: vpshufb %xmm11, %xmm26, %xmm3 14423; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 14424; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] 14425; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm3 14426; AVX512DQ-BW-NEXT: vmovdqu16 %zmm3, %zmm1 {%k5} 14427; AVX512DQ-BW-NEXT: kmovd %eax, %k1 14428; AVX512DQ-BW-NEXT: vinserti32x8 $1, %ymm19, %zmm0, %zmm2 {%k1} 14429; AVX512DQ-BW-NEXT: vinserti32x8 $1, %ymm20, %zmm0, %zmm9 {%k1} 14430; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm5, %xmm3 14431; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15] 14432; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,0,7,14],zero,zero,xmm5[3,10],zero,zero,zero 14433; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm4, %xmm3 14434; AVX512DQ-BW-NEXT: 
vinserti128 $1, %xmm3, %ymm0, %ymm3 14435; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15] 14436; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] 14437; AVX512DQ-BW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1} 14438; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 14439; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rdi 14440; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, (%rsi) 14441; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%rdx) 14442; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, (%rcx) 14443; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, (%r8) 14444; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%r9) 14445; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, (%rdi) 14446; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, (%rax) 14447; AVX512DQ-BW-NEXT: vzeroupper 14448; AVX512DQ-BW-NEXT: retq 14449; 14450; AVX512DQ-BW-FCP-LABEL: load_i8_stride7_vf64: 14451; AVX512DQ-BW-FCP: # %bb.0: 14452; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0 14453; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 14454; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [24,17,2,19,28,21,6,31,16,9,26,27,20,13,30,23] 14455; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm24 14456; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7] 14457; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm13 14458; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23] 14459; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm25 14460; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15] 14461; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm12 14462; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,11,4,5,14,7,8,9,26,19,12,29,22,15] 14463; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm16 14464; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31] 14465; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm8 14466; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,10,3,4,5,14,7,8,25,18,11,12,29,22,15] 14467; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm5 14468; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31] 14469; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm4 14470; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm10 14471; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 14472; AVX512DQ-BW-FCP-NEXT: movw $-28382, %ax # imm = 0x9122 14473; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 14474; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm1 {%k1} 14475; AVX512DQ-BW-FCP-NEXT: kmovq %k1, %k2 14476; AVX512DQ-BW-FCP-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 14477; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6 14478; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u,u,u] 14479; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u] 14480; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm1, %xmm1 14481; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] 14482; AVX512DQ-BW-FCP-NEXT: movw $992, %ax # imm = 0x3E0 14483; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 14484; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm4, %ymm1 {%k1} 14485; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm11 14486; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm9 14487; AVX512DQ-BW-FCP-NEXT: movw $8772, %ax # imm = 0x2244 14488; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 14489; AVX512DQ-BW-FCP-NEXT: vpblendmw 
%ymm11, %ymm9, %ymm4 {%k1} 14490; AVX512DQ-BW-FCP-NEXT: kmovq %k1, %k3 14491; AVX512DQ-BW-FCP-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 14492; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm6 14493; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,xmm6[3,10],zero,zero,zero,xmm6[6,13,u,u,u,u] 14494; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,5,12],zero,zero,xmm4[1,8,15],zero,zero,xmm4[u,u,u,u] 14495; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm4, %xmm4 14496; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 14497; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,1,2,4,6] 14498; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm17 14499; AVX512DQ-BW-FCP-NEXT: vpermd %ymm17, %ymm6, %ymm6 14500; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] 14501; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7] 14502; AVX512DQ-BW-FCP-NEXT: vmovdqa64 240(%rdi), %xmm19 14503; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm19[5,12,u,u,u,u,u,u,u,u,u,u,u] 14504; AVX512DQ-BW-FCP-NEXT: vmovdqa64 224(%rdi), %xmm20 14505; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm20[0,7,14],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] 14506; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 14507; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm4, %zmm4 14508; AVX512DQ-BW-FCP-NEXT: movabsq $137438429184, %rax # imm = 0x1FFFF80000 14509; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k5 14510; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm1 {%k5} 14511; AVX512DQ-BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm6 14512; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm4 14513; AVX512DQ-BW-FCP-NEXT: movw $9288, %ax # imm = 0x2448 14514; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k6 14515; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm4, %ymm7 {%k6} 14516; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm7[u,u,u,u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[u,u] 14517; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 14518; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11,u,u] 14519; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm7, %xmm21 14520; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] 14521; AVX512DQ-BW-FCP-NEXT: movw $3968, %ax # imm = 0xF80 14522; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k7 14523; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm5, %ymm21 {%k7} 14524; AVX512DQ-BW-FCP-NEXT: vmovdqa 416(%rdi), %ymm7 14525; AVX512DQ-BW-FCP-NEXT: vmovdqa 384(%rdi), %ymm5 14526; AVX512DQ-BW-FCP-NEXT: movw $4644, %ax # imm = 0x1224 14527; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k4 14528; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm5, %ymm18 {%k4} 14529; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm22 14530; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9] 14531; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm18[4,11],zero,zero 14532; AVX512DQ-BW-FCP-NEXT: vporq %xmm22, %xmm18, %xmm18 14533; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm22 14534; AVX512DQ-BW-FCP-NEXT: movl $-8388608, %eax # imm = 0xFF800000 14535; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm18 {%k4} 14536; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm23 14537; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u] 14538; 
AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[1,8,15],zero,zero,xmm18[4,11],zero,zero,xmm18[u,u,u,u,u,u,u] 14539; AVX512DQ-BW-FCP-NEXT: vporq %xmm23, %xmm18, %xmm18 14540; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] 14541; AVX512DQ-BW-FCP-NEXT: movl $511, %r10d # imm = 0x1FF 14542; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k1 14543; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm18, %ymm8 {%k1} 14544; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm11, %ymm9, %ymm18 {%k6} 14545; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm18[u,u,u,6,13],zero,zero,xmm18[2,9],zero,zero,zero,xmm18[u,u,u,u] 14546; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm18 14547; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u],zero,zero,xmm18[4,11],zero,zero,xmm18[0,7,14,u,u,u,u] 14548; AVX512DQ-BW-FCP-NEXT: vporq %xmm23, %xmm18, %xmm18 14549; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm14 14550; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [0,0,0,0,1,3,4,6] 14551; AVX512DQ-BW-FCP-NEXT: vpermd %ymm17, %ymm18, %ymm18 14552; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] 14553; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7] 14554; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm19[6,13,u,u,u,u,u,u,u,u,u,u,u] 14555; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm20[1,8,15],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] 14556; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm18, %xmm15 14557; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm14, %zmm14 14558; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm8 {%k5} 14559; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm14 {%k3} 14560; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u,u,u,u,u] 14561; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 14562; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[0,7,14],zero,zero,xmm14[3,10,u,u,u,u,u,u,u] 14563; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 14564; AVX512DQ-BW-FCP-NEXT: movl $261632, %r10d # imm = 0x3FE00 14565; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k5 14566; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 14567; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm12 {%k2} 14568; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm15 14569; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[5,12],zero,zero,xmm15[1,8,15,u,u,u,u] 14570; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u] 14571; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm12, %xmm12 14572; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 14573; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,0,0,0,1,3,5,6] 14574; AVX512DQ-BW-FCP-NEXT: vpermd %ymm17, %ymm15, %ymm15 14575; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] 14576; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm15[7] 14577; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] 14578; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,xmm19[0,7,14,u,u,u,u,u,u,u,u,u,u,u] 14579; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm17, %xmm15 14580; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm12, 
%zmm12 14581; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm12 {%k1} 14582; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm14 {%k6} 14583; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u] 14584; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 14585; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u] 14586; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 14587; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm13[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 14588; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm13 {%k4} 14589; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15 14590; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u] 14591; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u] 14592; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm13, %xmm13 14593; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 14594; AVX512DQ-BW-FCP-NEXT: vmovdqa64 208(%rdi), %xmm17 14595; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm17[5,12] 14596; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %xmm18 14597; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm18[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero 14598; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm23, %xmm15 14599; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 14600; AVX512DQ-BW-FCP-NEXT: movl $-134217728, %r10d # imm = 0xF8000000 14601; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k2 14602; AVX512DQ-BW-FCP-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 14603; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm13 {%k2} 14604; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm20[3,10],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] 14605; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm19[1,8,15,u,u,u,u,u,u,u,u,u,u,u] 14606; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm19, %xmm15 14607; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm13, %zmm13 14608; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm13 {%k1} 14609; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3 14610; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm22, %ymm21 {%k3} 14611; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm14 14612; AVX512DQ-BW-FCP-NEXT: movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000 14613; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k2 14614; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm1 {%k2} 14615; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm6, %ymm14 {%k4} 14616; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 14617; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u] 14618; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u] 14619; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 14620; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] 14621; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7} 14622; AVX512DQ-BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 14623; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm5, %ymm15 {%k1} 14624; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm15[5,12],zero,zero 14625; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 14626; 
AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[0,7,14],zero,zero,xmm15[3,10] 14627; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 14628; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 14629; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k3} 14630; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 14631; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm8 {%k2} 14632; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm6, %ymm14 {%k1} 14633; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 14634; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u] 14635; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u] 14636; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 14637; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm25[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] 14638; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7} 14639; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm5, %ymm15 {%k6} 14640; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm15[6,13],zero,zero 14641; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 14642; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[1,8,15],zero,zero,xmm15[4,11] 14643; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 14644; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 14645; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k3} 14646; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 14647; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm12 {%k2} 14648; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm6, %ymm14 {%k6} 14649; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,6,13],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[u,u] 14650; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 14651; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u],zero,zero,xmm14[4,11],zero,zero,xmm14[0,7,14,u,u] 14652; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 14653; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm24[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] 14654; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7} 14655; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm7, %ymm15 {%k4} 14656; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm16 14657; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[2,9],zero,zero,zero,xmm16[5,12] 14658; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u,4,11],zero,zero,xmm15[0,7,14],zero,zero 14659; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 14660; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 14661; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k3} 14662; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 14663; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm13 {%k2} 14664; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31] 14665; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm14, %zmm15 14666; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31] 14667; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm14, %zmm16 14668; AVX512DQ-BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 14669; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm4, %ymm14 {%k1} 14670; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm14, %xmm19 14671; 
AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[5,12],zero,zero,xmm19[1,8,15,u,u] 14672; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u] 14673; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm14, %xmm14 14674; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] 14675; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm16, %ymm14 {%k7} 14676; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm4, %ymm16 {%k4} 14677; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm19 14678; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9,u,u,u] 14679; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,1,8,15],zero,zero,xmm16[4,11],zero,zero,xmm16[u,u,u] 14680; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16 14681; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] 14682; AVX512DQ-BW-FCP-NEXT: movl $8176, %eax # imm = 0x1FF0 14683; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 14684; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k1} 14685; AVX512DQ-BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 14686; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm7, %ymm16 {%k1} 14687; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm19 14688; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u],zero,zero,xmm19[3,10],zero,zero,zero,xmm19[6,13] 14689; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,5,12],zero,zero,xmm16[1,8,15],zero,zero 14690; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16 14691; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 14692; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm14 {%k3} 14693; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm7, %ymm16 {%k6} 14694; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm16[u,u,u,u,u,u,u,6,13],zero,zero,xmm16[2,9],zero,zero,zero 14695; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm16 14696; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[4,11],zero,zero,xmm16[0,7,14] 14697; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16 14698; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 14699; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k3} 14700; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm10, %ymm3, %ymm19 {%k4} 14701; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm10, %ymm3, %ymm16 {%k1} 14702; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm20 {%k6} 14703; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm10, %ymm3 {%k6} 14704; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm10 {%k1} 14705; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm11, %ymm9 {%k4} 14706; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,u,2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u,u,u] 14707; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10 14708; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[0,7,14],zero,zero,xmm10[3,10,u,u,u,u,u] 14709; AVX512DQ-BW-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10 14710; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 14711; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm17[6,13] 14712; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm18[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero 14713; AVX512DQ-BW-FCP-NEXT: vporq %xmm11, %xmm21, %xmm11 14714; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 
14715; AVX512DQ-BW-FCP-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload 14716; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm10 {%k2} 14717; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm20[u,u,3,10],zero,zero,zero,xmm20[6,13],zero,zero,xmm20[u,u,u,u,u] 14718; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm20 14719; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,xmm20[1,8,15],zero,zero,xmm20[4,11,u,u,u,u,u] 14720; AVX512DQ-BW-FCP-NEXT: vporq %xmm11, %xmm20, %xmm11 14721; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 14722; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm18[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero 14723; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[0,7,14] 14724; AVX512DQ-BW-FCP-NEXT: vporq %xmm20, %xmm21, %xmm20 14725; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 14726; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm20, %ymm11 {%k2} 14727; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm9, %xmm20 14728; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,xmm20[2,9],zero,zero,zero,xmm20[5,12,u,u,u,u,u] 14729; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,4,11],zero,zero,xmm9[0,7,14],zero,zero,xmm9[u,u,u,u,u] 14730; AVX512DQ-BW-FCP-NEXT: vporq %xmm20, %xmm9, %xmm9 14731; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 14732; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero 14733; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15] 14734; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm17, %xmm17 14735; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17 14736; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm9 {%k2} 14737; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm17 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] 14738; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm17, %zmm17 14739; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm18 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] 14740; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm18, %zmm18 14741; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm20 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] 14742; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm20, %zmm20 14743; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm19, %xmm2 14744; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u] 14745; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[4,11],zero,zero,xmm19[0,7,14],zero,zero,xmm19[u,u,u,u,u,u,u] 14746; AVX512DQ-BW-FCP-NEXT: vporq %xmm2, %xmm19, %xmm2 14747; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm20[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 14748; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [1,2,4,6,0,0,0,0] 14749; AVX512DQ-BW-FCP-NEXT: vmovdqa64 224(%rdi), %ymm20 14750; AVX512DQ-BW-FCP-NEXT: vpermd %ymm20, %ymm19, %ymm19 14751; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u] 14752; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm19, %zmm10, %zmm10 14753; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm2 {%k5} 14754; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm10 14755; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[3,10],zero,zero,zero,xmm10[6,13,u,u,u,u,u,u,u] 14756; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[5,12],zero,zero,xmm16[1,8,15],zero,zero,xmm16[u,u,u,u,u,u,u] 14757; AVX512DQ-BW-FCP-NEXT: vporq %xmm10, %xmm16, %xmm10 14758; AVX512DQ-BW-FCP-NEXT: vpshufb 
{{.*#+}} ymm10 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 14759; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,3,4,6,0,0,0,0] 14760; AVX512DQ-BW-FCP-NEXT: vpermd %ymm20, %ymm16, %ymm16 14761; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u] 14762; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm16, %zmm11, %zmm11 14763; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm10 {%k5} 14764; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u] 14765; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 14766; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u,u,u,u] 14767; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm11, %xmm3 14768; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 {%k5} = ymm17[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 14769; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,3,5,6,0,0,0,0] 14770; AVX512DQ-BW-FCP-NEXT: vpermd %ymm20, %ymm11, %ymm11 14771; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u] 14772; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm9, %zmm9 14773; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm3 {%k5} 14774; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15] 14775; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm9, %zmm0 14776; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm4 {%k1} 14777; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[u,u,u] 14778; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 14779; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10,u,u,u] 14780; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm4, %xmm4 14781; AVX512DQ-BW-FCP-NEXT: movl $4186112, %eax # imm = 0x3FE000 14782; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 14783; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 {%k1} = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] 14784; AVX512DQ-BW-FCP-NEXT: movw $-512, %ax # imm = 0xFE00 14785; AVX512DQ-BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 14786; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm7, %ymm5 {%k1} 14787; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 14788; AVX512DQ-BW-FCP-NEXT: vinserti32x8 $1, %ymm14, %zmm0, %zmm2 {%k1} 14789; AVX512DQ-BW-FCP-NEXT: vinserti32x8 $1, %ymm15, %zmm0, %zmm10 {%k1} 14790; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm0 14791; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15] 14792; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,0,7,14],zero,zero,xmm5[3,10],zero,zero,zero 14793; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, %xmm5, %xmm0 14794; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 14795; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15] 14796; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] 14797; AVX512DQ-BW-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm3 {%k1} 14798; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 14799; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdi 14800; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, (%rsi) 14801; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rdx) 14802; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, (%rcx) 14803; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, (%r8) 14804; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%r9) 14805; 
AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, (%rdi)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, (%rax)
; AVX512DQ-BW-FCP-NEXT:    vzeroupper
; AVX512DQ-BW-FCP-NEXT:    retq
  %wide.vec = load <448 x i8>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <448 x i8> %wide.vec, <448 x i8> poison, <64 x i32> <i32 0, i32 7, i32 14, i32 21, i32 28, i32 35, i32 42, i32 49, i32 56, i32 63, i32 70, i32 77, i32 84, i32 91, i32 98, i32 105, i32 112, i32 119, i32 126, i32 133, i32 140, i32 147, i32 154, i32 161, i32 168, i32 175, i32 182, i32 189, i32 196, i32 203, i32 210, i32 217, i32 224, i32 231, i32 238, i32 245, i32 252, i32 259, i32 266, i32 273, i32 280, i32 287, i32 294, i32 301, i32 308, i32 315, i32 322, i32 329, i32 336, i32 343, i32 350, i32 357, i32 364, i32 371, i32 378, i32 385, i32 392, i32 399, i32 406, i32 413, i32 420, i32 427, i32 434, i32 441>
  %strided.vec1 = shufflevector <448 x i8> %wide.vec, <448 x i8> poison, <64 x i32> <i32 1, i32 8, i32 15, i32 22, i32 29, i32 36, i32 43, i32 50, i32 57, i32 64, i32 71, i32 78, i32 85, i32 92, i32 99, i32 106, i32 113, i32 120, i32 127, i32 134, i32 141, i32 148, i32 155, i32 162, i32 169, i32 176, i32 183, i32 190, i32 197, i32 204, i32 211, i32 218, i32 225, i32 232, i32 239, i32 246, i32 253, i32 260, i32 267, i32 274, i32 281, i32 288, i32 295, i32 302, i32 309, i32 316, i32 323, i32 330, i32 337, i32 344, i32 351, i32 358, i32 365, i32 372, i32 379, i32 386, i32 393, i32 400, i32 407, i32 414, i32 421, i32 428, i32 435, i32 442>
  %strided.vec2 = shufflevector <448 x i8> %wide.vec, <448 x i8> poison, <64 x i32> <i32 2, i32 9, i32 16, i32 23, i32 30, i32 37, i32 44, i32 51, i32 58, i32 65, i32 72, i32 79, i32 86, i32 93, i32 100, i32 107, i32 114, i32 121, i32 128, i32 135, i32 142, i32 149, i32 156, i32 163, i32 170, i32 177, i32 184, i32 191, i32 198, i32 205, i32 212, i32 219, i32 226, i32 233, i32 240, i32 247, i32 254, i32 261, i32 268, i32 275, i32 282, i32 289, i32 296, i32 303, i32 310, i32 317, i32 324, i32 331, i32 338, i32 345, i32 352, i32 359, i32 366, i32 373, i32 380, i32 387, i32 394, i32 401, i32 408, i32 415, i32 422, i32 429, i32 436, i32 443>
  %strided.vec3 = shufflevector <448 x i8> %wide.vec, <448 x i8> poison, <64 x i32> <i32 3, i32 10, i32 17, i32 24, i32 31, i32 38, i32 45, i32 52, i32 59, i32 66, i32 73, i32 80, i32 87, i32 94, i32 101, i32 108, i32 115, i32 122, i32 129, i32 136, i32 143, i32 150, i32 157, i32 164, i32 171, i32 178, i32 185, i32 192, i32 199, i32 206, i32 213, i32 220, i32 227, i32 234, i32 241, i32 248, i32 255, i32 262, i32 269, i32 276, i32 283, i32 290, i32 297, i32 304, i32 311, i32 318, i32 325, i32 332, i32 339, i32 346, i32 353, i32 360, i32 367, i32 374, i32 381, i32 388, i32 395, i32 402, i32 409, i32 416, i32 423, i32 430, i32 437, i32 444>
  %strided.vec4 = shufflevector <448 x i8> %wide.vec, <448 x i8> poison, <64 x i32> <i32 4, i32 11, i32 18, i32 25, i32 32, i32 39, i32 46, i32 53, i32 60, i32 67, i32 74, i32 81, i32 88, i32 95, i32 102, i32 109, i32 116, i32 123, i32 130, i32 137, i32 144, i32 151, i32 158, i32 165, i32 172, i32 179, i32 186, i32 193, i32 200, i32 207, i32 214, i32 221, i32 228, i32 235, i32 242, i32 249, i32 256, i32 263, i32 270, i32 277, i32 284, i32 291, i32 298, i32 305, i32 312, i32 319, i32 326, i32 333, i32 340, i32 347, i32 354, i32 361, i32 368, i32 375, i32 382, i32 389, i32 396, i32 403, i32 410, i32 417, i32 424, i32 431, i32 438, i32 445>
  %strided.vec5 = shufflevector <448 x i8> %wide.vec, <448 x i8> poison, <64 x i32> <i32 5, i32 12, i32 19, i32 26, i32 33, i32 40, i32 47, i32 54, i32 61, i32 68, i32 75, i32 82, i32 89, i32 96, i32 103, i32 110, i32 117, i32 124, i32 131, i32 138, i32 145, i32 152, i32 159, i32 166, i32 173, i32 180, i32 187, i32 194, i32 201, i32 208, i32 215, i32 222, i32 229, i32 236, i32 243, i32 250, i32 257, i32 264, i32 271, i32 278, i32 285, i32 292, i32 299, i32 306, i32 313, i32 320, i32 327, i32 334, i32 341, i32 348, i32 355, i32 362, i32 369, i32 376, i32 383, i32 390, i32 397, i32 404, i32 411, i32 418, i32 425, i32 432, i32 439, i32 446>
  %strided.vec6 = shufflevector <448 x i8> %wide.vec, <448 x i8> poison, <64 x i32> <i32 6, i32 13, i32 20, i32 27, i32 34, i32 41, i32 48, i32 55, i32 62, i32 69, i32 76, i32 83, i32 90, i32 97, i32 104, i32 111, i32 118, i32 125, i32 132, i32 139, i32 146, i32 153, i32 160, i32 167, i32 174, i32 181, i32 188, i32 195, i32 202, i32 209, i32 216, i32 223, i32 230, i32 237, i32 244, i32 251, i32 258, i32 265, i32 272, i32 279, i32 286, i32 293, i32 300, i32 307, i32 314, i32 321, i32 328, i32 335, i32 342, i32 349, i32 356, i32 363, i32 370, i32 377, i32 384, i32 391, i32 398, i32 405, i32 412, i32 419, i32 426, i32 433, i32 440, i32 447>
  store <64 x i8> %strided.vec0, ptr %out.vec0, align 64
  store <64 x i8> %strided.vec1, ptr %out.vec1, align 64
  store <64 x i8> %strided.vec2, ptr %out.vec2, align 64
  store <64 x i8> %strided.vec3, ptr %out.vec3, align 64
  store <64 x i8> %strided.vec4, ptr %out.vec4, align 64
  store <64 x i8> %strided.vec5, ptr %out.vec5, align 64
  store <64 x i8> %strided.vec6, ptr %out.vec6, align 64
  ret void
}