1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE 3; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX 4; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 5; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP 6; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP 7; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512 8; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP 9; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ 10; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP 11; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW 12; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP 13; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW 14; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP 15 16; These patterns are produced by LoopVectorizer for interleaved loads. 
17 18define void @load_i8_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { 19; SSE-LABEL: load_i8_stride6_vf2: 20; SSE: # %bb.0: 21; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 22; SSE-NEXT: movdqa (%rdi), %xmm1 23; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] 24; SSE-NEXT: pand %xmm1, %xmm3 25; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,3,2,3,4,5,6,7] 26; SSE-NEXT: packuswb %xmm2, %xmm2 27; SSE-NEXT: pxor %xmm4, %xmm4 28; SSE-NEXT: movdqa %xmm1, %xmm0 29; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] 30; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,3,2,3] 31; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] 32; SSE-NEXT: packuswb %xmm5, %xmm5 33; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,2,2,3] 34; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,2,2,3,4,5,6,7] 35; SSE-NEXT: packuswb %xmm6, %xmm6 36; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] 37; SSE-NEXT: movdqa %xmm0, %xmm4 38; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] 39; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] 40; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] 41; SSE-NEXT: packuswb %xmm4, %xmm4 42; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] 43; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] 44; SSE-NEXT: packuswb %xmm3, %xmm3 45; SSE-NEXT: psrlq $48, %xmm1 46; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 47; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 48; SSE-NEXT: packuswb %xmm0, %xmm0 49; SSE-NEXT: movd %xmm2, %edi 50; SSE-NEXT: movw %di, (%rsi) 51; SSE-NEXT: movd %xmm5, %esi 52; SSE-NEXT: movw %si, (%rdx) 53; SSE-NEXT: movd %xmm6, %edx 54; SSE-NEXT: movw %dx, (%rcx) 55; SSE-NEXT: movd %xmm4, %ecx 56; SSE-NEXT: movw %cx, (%r8) 57; SSE-NEXT: movd %xmm3, %ecx 58; SSE-NEXT: movw %cx, (%r9) 59; SSE-NEXT: movd %xmm0, %ecx 60; SSE-NEXT: movw %cx, (%rax) 61; SSE-NEXT: retq 62; 63; AVX-LABEL: load_i8_stride6_vf2: 64; AVX: # %bb.0: 65; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 66; AVX-NEXT: vmovdqa (%rdi), %xmm0 67; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 68; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 69; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 70; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 71; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 72; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 73; AVX-NEXT: vpextrw $0, %xmm1, (%rsi) 74; AVX-NEXT: vpextrw $0, %xmm2, (%rdx) 75; AVX-NEXT: vpextrw $0, %xmm3, (%rcx) 76; AVX-NEXT: vpextrw $0, %xmm4, (%r8) 77; AVX-NEXT: vpextrw $0, %xmm5, (%r9) 78; AVX-NEXT: vpextrw $0, %xmm0, (%rax) 79; AVX-NEXT: retq 80; 81; AVX2-LABEL: load_i8_stride6_vf2: 82; AVX2: # %bb.0: 83; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 84; AVX2-NEXT: vmovdqa (%rdi), %xmm0 85; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 86; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 87; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 88; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = 
xmm0[3,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 89; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 90; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 91; AVX2-NEXT: vpextrw $0, %xmm1, (%rsi) 92; AVX2-NEXT: vpextrw $0, %xmm2, (%rdx) 93; AVX2-NEXT: vpextrw $0, %xmm3, (%rcx) 94; AVX2-NEXT: vpextrw $0, %xmm4, (%r8) 95; AVX2-NEXT: vpextrw $0, %xmm5, (%r9) 96; AVX2-NEXT: vpextrw $0, %xmm0, (%rax) 97; AVX2-NEXT: retq 98; 99; AVX2-FP-LABEL: load_i8_stride6_vf2: 100; AVX2-FP: # %bb.0: 101; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 102; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 103; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 104; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 105; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 106; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 107; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 108; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 109; AVX2-FP-NEXT: vpextrw $0, %xmm1, (%rsi) 110; AVX2-FP-NEXT: vpextrw $0, %xmm2, (%rdx) 111; AVX2-FP-NEXT: vpextrw $0, %xmm3, (%rcx) 112; AVX2-FP-NEXT: vpextrw $0, %xmm4, (%r8) 113; AVX2-FP-NEXT: vpextrw $0, %xmm5, (%r9) 114; AVX2-FP-NEXT: vpextrw $0, %xmm0, (%rax) 115; AVX2-FP-NEXT: retq 116; 117; AVX2-FCP-LABEL: load_i8_stride6_vf2: 118; AVX2-FCP: # %bb.0: 119; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 120; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 121; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 122; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 123; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 124; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 125; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 126; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 127; AVX2-FCP-NEXT: vpextrw $0, %xmm1, (%rsi) 128; AVX2-FCP-NEXT: vpextrw $0, %xmm2, (%rdx) 129; AVX2-FCP-NEXT: vpextrw $0, %xmm3, (%rcx) 130; AVX2-FCP-NEXT: vpextrw $0, %xmm4, (%r8) 131; AVX2-FCP-NEXT: vpextrw $0, %xmm5, (%r9) 132; AVX2-FCP-NEXT: vpextrw $0, %xmm0, (%rax) 133; AVX2-FCP-NEXT: retq 134; 135; AVX512-LABEL: load_i8_stride6_vf2: 136; AVX512: # %bb.0: 137; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 138; AVX512-NEXT: vmovdqa (%rdi), %xmm0 139; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 140; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 141; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 142; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 143; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 144; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 145; AVX512-NEXT: vpextrw $0, %xmm1, (%rsi) 146; AVX512-NEXT: vpextrw $0, %xmm2, (%rdx) 147; AVX512-NEXT: vpextrw $0, %xmm3, (%rcx) 148; AVX512-NEXT: vpextrw $0, %xmm4, (%r8) 149; AVX512-NEXT: vpextrw $0, %xmm5, (%r9) 150; AVX512-NEXT: vpextrw $0, %xmm0, (%rax) 151; AVX512-NEXT: retq 152; 153; AVX512-FCP-LABEL: load_i8_stride6_vf2: 154; AVX512-FCP: # %bb.0: 155; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 156; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 157; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 158; AVX512-FCP-NEXT: vpshufb 
{{.*#+}} xmm2 = xmm0[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 159; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 160; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 161; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 162; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 163; AVX512-FCP-NEXT: vpextrw $0, %xmm1, (%rsi) 164; AVX512-FCP-NEXT: vpextrw $0, %xmm2, (%rdx) 165; AVX512-FCP-NEXT: vpextrw $0, %xmm3, (%rcx) 166; AVX512-FCP-NEXT: vpextrw $0, %xmm4, (%r8) 167; AVX512-FCP-NEXT: vpextrw $0, %xmm5, (%r9) 168; AVX512-FCP-NEXT: vpextrw $0, %xmm0, (%rax) 169; AVX512-FCP-NEXT: retq 170; 171; AVX512DQ-LABEL: load_i8_stride6_vf2: 172; AVX512DQ: # %bb.0: 173; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 174; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 175; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 176; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 177; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 178; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 179; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 180; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 181; AVX512DQ-NEXT: vpextrw $0, %xmm1, (%rsi) 182; AVX512DQ-NEXT: vpextrw $0, %xmm2, (%rdx) 183; AVX512DQ-NEXT: vpextrw $0, %xmm3, (%rcx) 184; AVX512DQ-NEXT: vpextrw $0, %xmm4, (%r8) 185; AVX512DQ-NEXT: vpextrw $0, %xmm5, (%r9) 186; AVX512DQ-NEXT: vpextrw $0, %xmm0, (%rax) 187; AVX512DQ-NEXT: retq 188; 189; AVX512DQ-FCP-LABEL: load_i8_stride6_vf2: 190; AVX512DQ-FCP: # %bb.0: 191; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 192; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 193; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 194; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 195; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 196; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 197; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 198; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 199; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm1, (%rsi) 200; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm2, (%rdx) 201; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm3, (%rcx) 202; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm4, (%r8) 203; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm5, (%r9) 204; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm0, (%rax) 205; AVX512DQ-FCP-NEXT: retq 206; 207; AVX512BW-LABEL: load_i8_stride6_vf2: 208; AVX512BW: # %bb.0: 209; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 210; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 211; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 212; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 213; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 214; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 215; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 216; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 217; AVX512BW-NEXT: vpextrw $0, %xmm1, (%rsi) 218; AVX512BW-NEXT: vpextrw $0, %xmm2, (%rdx) 219; AVX512BW-NEXT: vpextrw $0, %xmm3, (%rcx) 220; AVX512BW-NEXT: vpextrw $0, %xmm4, (%r8) 221; AVX512BW-NEXT: vpextrw $0, 
%xmm5, (%r9) 222; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rax) 223; AVX512BW-NEXT: retq 224; 225; AVX512BW-FCP-LABEL: load_i8_stride6_vf2: 226; AVX512BW-FCP: # %bb.0: 227; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 228; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 229; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 230; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 231; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 232; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 233; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 234; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 235; AVX512BW-FCP-NEXT: vpextrw $0, %xmm1, (%rsi) 236; AVX512BW-FCP-NEXT: vpextrw $0, %xmm2, (%rdx) 237; AVX512BW-FCP-NEXT: vpextrw $0, %xmm3, (%rcx) 238; AVX512BW-FCP-NEXT: vpextrw $0, %xmm4, (%r8) 239; AVX512BW-FCP-NEXT: vpextrw $0, %xmm5, (%r9) 240; AVX512BW-FCP-NEXT: vpextrw $0, %xmm0, (%rax) 241; AVX512BW-FCP-NEXT: retq 242; 243; AVX512DQ-BW-LABEL: load_i8_stride6_vf2: 244; AVX512DQ-BW: # %bb.0: 245; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 246; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 247; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 248; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 249; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 250; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 251; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 252; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 253; AVX512DQ-BW-NEXT: vpextrw $0, %xmm1, (%rsi) 254; AVX512DQ-BW-NEXT: vpextrw $0, %xmm2, (%rdx) 255; AVX512DQ-BW-NEXT: vpextrw $0, %xmm3, (%rcx) 256; AVX512DQ-BW-NEXT: vpextrw $0, %xmm4, (%r8) 257; AVX512DQ-BW-NEXT: vpextrw $0, %xmm5, (%r9) 258; AVX512DQ-BW-NEXT: vpextrw $0, %xmm0, (%rax) 259; AVX512DQ-BW-NEXT: retq 260; 261; AVX512DQ-BW-FCP-LABEL: load_i8_stride6_vf2: 262; AVX512DQ-BW-FCP: # %bb.0: 263; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 264; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 265; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 266; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 267; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 268; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 269; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 270; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 271; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm1, (%rsi) 272; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm2, (%rdx) 273; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm3, (%rcx) 274; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm4, (%r8) 275; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm5, (%r9) 276; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm0, (%rax) 277; AVX512DQ-BW-FCP-NEXT: retq 278 %wide.vec = load <12 x i8>, ptr %in.vec, align 64 279 %strided.vec0 = shufflevector <12 x i8> %wide.vec, <12 x i8> poison, <2 x i32> <i32 0, i32 6> 280 %strided.vec1 = shufflevector <12 x i8> %wide.vec, <12 x i8> poison, <2 x i32> <i32 1, i32 7> 281 %strided.vec2 = shufflevector <12 x i8> %wide.vec, <12 x i8> poison, <2 x i32> <i32 2, i32 8> 282 %strided.vec3 = 
shufflevector <12 x i8> %wide.vec, <12 x i8> poison, <2 x i32> <i32 3, i32 9> 283 %strided.vec4 = shufflevector <12 x i8> %wide.vec, <12 x i8> poison, <2 x i32> <i32 4, i32 10> 284 %strided.vec5 = shufflevector <12 x i8> %wide.vec, <12 x i8> poison, <2 x i32> <i32 5, i32 11> 285 store <2 x i8> %strided.vec0, ptr %out.vec0, align 64 286 store <2 x i8> %strided.vec1, ptr %out.vec1, align 64 287 store <2 x i8> %strided.vec2, ptr %out.vec2, align 64 288 store <2 x i8> %strided.vec3, ptr %out.vec3, align 64 289 store <2 x i8> %strided.vec4, ptr %out.vec4, align 64 290 store <2 x i8> %strided.vec5, ptr %out.vec5, align 64 291 ret void 292} 293 294define void @load_i8_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { 295; SSE-LABEL: load_i8_stride6_vf4: 296; SSE: # %bb.0: 297; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 298; SSE-NEXT: movdqa (%rdi), %xmm5 299; SSE-NEXT: movdqa 16(%rdi), %xmm1 300; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] 301; SSE-NEXT: movdqa %xmm5, %xmm2 302; SSE-NEXT: pand %xmm0, %xmm2 303; SSE-NEXT: pandn %xmm1, %xmm0 304; SSE-NEXT: por %xmm2, %xmm0 305; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 306; SSE-NEXT: movdqa {{.*#+}} xmm2 = [16711935,16711935,16711935,16711935] 307; SSE-NEXT: pand %xmm2, %xmm0 308; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] 309; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 310; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] 311; SSE-NEXT: packuswb %xmm0, %xmm0 312; SSE-NEXT: pxor %xmm3, %xmm3 313; SSE-NEXT: movdqa %xmm5, %xmm7 314; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] 315; SSE-NEXT: pandn %xmm1, %xmm4 316; SSE-NEXT: movdqa %xmm1, %xmm6 317; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm5[0,0] 318; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm5[2,3] 319; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,0,65535,65535,0,65535,65535] 320; SSE-NEXT: pand %xmm8, %xmm1 321; SSE-NEXT: pandn %xmm5, %xmm8 322; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] 323; SSE-NEXT: movdqa %xmm5, %xmm9 324; SSE-NEXT: psrld $16, %xmm9 325; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] 326; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] 327; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,7,6,7] 328; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] 329; SSE-NEXT: packuswb %xmm7, %xmm7 330; SSE-NEXT: por %xmm7, %xmm4 331; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm6[2,1,2,3,4,5,6,7] 332; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7] 333; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 334; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] 335; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,2,3,0,4,5,6,7] 336; SSE-NEXT: packuswb %xmm7, %xmm7 337; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] 338; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,2],xmm5[0,3] 339; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0,1,3] 340; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] 341; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,7] 342; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] 343; SSE-NEXT: packuswb %xmm6, %xmm6 
344; SSE-NEXT: por %xmm1, %xmm8 345; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[3,1,2,0] 346; SSE-NEXT: pand %xmm2, %xmm1 347; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] 348; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] 349; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] 350; SSE-NEXT: packuswb %xmm1, %xmm1 351; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3],xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] 352; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm8[0,0] 353; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm8[2,3] 354; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[3,1,2,3,4,5,6,7] 355; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] 356; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 357; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7] 358; SSE-NEXT: packuswb %xmm2, %xmm2 359; SSE-NEXT: movd %xmm0, (%rsi) 360; SSE-NEXT: movd %xmm4, (%rdx) 361; SSE-NEXT: movd %xmm7, (%rcx) 362; SSE-NEXT: movd %xmm6, (%r8) 363; SSE-NEXT: movd %xmm1, (%r9) 364; SSE-NEXT: movd %xmm2, (%rax) 365; SSE-NEXT: retq 366; 367; AVX-LABEL: load_i8_stride6_vf4: 368; AVX: # %bb.0: 369; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 370; AVX-NEXT: vmovdqa (%rdi), %xmm0 371; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 372; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[2,u,u,u,u,u,u,u,u,u,u,u,u] 373; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,6,12],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 374; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 375; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[3,u,u,u,u,u,u,u,u,u,u,u,u] 376; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,7,13],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 377; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 378; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[4,u,u,u,u,u,u,u,u,u,u,u,u] 379; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[2,8,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 380; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4 381; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] 382; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[3,9,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 383; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5 384; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 385; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 386; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] 387; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 388; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 389; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 390; AVX-NEXT: vmovd %xmm2, (%rsi) 391; AVX-NEXT: vmovd %xmm3, (%rdx) 392; AVX-NEXT: vmovd %xmm4, (%rcx) 393; AVX-NEXT: vmovd %xmm5, (%r8) 394; AVX-NEXT: vmovd %xmm6, (%r9) 395; AVX-NEXT: vmovd %xmm0, (%rax) 396; AVX-NEXT: retq 397; 398; AVX2-LABEL: load_i8_stride6_vf4: 399; AVX2: # %bb.0: 400; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 401; AVX2-NEXT: vmovdqa (%rdi), %xmm0 402; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 403; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[2,u,u,u,u,u,u,u,u,u,u,u,u] 404; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,6,12],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 405; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 406; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[3,u,u,u,u,u,u,u,u,u,u,u,u] 407; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,7,13],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 408; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3 409; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = 
zero,zero,zero,xmm1[4,u,u,u,u,u,u,u,u,u,u,u,u] 410; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[2,8,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 411; AVX2-NEXT: vpor %xmm4, %xmm5, %xmm4 412; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] 413; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[3,9,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 414; AVX2-NEXT: vpor %xmm5, %xmm6, %xmm5 415; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 416; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 417; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] 418; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 419; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 420; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 421; AVX2-NEXT: vmovd %xmm2, (%rsi) 422; AVX2-NEXT: vmovd %xmm3, (%rdx) 423; AVX2-NEXT: vmovd %xmm4, (%rcx) 424; AVX2-NEXT: vmovd %xmm5, (%r8) 425; AVX2-NEXT: vmovd %xmm6, (%r9) 426; AVX2-NEXT: vmovd %xmm0, (%rax) 427; AVX2-NEXT: retq 428; 429; AVX2-FP-LABEL: load_i8_stride6_vf4: 430; AVX2-FP: # %bb.0: 431; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 432; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 433; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1 434; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[2,u,u,u,u,u,u,u,u,u,u,u,u] 435; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,6,12],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 436; AVX2-FP-NEXT: vpor %xmm2, %xmm3, %xmm2 437; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[3,u,u,u,u,u,u,u,u,u,u,u,u] 438; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,7,13],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 439; AVX2-FP-NEXT: vpor %xmm3, %xmm4, %xmm3 440; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[4,u,u,u,u,u,u,u,u,u,u,u,u] 441; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[2,8,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 442; AVX2-FP-NEXT: vpor %xmm4, %xmm5, %xmm4 443; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] 444; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[3,9,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 445; AVX2-FP-NEXT: vpor %xmm5, %xmm6, %xmm5 446; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 447; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 448; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] 449; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 450; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 451; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 452; AVX2-FP-NEXT: vmovd %xmm2, (%rsi) 453; AVX2-FP-NEXT: vmovd %xmm3, (%rdx) 454; AVX2-FP-NEXT: vmovd %xmm4, (%rcx) 455; AVX2-FP-NEXT: vmovd %xmm5, (%r8) 456; AVX2-FP-NEXT: vmovd %xmm6, (%r9) 457; AVX2-FP-NEXT: vmovd %xmm0, (%rax) 458; AVX2-FP-NEXT: retq 459; 460; AVX2-FCP-LABEL: load_i8_stride6_vf4: 461; AVX2-FCP: # %bb.0: 462; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 463; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 464; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 465; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[2,u,u,u,u,u,u,u,u,u,u,u,u] 466; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,6,12],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 467; AVX2-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 468; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = 
zero,zero,zero,xmm1[3,u,u,u,u,u,u,u,u,u,u,u,u] 469; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,7,13],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 470; AVX2-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 471; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[4,u,u,u,u,u,u,u,u,u,u,u,u] 472; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[2,8,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 473; AVX2-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 474; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] 475; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[3,9,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 476; AVX2-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 477; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 478; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 479; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] 480; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 481; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 482; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 483; AVX2-FCP-NEXT: vmovd %xmm2, (%rsi) 484; AVX2-FCP-NEXT: vmovd %xmm3, (%rdx) 485; AVX2-FCP-NEXT: vmovd %xmm4, (%rcx) 486; AVX2-FCP-NEXT: vmovd %xmm5, (%r8) 487; AVX2-FCP-NEXT: vmovd %xmm6, (%r9) 488; AVX2-FCP-NEXT: vmovd %xmm0, (%rax) 489; AVX2-FCP-NEXT: retq 490; 491; AVX512-LABEL: load_i8_stride6_vf4: 492; AVX512: # %bb.0: 493; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 494; AVX512-NEXT: vmovdqa (%rdi), %xmm0 495; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 496; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[2,u,u,u,u,u,u,u,u,u,u,u,u] 497; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,6,12],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 498; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2 499; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[3,u,u,u,u,u,u,u,u,u,u,u,u] 500; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,7,13],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 501; AVX512-NEXT: vpor %xmm3, %xmm4, %xmm3 502; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[4,u,u,u,u,u,u,u,u,u,u,u,u] 503; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[2,8,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 504; AVX512-NEXT: vpor %xmm4, %xmm5, %xmm4 505; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] 506; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[3,9,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 507; AVX512-NEXT: vpor %xmm5, %xmm6, %xmm5 508; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 509; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 510; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] 511; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 512; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 513; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 514; AVX512-NEXT: vmovd %xmm2, (%rsi) 515; AVX512-NEXT: vmovd %xmm3, (%rdx) 516; AVX512-NEXT: vmovd %xmm4, (%rcx) 517; AVX512-NEXT: vmovd %xmm5, (%r8) 518; AVX512-NEXT: vmovd %xmm6, (%r9) 519; AVX512-NEXT: vmovd %xmm0, (%rax) 520; AVX512-NEXT: retq 521; 522; AVX512-FCP-LABEL: load_i8_stride6_vf4: 523; AVX512-FCP: # %bb.0: 524; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 525; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 526; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 527; 
AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[2,u,u,u,u,u,u,u,u,u,u,u,u] 528; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,6,12],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 529; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 530; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[3,u,u,u,u,u,u,u,u,u,u,u,u] 531; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,7,13],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 532; AVX512-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 533; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[4,u,u,u,u,u,u,u,u,u,u,u,u] 534; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[2,8,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 535; AVX512-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 536; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] 537; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[3,9,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 538; AVX512-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 539; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 540; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 541; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] 542; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 543; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 544; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 545; AVX512-FCP-NEXT: vmovd %xmm2, (%rsi) 546; AVX512-FCP-NEXT: vmovd %xmm3, (%rdx) 547; AVX512-FCP-NEXT: vmovd %xmm4, (%rcx) 548; AVX512-FCP-NEXT: vmovd %xmm5, (%r8) 549; AVX512-FCP-NEXT: vmovd %xmm6, (%r9) 550; AVX512-FCP-NEXT: vmovd %xmm0, (%rax) 551; AVX512-FCP-NEXT: retq 552; 553; AVX512DQ-LABEL: load_i8_stride6_vf4: 554; AVX512DQ: # %bb.0: 555; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 556; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 557; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1 558; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[2,u,u,u,u,u,u,u,u,u,u,u,u] 559; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,6,12],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 560; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2 561; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[3,u,u,u,u,u,u,u,u,u,u,u,u] 562; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,7,13],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 563; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3 564; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[4,u,u,u,u,u,u,u,u,u,u,u,u] 565; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[2,8,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 566; AVX512DQ-NEXT: vpor %xmm4, %xmm5, %xmm4 567; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] 568; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[3,9,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 569; AVX512DQ-NEXT: vpor %xmm5, %xmm6, %xmm5 570; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 571; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 572; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] 573; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 574; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 575; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 576; AVX512DQ-NEXT: vmovd %xmm2, (%rsi) 577; AVX512DQ-NEXT: vmovd %xmm3, (%rdx) 578; AVX512DQ-NEXT: vmovd %xmm4, 
(%rcx) 579; AVX512DQ-NEXT: vmovd %xmm5, (%r8) 580; AVX512DQ-NEXT: vmovd %xmm6, (%r9) 581; AVX512DQ-NEXT: vmovd %xmm0, (%rax) 582; AVX512DQ-NEXT: retq 583; 584; AVX512DQ-FCP-LABEL: load_i8_stride6_vf4: 585; AVX512DQ-FCP: # %bb.0: 586; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 587; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 588; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 589; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[2,u,u,u,u,u,u,u,u,u,u,u,u] 590; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,6,12],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 591; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 592; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[3,u,u,u,u,u,u,u,u,u,u,u,u] 593; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,7,13],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 594; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 595; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[4,u,u,u,u,u,u,u,u,u,u,u,u] 596; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[2,8,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 597; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 598; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] 599; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[3,9,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 600; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 601; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 602; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 603; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] 604; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 605; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 606; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 607; AVX512DQ-FCP-NEXT: vmovd %xmm2, (%rsi) 608; AVX512DQ-FCP-NEXT: vmovd %xmm3, (%rdx) 609; AVX512DQ-FCP-NEXT: vmovd %xmm4, (%rcx) 610; AVX512DQ-FCP-NEXT: vmovd %xmm5, (%r8) 611; AVX512DQ-FCP-NEXT: vmovd %xmm6, (%r9) 612; AVX512DQ-FCP-NEXT: vmovd %xmm0, (%rax) 613; AVX512DQ-FCP-NEXT: retq 614; 615; AVX512BW-LABEL: load_i8_stride6_vf4: 616; AVX512BW: # %bb.0: 617; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 618; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 619; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 620; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[2,u,u,u,u,u,u,u,u,u,u,u,u] 621; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,6,12],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 622; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2 623; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[3,u,u,u,u,u,u,u,u,u,u,u,u] 624; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,7,13],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 625; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3 626; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[4,u,u,u,u,u,u,u,u,u,u,u,u] 627; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[2,8,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 628; AVX512BW-NEXT: vpor %xmm4, %xmm5, %xmm4 629; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] 630; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[3,9,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 631; AVX512BW-NEXT: vpor %xmm5, %xmm6, %xmm5 632; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 633; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 634; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm6 = 
xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] 635; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 636; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 637; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 638; AVX512BW-NEXT: vmovd %xmm2, (%rsi) 639; AVX512BW-NEXT: vmovd %xmm3, (%rdx) 640; AVX512BW-NEXT: vmovd %xmm4, (%rcx) 641; AVX512BW-NEXT: vmovd %xmm5, (%r8) 642; AVX512BW-NEXT: vmovd %xmm6, (%r9) 643; AVX512BW-NEXT: vmovd %xmm0, (%rax) 644; AVX512BW-NEXT: retq 645; 646; AVX512BW-FCP-LABEL: load_i8_stride6_vf4: 647; AVX512BW-FCP: # %bb.0: 648; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 649; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 650; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 651; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[2,u,u,u,u,u,u,u,u,u,u,u,u] 652; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,6,12],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 653; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 654; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[3,u,u,u,u,u,u,u,u,u,u,u,u] 655; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,7,13],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 656; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 657; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[4,u,u,u,u,u,u,u,u,u,u,u,u] 658; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[2,8,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 659; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 660; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] 661; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[3,9,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 662; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 663; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 664; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 665; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] 666; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 667; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 668; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 669; AVX512BW-FCP-NEXT: vmovd %xmm2, (%rsi) 670; AVX512BW-FCP-NEXT: vmovd %xmm3, (%rdx) 671; AVX512BW-FCP-NEXT: vmovd %xmm4, (%rcx) 672; AVX512BW-FCP-NEXT: vmovd %xmm5, (%r8) 673; AVX512BW-FCP-NEXT: vmovd %xmm6, (%r9) 674; AVX512BW-FCP-NEXT: vmovd %xmm0, (%rax) 675; AVX512BW-FCP-NEXT: retq 676; 677; AVX512DQ-BW-LABEL: load_i8_stride6_vf4: 678; AVX512DQ-BW: # %bb.0: 679; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 680; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 681; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm1 682; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[2,u,u,u,u,u,u,u,u,u,u,u,u] 683; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,6,12],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 684; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm3, %xmm2 685; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[3,u,u,u,u,u,u,u,u,u,u,u,u] 686; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,7,13],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 687; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm4, %xmm3 688; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[4,u,u,u,u,u,u,u,u,u,u,u,u] 689; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[2,8,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 690; 
AVX512DQ-BW-NEXT: vpor %xmm4, %xmm5, %xmm4 691; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] 692; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[3,9,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 693; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm6, %xmm5 694; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 695; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 696; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] 697; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 698; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 699; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 700; AVX512DQ-BW-NEXT: vmovd %xmm2, (%rsi) 701; AVX512DQ-BW-NEXT: vmovd %xmm3, (%rdx) 702; AVX512DQ-BW-NEXT: vmovd %xmm4, (%rcx) 703; AVX512DQ-BW-NEXT: vmovd %xmm5, (%r8) 704; AVX512DQ-BW-NEXT: vmovd %xmm6, (%r9) 705; AVX512DQ-BW-NEXT: vmovd %xmm0, (%rax) 706; AVX512DQ-BW-NEXT: retq 707; 708; AVX512DQ-BW-FCP-LABEL: load_i8_stride6_vf4: 709; AVX512DQ-BW-FCP: # %bb.0: 710; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 711; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 712; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 713; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[2,u,u,u,u,u,u,u,u,u,u,u,u] 714; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,6,12],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 715; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 716; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[3,u,u,u,u,u,u,u,u,u,u,u,u] 717; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,7,13],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 718; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 719; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[4,u,u,u,u,u,u,u,u,u,u,u,u] 720; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[2,8,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 721; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 722; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] 723; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[3,9,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] 724; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 725; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 726; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 727; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] 728; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 729; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 730; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 731; AVX512DQ-BW-FCP-NEXT: vmovd %xmm2, (%rsi) 732; AVX512DQ-BW-FCP-NEXT: vmovd %xmm3, (%rdx) 733; AVX512DQ-BW-FCP-NEXT: vmovd %xmm4, (%rcx) 734; AVX512DQ-BW-FCP-NEXT: vmovd %xmm5, (%r8) 735; AVX512DQ-BW-FCP-NEXT: vmovd %xmm6, (%r9) 736; AVX512DQ-BW-FCP-NEXT: vmovd %xmm0, (%rax) 737; AVX512DQ-BW-FCP-NEXT: retq 738 %wide.vec = load <24 x i8>, ptr %in.vec, align 64 739 %strided.vec0 = shufflevector <24 x i8> %wide.vec, <24 x i8> poison, <4 x i32> <i32 0, i32 6, i32 12, i32 18> 740 %strided.vec1 = shufflevector <24 x i8> %wide.vec, <24 x i8> poison, <4 x i32> <i32 1, i32 7, i32 13, i32 
19> 741 %strided.vec2 = shufflevector <24 x i8> %wide.vec, <24 x i8> poison, <4 x i32> <i32 2, i32 8, i32 14, i32 20> 742 %strided.vec3 = shufflevector <24 x i8> %wide.vec, <24 x i8> poison, <4 x i32> <i32 3, i32 9, i32 15, i32 21> 743 %strided.vec4 = shufflevector <24 x i8> %wide.vec, <24 x i8> poison, <4 x i32> <i32 4, i32 10, i32 16, i32 22> 744 %strided.vec5 = shufflevector <24 x i8> %wide.vec, <24 x i8> poison, <4 x i32> <i32 5, i32 11, i32 17, i32 23> 745 store <4 x i8> %strided.vec0, ptr %out.vec0, align 64 746 store <4 x i8> %strided.vec1, ptr %out.vec1, align 64 747 store <4 x i8> %strided.vec2, ptr %out.vec2, align 64 748 store <4 x i8> %strided.vec3, ptr %out.vec3, align 64 749 store <4 x i8> %strided.vec4, ptr %out.vec4, align 64 750 store <4 x i8> %strided.vec5, ptr %out.vec5, align 64 751 ret void 752} 753 754define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { 755; SSE-LABEL: load_i8_stride6_vf8: 756; SSE: # %bb.0: 757; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 758; SSE-NEXT: movdqa (%rdi), %xmm4 759; SSE-NEXT: movdqa 16(%rdi), %xmm3 760; SSE-NEXT: movdqa 32(%rdi), %xmm0 761; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,65535,0,65535,65535,0] 762; SSE-NEXT: movdqa %xmm4, %xmm1 763; SSE-NEXT: pand %xmm8, %xmm1 764; SSE-NEXT: pandn %xmm3, %xmm8 765; SSE-NEXT: por %xmm1, %xmm8 766; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,2,1,3] 767; SSE-NEXT: movdqa {{.*#+}} xmm5 = [16711935,16711935,16711935,16711935] 768; SSE-NEXT: pand %xmm5, %xmm1 769; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] 770; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] 771; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] 772; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,7,6,7] 773; SSE-NEXT: packuswb %xmm6, %xmm6 774; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] 775; SSE-NEXT: pand %xmm1, %xmm6 776; SSE-NEXT: movdqa %xmm0, %xmm7 777; SSE-NEXT: pand %xmm5, %xmm7 778; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,1,2,1] 779; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,4,5,6,5] 780; SSE-NEXT: packuswb %xmm9, %xmm9 781; SSE-NEXT: movdqa %xmm1, %xmm2 782; SSE-NEXT: pandn %xmm9, %xmm2 783; SSE-NEXT: por %xmm6, %xmm2 784; SSE-NEXT: pxor %xmm6, %xmm6 785; SSE-NEXT: movdqa %xmm8, %xmm9 786; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15] 787; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,1,0,3] 788; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,1,1,1,4,5,6,7] 789; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,7,6,7] 790; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,65535,0,0,65535,65535] 791; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] 792; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] 793; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3] 794; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,3,2,0,4,5,6,7] 795; SSE-NEXT: pand %xmm10, %xmm8 796; SSE-NEXT: pandn %xmm9, %xmm10 797; SSE-NEXT: por %xmm8, %xmm10 798; SSE-NEXT: packuswb %xmm10, %xmm10 799; SSE-NEXT: pand %xmm1, %xmm10 800; SSE-NEXT: movdqa %xmm0, %xmm8 801; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] 802; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] 803; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[2,2,3,3] 804; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] 805; SSE-NEXT: packuswb %xmm9, %xmm9 806; SSE-NEXT: pandn %xmm9, %xmm1 807; SSE-NEXT: por %xmm10, %xmm1 808; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,0,65535,65535,0,65535,65535] 809; SSE-NEXT: movdqa %xmm11, %xmm9 810; SSE-NEXT: pandn %xmm3, %xmm9 811; SSE-NEXT: movdqa %xmm4, %xmm12 812; SSE-NEXT: pand %xmm11, %xmm12 813; SSE-NEXT: por %xmm9, %xmm12 814; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm12[2,1,2,3,4,5,6,7] 815; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7] 816; SSE-NEXT: pand %xmm5, %xmm9 817; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,3,2,3] 818; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,2,3,0,4,5,6,7] 819; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm9[0,1,2,3,5,5,5,5] 820; SSE-NEXT: packuswb %xmm13, %xmm13 821; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255] 822; SSE-NEXT: pand %xmm9, %xmm13 823; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm7[0,3,2,3,4,5,6,7] 824; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,0,3] 825; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm10[0,1,2,3,4,4,5,6] 826; SSE-NEXT: packuswb %xmm14, %xmm14 827; SSE-NEXT: movdqa %xmm9, %xmm10 828; SSE-NEXT: pandn %xmm14, %xmm10 829; SSE-NEXT: por %xmm13, %xmm10 830; SSE-NEXT: movdqa %xmm12, %xmm13 831; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3],xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7] 832; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] 833; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[3,1,2,1,4,5,6,7] 834; SSE-NEXT: movdqa {{.*#+}} xmm14 = [0,65535,65535,0,65535,65535,65535,65535] 835; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm6[8],xmm12[9],xmm6[9],xmm12[10],xmm6[10],xmm12[11],xmm6[11],xmm12[12],xmm6[12],xmm12[13],xmm6[13],xmm12[14],xmm6[14],xmm12[15],xmm6[15] 836; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,3,2,1] 837; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[0,1,3,3,4,5,6,7] 838; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,7,7,7] 839; SSE-NEXT: pand %xmm14, %xmm12 840; SSE-NEXT: pandn %xmm13, %xmm14 841; SSE-NEXT: por %xmm12, %xmm14 842; SSE-NEXT: packuswb %xmm14, %xmm14 843; SSE-NEXT: pand %xmm9, %xmm14 844; SSE-NEXT: movdqa %xmm8, %xmm12 845; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm0[3,0] 846; SSE-NEXT: movaps %xmm0, %xmm13 847; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm12[0,2] 848; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm13[0,1,2,3,7,5,6,7] 849; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,1,0,2] 850; SSE-NEXT: packuswb %xmm13, %xmm13 851; SSE-NEXT: movdqa %xmm9, %xmm12 852; SSE-NEXT: pandn %xmm13, %xmm12 853; SSE-NEXT: por %xmm14, %xmm12 854; SSE-NEXT: pand %xmm11, %xmm3 855; SSE-NEXT: pandn %xmm4, %xmm11 856; SSE-NEXT: por %xmm3, %xmm11 857; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[3,1,2,0] 858; SSE-NEXT: pand %xmm5, %xmm3 859; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] 860; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,0,3] 861; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,1,0,3,4,5,6,7] 862; SSE-NEXT: packuswb %xmm4, %xmm4 863; SSE-NEXT: pand %xmm9, %xmm4 864; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,7,6,7] 865; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,1,0,2] 866; SSE-NEXT: packuswb %xmm5, %xmm5 867; SSE-NEXT: movdqa %xmm9, %xmm3 868; SSE-NEXT: pandn %xmm5, %xmm3 869; SSE-NEXT: 
por %xmm4, %xmm3 870; SSE-NEXT: movdqa %xmm11, %xmm4 871; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] 872; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] 873; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] 874; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,65535] 875; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3],xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7] 876; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm11[0,1,2,3,7,5,6,7] 877; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] 878; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,1,2,4,5,6,7] 879; SSE-NEXT: pand %xmm5, %xmm6 880; SSE-NEXT: pandn %xmm4, %xmm5 881; SSE-NEXT: por %xmm6, %xmm5 882; SSE-NEXT: packuswb %xmm5, %xmm5 883; SSE-NEXT: pand %xmm9, %xmm5 884; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm8[0,0] 885; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm8[2,3] 886; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] 887; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] 888; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,7] 889; SSE-NEXT: packuswb %xmm0, %xmm0 890; SSE-NEXT: pandn %xmm0, %xmm9 891; SSE-NEXT: por %xmm5, %xmm9 892; SSE-NEXT: movq %xmm2, (%rsi) 893; SSE-NEXT: movq %xmm1, (%rdx) 894; SSE-NEXT: movq %xmm10, (%rcx) 895; SSE-NEXT: movq %xmm12, (%r8) 896; SSE-NEXT: movq %xmm3, (%r9) 897; SSE-NEXT: movq %xmm9, (%rax) 898; SSE-NEXT: retq 899; 900; AVX-LABEL: load_i8_stride6_vf8: 901; AVX: # %bb.0: 902; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 903; AVX-NEXT: vmovdqa (%rdi), %xmm1 904; AVX-NEXT: vmovdqa 16(%rdi), %xmm2 905; AVX-NEXT: vmovdqa 32(%rdi), %xmm0 906; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm2[2,8,14,u,u,u,u,u,u,u,u,u,u] 907; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u] 908; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 909; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 910; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5,6,7] 911; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,xmm0[4,10,u,u,u,u,u,u,u,u] 912; AVX-NEXT: vpor %xmm5, %xmm3, %xmm3 913; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm2[3,9,15,u,u,u,u,u,u,u,u,u,u] 914; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u] 915; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5 916; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4,5,6,7] 917; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,xmm0[5,11,u,u,u,u,u,u,u,u] 918; AVX-NEXT: vpor %xmm5, %xmm4, %xmm4 919; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u,u,u,u] 920; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[2,8,14],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] 921; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5 922; AVX-NEXT: vmovq {{.*#+}} xmm6 = [0,1,2,3,4,128,128,128,0,0,0,0,0,0,0,0] 923; AVX-NEXT: vpshufb %xmm6, %xmm5, %xmm5 924; AVX-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,xmm0[0,6,12,u,u,u,u,u,u,u,u] 925; AVX-NEXT: vpor %xmm7, %xmm5, %xmm5 926; AVX-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u,u,u,u] 927; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[3,9,15],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] 928; AVX-NEXT: vpor %xmm7, %xmm8, %xmm7 929; AVX-NEXT: vpshufb %xmm6, %xmm7, %xmm7 930; AVX-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,xmm0[1,7,13,u,u,u,u,u,u,u,u] 931; AVX-NEXT: vpor %xmm7, %xmm8, %xmm7 932; 
AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[4,10],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] 933; AVX-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u,u,u,u] 934; AVX-NEXT: vpor %xmm8, %xmm9, %xmm8 935; AVX-NEXT: vpshufb %xmm6, %xmm8, %xmm8 936; AVX-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] 937; AVX-NEXT: vpor %xmm9, %xmm8, %xmm8 938; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,11],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] 939; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u,u,u,u] 940; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 941; AVX-NEXT: vpshufb %xmm6, %xmm1, %xmm1 942; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] 943; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 944; AVX-NEXT: vmovq %xmm3, (%rsi) 945; AVX-NEXT: vmovq %xmm4, (%rdx) 946; AVX-NEXT: vmovq %xmm5, (%rcx) 947; AVX-NEXT: vmovq %xmm7, (%r8) 948; AVX-NEXT: vmovq %xmm8, (%r9) 949; AVX-NEXT: vmovq %xmm0, (%rax) 950; AVX-NEXT: retq 951; 952; AVX2-LABEL: load_i8_stride6_vf8: 953; AVX2: # %bb.0: 954; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 955; AVX2-NEXT: vmovdqa (%rdi), %ymm0 956; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 957; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] 958; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 959; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] 960; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] 961; AVX2-NEXT: vpor %xmm4, %xmm5, %xmm4 962; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] 963; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] 964; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 965; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] 966; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5 967; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] 968; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] 969; AVX2-NEXT: vpor %xmm6, %xmm7, %xmm6 970; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] 971; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] 972; AVX2-NEXT: vpor %xmm5, %xmm3, %xmm3 973; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] 974; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 975; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] 976; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] 977; AVX2-NEXT: vpor %xmm5, %xmm7, %xmm5 978; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] 979; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] 980; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 981; AVX2-NEXT: vmovq %xmm4, (%rsi) 982; AVX2-NEXT: vmovq %xmm2, (%rdx) 983; AVX2-NEXT: vmovq %xmm6, (%rcx) 984; AVX2-NEXT: vmovq %xmm3, (%r8) 985; AVX2-NEXT: vmovq %xmm5, (%r9) 986; AVX2-NEXT: vmovq %xmm0, (%rax) 987; AVX2-NEXT: vzeroupper 988; AVX2-NEXT: retq 989; 990; AVX2-FP-LABEL: load_i8_stride6_vf8: 991; AVX2-FP: # %bb.0: 992; AVX2-FP-NEXT: movq 
{{[0-9]+}}(%rsp), %rax 993; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0 994; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1 995; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] 996; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 997; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] 998; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] 999; AVX2-FP-NEXT: vpor %xmm4, %xmm5, %xmm4 1000; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] 1001; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] 1002; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2 1003; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] 1004; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm5 1005; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] 1006; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] 1007; AVX2-FP-NEXT: vpor %xmm6, %xmm7, %xmm6 1008; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] 1009; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] 1010; AVX2-FP-NEXT: vpor %xmm5, %xmm3, %xmm3 1011; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] 1012; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1 1013; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] 1014; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] 1015; AVX2-FP-NEXT: vpor %xmm5, %xmm7, %xmm5 1016; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] 1017; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] 1018; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0 1019; AVX2-FP-NEXT: vmovq %xmm4, (%rsi) 1020; AVX2-FP-NEXT: vmovq %xmm2, (%rdx) 1021; AVX2-FP-NEXT: vmovq %xmm6, (%rcx) 1022; AVX2-FP-NEXT: vmovq %xmm3, (%r8) 1023; AVX2-FP-NEXT: vmovq %xmm5, (%r9) 1024; AVX2-FP-NEXT: vmovq %xmm0, (%rax) 1025; AVX2-FP-NEXT: vzeroupper 1026; AVX2-FP-NEXT: retq 1027; 1028; AVX2-FCP-LABEL: load_i8_stride6_vf8: 1029; AVX2-FCP: # %bb.0: 1030; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1031; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0 1032; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 1033; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] 1034; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 1035; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] 1036; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] 1037; AVX2-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 1038; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] 1039; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] 1040; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 1041; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] 1042; 
AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 1043; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] 1044; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] 1045; AVX2-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 1046; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] 1047; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] 1048; AVX2-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3 1049; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] 1050; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 1051; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] 1052; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] 1053; AVX2-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 1054; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] 1055; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] 1056; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 1057; AVX2-FCP-NEXT: vmovq %xmm4, (%rsi) 1058; AVX2-FCP-NEXT: vmovq %xmm2, (%rdx) 1059; AVX2-FCP-NEXT: vmovq %xmm6, (%rcx) 1060; AVX2-FCP-NEXT: vmovq %xmm3, (%r8) 1061; AVX2-FCP-NEXT: vmovq %xmm5, (%r9) 1062; AVX2-FCP-NEXT: vmovq %xmm0, (%rax) 1063; AVX2-FCP-NEXT: vzeroupper 1064; AVX2-FCP-NEXT: retq 1065; 1066; AVX512-LABEL: load_i8_stride6_vf8: 1067; AVX512: # %bb.0: 1068; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 1069; AVX512-NEXT: vmovdqa (%rdi), %ymm0 1070; AVX512-NEXT: vmovdqa 32(%rdi), %ymm1 1071; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] 1072; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 1073; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] 1074; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] 1075; AVX512-NEXT: vpor %xmm4, %xmm5, %xmm4 1076; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] 1077; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] 1078; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2 1079; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] 1080; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm5 1081; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] 1082; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] 1083; AVX512-NEXT: vpor %xmm6, %xmm7, %xmm6 1084; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] 1085; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] 1086; AVX512-NEXT: vpor %xmm5, %xmm3, %xmm3 1087; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] 1088; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 1089; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] 1090; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] 1091; AVX512-NEXT: vpor 
%xmm5, %xmm7, %xmm5 1092; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] 1093; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] 1094; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 1095; AVX512-NEXT: vmovq %xmm4, (%rsi) 1096; AVX512-NEXT: vmovq %xmm2, (%rdx) 1097; AVX512-NEXT: vmovq %xmm6, (%rcx) 1098; AVX512-NEXT: vmovq %xmm3, (%r8) 1099; AVX512-NEXT: vmovq %xmm5, (%r9) 1100; AVX512-NEXT: vmovq %xmm0, (%rax) 1101; AVX512-NEXT: vzeroupper 1102; AVX512-NEXT: retq 1103; 1104; AVX512-FCP-LABEL: load_i8_stride6_vf8: 1105; AVX512-FCP: # %bb.0: 1106; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1107; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0 1108; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 1109; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] 1110; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 1111; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] 1112; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] 1113; AVX512-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 1114; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] 1115; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] 1116; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 1117; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] 1118; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 1119; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] 1120; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] 1121; AVX512-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 1122; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] 1123; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] 1124; AVX512-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3 1125; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] 1126; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 1127; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] 1128; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] 1129; AVX512-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 1130; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] 1131; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] 1132; AVX512-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 1133; AVX512-FCP-NEXT: vmovq %xmm4, (%rsi) 1134; AVX512-FCP-NEXT: vmovq %xmm2, (%rdx) 1135; AVX512-FCP-NEXT: vmovq %xmm6, (%rcx) 1136; AVX512-FCP-NEXT: vmovq %xmm3, (%r8) 1137; AVX512-FCP-NEXT: vmovq %xmm5, (%r9) 1138; AVX512-FCP-NEXT: vmovq %xmm0, (%rax) 1139; AVX512-FCP-NEXT: vzeroupper 1140; AVX512-FCP-NEXT: retq 1141; 1142; AVX512DQ-LABEL: load_i8_stride6_vf8: 1143; AVX512DQ: # %bb.0: 1144; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 1145; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 1146; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1 1147; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] 1148; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 1149; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] 1150; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] 1151; AVX512DQ-NEXT: vpor %xmm4, %xmm5, %xmm4 1152; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] 1153; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] 1154; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 1155; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] 1156; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm5 1157; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] 1158; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] 1159; AVX512DQ-NEXT: vpor %xmm6, %xmm7, %xmm6 1160; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] 1161; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] 1162; AVX512DQ-NEXT: vpor %xmm5, %xmm3, %xmm3 1163; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] 1164; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 1165; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] 1166; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] 1167; AVX512DQ-NEXT: vpor %xmm5, %xmm7, %xmm5 1168; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] 1169; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] 1170; AVX512DQ-NEXT: vpor %xmm1, %xmm0, %xmm0 1171; AVX512DQ-NEXT: vmovq %xmm4, (%rsi) 1172; AVX512DQ-NEXT: vmovq %xmm2, (%rdx) 1173; AVX512DQ-NEXT: vmovq %xmm6, (%rcx) 1174; AVX512DQ-NEXT: vmovq %xmm3, (%r8) 1175; AVX512DQ-NEXT: vmovq %xmm5, (%r9) 1176; AVX512DQ-NEXT: vmovq %xmm0, (%rax) 1177; AVX512DQ-NEXT: vzeroupper 1178; AVX512DQ-NEXT: retq 1179; 1180; AVX512DQ-FCP-LABEL: load_i8_stride6_vf8: 1181; AVX512DQ-FCP: # %bb.0: 1182; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1183; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0 1184; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 1185; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] 1186; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 1187; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] 1188; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] 1189; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 1190; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] 1191; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] 1192; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 1193; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] 1194; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 1195; 
AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] 1196; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] 1197; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 1198; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] 1199; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] 1200; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3 1201; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] 1202; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 1203; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] 1204; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] 1205; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 1206; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] 1207; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] 1208; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 1209; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rsi) 1210; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rdx) 1211; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%rcx) 1212; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%r8) 1213; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r9) 1214; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rax) 1215; AVX512DQ-FCP-NEXT: vzeroupper 1216; AVX512DQ-FCP-NEXT: retq 1217; 1218; AVX512BW-LABEL: load_i8_stride6_vf8: 1219; AVX512BW: # %bb.0: 1220; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 1221; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 1222; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1 1223; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] 1224; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 1225; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] 1226; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] 1227; AVX512BW-NEXT: vpor %xmm4, %xmm5, %xmm4 1228; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] 1229; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] 1230; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 1231; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] 1232; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm5 1233; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] 1234; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] 1235; AVX512BW-NEXT: vpor %xmm6, %xmm7, %xmm6 1236; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] 1237; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] 1238; AVX512BW-NEXT: vpor %xmm5, %xmm3, %xmm3 1239; AVX512BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] 1240; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 1241; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] 1242; AVX512BW-NEXT: vpshufb 
{{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] 1243; AVX512BW-NEXT: vpor %xmm5, %xmm7, %xmm5 1244; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] 1245; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] 1246; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 1247; AVX512BW-NEXT: vmovq %xmm4, (%rsi) 1248; AVX512BW-NEXT: vmovq %xmm2, (%rdx) 1249; AVX512BW-NEXT: vmovq %xmm6, (%rcx) 1250; AVX512BW-NEXT: vmovq %xmm3, (%r8) 1251; AVX512BW-NEXT: vmovq %xmm5, (%r9) 1252; AVX512BW-NEXT: vmovq %xmm0, (%rax) 1253; AVX512BW-NEXT: vzeroupper 1254; AVX512BW-NEXT: retq 1255; 1256; AVX512BW-FCP-LABEL: load_i8_stride6_vf8: 1257; AVX512BW-FCP: # %bb.0: 1258; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1259; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 1260; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 1261; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] 1262; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 1263; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] 1264; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] 1265; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 1266; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] 1267; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] 1268; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 1269; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] 1270; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 1271; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] 1272; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] 1273; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 1274; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] 1275; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] 1276; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3 1277; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] 1278; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 1279; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] 1280; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] 1281; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 1282; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] 1283; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] 1284; AVX512BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 1285; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rsi) 1286; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx) 1287; AVX512BW-FCP-NEXT: vmovq %xmm6, (%rcx) 1288; AVX512BW-FCP-NEXT: vmovq %xmm3, (%r8) 1289; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r9) 1290; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rax) 1291; AVX512BW-FCP-NEXT: vzeroupper 1292; AVX512BW-FCP-NEXT: retq 1293; 1294; AVX512DQ-BW-LABEL: load_i8_stride6_vf8: 1295; AVX512DQ-BW: # %bb.0: 1296; AVX512DQ-BW-NEXT: movq 
{{[0-9]+}}(%rsp), %rax 1297; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0 1298; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm1 1299; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] 1300; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3 1301; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] 1302; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] 1303; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm5, %xmm4 1304; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] 1305; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] 1306; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 1307; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] 1308; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm5 1309; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] 1310; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] 1311; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm7, %xmm6 1312; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] 1313; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] 1314; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm3, %xmm3 1315; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] 1316; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm1 1317; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] 1318; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] 1319; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm7, %xmm5 1320; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] 1321; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] 1322; AVX512DQ-BW-NEXT: vpor %xmm1, %xmm0, %xmm0 1323; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rsi) 1324; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx) 1325; AVX512DQ-BW-NEXT: vmovq %xmm6, (%rcx) 1326; AVX512DQ-BW-NEXT: vmovq %xmm3, (%r8) 1327; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r9) 1328; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rax) 1329; AVX512DQ-BW-NEXT: vzeroupper 1330; AVX512DQ-BW-NEXT: retq 1331; 1332; AVX512DQ-BW-FCP-LABEL: load_i8_stride6_vf8: 1333; AVX512DQ-BW-FCP: # %bb.0: 1334; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1335; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 1336; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 1337; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] 1338; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 1339; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u] 1340; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u] 1341; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 1342; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u] 1343; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = 
xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u] 1344; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 1345; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] 1346; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 1347; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] 1348; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u] 1349; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 1350; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u] 1351; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u] 1352; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3 1353; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] 1354; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 1355; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] 1356; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u] 1357; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 1358; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] 1359; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u] 1360; AVX512DQ-BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 1361; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rsi) 1362; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx) 1363; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%rcx) 1364; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%r8) 1365; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r9) 1366; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rax) 1367; AVX512DQ-BW-FCP-NEXT: vzeroupper 1368; AVX512DQ-BW-FCP-NEXT: retq 1369 %wide.vec = load <48 x i8>, ptr %in.vec, align 64 1370 %strided.vec0 = shufflevector <48 x i8> %wide.vec, <48 x i8> poison, <8 x i32> <i32 0, i32 6, i32 12, i32 18, i32 24, i32 30, i32 36, i32 42> 1371 %strided.vec1 = shufflevector <48 x i8> %wide.vec, <48 x i8> poison, <8 x i32> <i32 1, i32 7, i32 13, i32 19, i32 25, i32 31, i32 37, i32 43> 1372 %strided.vec2 = shufflevector <48 x i8> %wide.vec, <48 x i8> poison, <8 x i32> <i32 2, i32 8, i32 14, i32 20, i32 26, i32 32, i32 38, i32 44> 1373 %strided.vec3 = shufflevector <48 x i8> %wide.vec, <48 x i8> poison, <8 x i32> <i32 3, i32 9, i32 15, i32 21, i32 27, i32 33, i32 39, i32 45> 1374 %strided.vec4 = shufflevector <48 x i8> %wide.vec, <48 x i8> poison, <8 x i32> <i32 4, i32 10, i32 16, i32 22, i32 28, i32 34, i32 40, i32 46> 1375 %strided.vec5 = shufflevector <48 x i8> %wide.vec, <48 x i8> poison, <8 x i32> <i32 5, i32 11, i32 17, i32 23, i32 29, i32 35, i32 41, i32 47> 1376 store <8 x i8> %strided.vec0, ptr %out.vec0, align 64 1377 store <8 x i8> %strided.vec1, ptr %out.vec1, align 64 1378 store <8 x i8> %strided.vec2, ptr %out.vec2, align 64 1379 store <8 x i8> %strided.vec3, ptr %out.vec3, align 64 1380 store <8 x i8> %strided.vec4, ptr %out.vec4, align 64 1381 store <8 x i8> %strided.vec5, ptr %out.vec5, align 64 1382 ret void 1383} 1384 1385define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { 1386; SSE-LABEL: load_i8_stride6_vf16: 1387; SSE: # %bb.0: 1388; SSE-NEXT: movdqa 
64(%rdi), %xmm10 1389; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1390; SSE-NEXT: movdqa (%rdi), %xmm5 1391; SSE-NEXT: movdqa 16(%rdi), %xmm1 1392; SSE-NEXT: movdqa 32(%rdi), %xmm7 1393; SSE-NEXT: movdqa 48(%rdi), %xmm6 1394; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,0,65535,65535] 1395; SSE-NEXT: movdqa %xmm4, %xmm0 1396; SSE-NEXT: pandn %xmm7, %xmm0 1397; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,0,65535,65535,0] 1398; SSE-NEXT: movdqa %xmm2, %xmm3 1399; SSE-NEXT: pandn %xmm6, %xmm3 1400; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1401; SSE-NEXT: movdqa %xmm4, %xmm3 1402; SSE-NEXT: pandn %xmm6, %xmm3 1403; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1404; SSE-NEXT: pand %xmm4, %xmm6 1405; SSE-NEXT: por %xmm0, %xmm6 1406; SSE-NEXT: movdqa %xmm6, %xmm0 1407; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1408; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,3,2,3,4,5,6,7] 1409; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] 1410; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] 1411; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] 1412; SSE-NEXT: packuswb %xmm3, %xmm0 1413; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,0,0,65535,65535] 1414; SSE-NEXT: movdqa %xmm8, %xmm9 1415; SSE-NEXT: pandn %xmm0, %xmm9 1416; SSE-NEXT: movdqa %xmm2, %xmm0 1417; SSE-NEXT: movdqa %xmm2, %xmm11 1418; SSE-NEXT: pandn %xmm1, %xmm11 1419; SSE-NEXT: pand %xmm4, %xmm10 1420; SSE-NEXT: movdqa %xmm4, %xmm2 1421; SSE-NEXT: pandn %xmm1, %xmm2 1422; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1423; SSE-NEXT: movdqa %xmm1, %xmm2 1424; SSE-NEXT: movdqa %xmm5, %xmm14 1425; SSE-NEXT: pand %xmm4, %xmm14 1426; SSE-NEXT: movdqa 80(%rdi), %xmm3 1427; SSE-NEXT: movdqa %xmm3, %xmm13 1428; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1429; SSE-NEXT: pand %xmm4, %xmm13 1430; SSE-NEXT: movdqa %xmm7, %xmm15 1431; SSE-NEXT: pand %xmm4, %xmm7 1432; SSE-NEXT: pand %xmm4, %xmm2 1433; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1434; SSE-NEXT: movdqa %xmm4, %xmm12 1435; SSE-NEXT: movdqa %xmm4, %xmm2 1436; SSE-NEXT: pandn %xmm5, %xmm4 1437; SSE-NEXT: pand %xmm0, %xmm5 1438; SSE-NEXT: por %xmm11, %xmm5 1439; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm5[0,2,1,3] 1440; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] 1441; SSE-NEXT: pand %xmm1, %xmm11 1442; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,6,5,6,7] 1443; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,1,3] 1444; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,3,2,1,4,5,6,7] 1445; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,7,6,7] 1446; SSE-NEXT: packuswb %xmm0, %xmm0 1447; SSE-NEXT: pand %xmm8, %xmm0 1448; SSE-NEXT: por %xmm9, %xmm0 1449; SSE-NEXT: pandn %xmm3, %xmm12 1450; SSE-NEXT: por %xmm12, %xmm10 1451; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm10[3,1,2,0] 1452; SSE-NEXT: pand %xmm1, %xmm9 1453; SSE-NEXT: movdqa %xmm1, %xmm3 1454; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[2,1,2,3,4,5,6,7] 1455; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,3,2,0] 1456; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,5] 1457; SSE-NEXT: packuswb %xmm9, %xmm9 1458; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] 1459; SSE-NEXT: movdqa %xmm11, %xmm12 1460; SSE-NEXT: pandn %xmm9, %xmm12 1461; SSE-NEXT: pand %xmm11, %xmm0 1462; SSE-NEXT: por %xmm0, %xmm12 1463; SSE-NEXT: pxor %xmm9, %xmm9 1464; SSE-NEXT: movdqa %xmm6, %xmm0 1465; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = 
xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] 1466; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] 1467; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,2,3,3] 1468; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 1469; SSE-NEXT: psrld $16, %xmm0 1470; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] 1471; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7] 1472; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] 1473; SSE-NEXT: packuswb %xmm6, %xmm1 1474; SSE-NEXT: movdqa %xmm5, %xmm0 1475; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] 1476; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] 1477; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] 1478; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7] 1479; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] 1480; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] 1481; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] 1482; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,3,2,0,4,5,6,7] 1483; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,0,0,65535,65535] 1484; SSE-NEXT: pand %xmm6, %xmm5 1485; SSE-NEXT: pandn %xmm0, %xmm6 1486; SSE-NEXT: por %xmm5, %xmm6 1487; SSE-NEXT: packuswb %xmm6, %xmm6 1488; SSE-NEXT: pand %xmm8, %xmm6 1489; SSE-NEXT: pandn %xmm1, %xmm8 1490; SSE-NEXT: por %xmm8, %xmm6 1491; SSE-NEXT: movdqa %xmm10, %xmm0 1492; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] 1493; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] 1494; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] 1495; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[3,1,2,3,4,5,6,7] 1496; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] 1497; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,4] 1498; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,0,65535] 1499; SSE-NEXT: pand %xmm5, %xmm1 1500; SSE-NEXT: pandn %xmm0, %xmm5 1501; SSE-NEXT: por %xmm1, %xmm5 1502; SSE-NEXT: packuswb %xmm5, %xmm0 1503; SSE-NEXT: movdqa %xmm11, %xmm10 1504; SSE-NEXT: pandn %xmm0, %xmm10 1505; SSE-NEXT: pand %xmm11, %xmm6 1506; SSE-NEXT: por %xmm6, %xmm10 1507; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm15 1508; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 1509; SSE-NEXT: movdqa %xmm15, %xmm0 1510; SSE-NEXT: pand %xmm3, %xmm0 1511; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,7,6,7] 1512; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1513; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7] 1514; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] 1515; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] 1516; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] 1517; SSE-NEXT: packuswb %xmm1, %xmm0 1518; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload 1519; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[2,1,2,3,4,5,6,7] 1520; SSE-NEXT: 
pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] 1521; SSE-NEXT: pand %xmm3, %xmm1 1522; SSE-NEXT: movdqa %xmm3, %xmm8 1523; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] 1524; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] 1525; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] 1526; SSE-NEXT: packuswb %xmm1, %xmm1 1527; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] 1528; SSE-NEXT: movdqa %xmm3, %xmm5 1529; SSE-NEXT: pandn %xmm1, %xmm5 1530; SSE-NEXT: pand %xmm3, %xmm0 1531; SSE-NEXT: por %xmm0, %xmm5 1532; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 1533; SSE-NEXT: pandn %xmm6, %xmm2 1534; SSE-NEXT: por %xmm2, %xmm13 1535; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[0,3,2,3,4,5,6,7] 1536; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] 1537; SSE-NEXT: pand %xmm8, %xmm0 1538; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] 1539; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] 1540; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] 1541; SSE-NEXT: packuswb %xmm0, %xmm0 1542; SSE-NEXT: movdqa %xmm11, %xmm8 1543; SSE-NEXT: pandn %xmm0, %xmm8 1544; SSE-NEXT: pand %xmm11, %xmm5 1545; SSE-NEXT: por %xmm5, %xmm8 1546; SSE-NEXT: movdqa %xmm15, %xmm0 1547; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] 1548; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm9[8],xmm15[9],xmm9[9],xmm15[10],xmm9[10],xmm15[11],xmm9[11],xmm15[12],xmm9[12],xmm15[13],xmm9[13],xmm15[14],xmm9[14],xmm15[15],xmm9[15] 1549; SSE-NEXT: movdqa %xmm15, %xmm1 1550; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] 1551; SSE-NEXT: movaps %xmm0, %xmm2 1552; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] 1553; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm15[0,0] 1554; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm15[2,3] 1555; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,7,5,6,7] 1556; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] 1557; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] 1558; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] 1559; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] 1560; SSE-NEXT: packuswb %xmm0, %xmm1 1561; SSE-NEXT: movdqa %xmm14, %xmm0 1562; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] 1563; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] 1564; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] 1565; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm9[8],xmm14[9],xmm9[9],xmm14[10],xmm9[10],xmm14[11],xmm9[11],xmm14[12],xmm9[12],xmm14[13],xmm9[13],xmm14[14],xmm9[14],xmm14[15],xmm9[15] 1566; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,3,2,1] 1567; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] 1568; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] 1569; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,65535,65535,0,65535,65535,65535,65535] 1570; SSE-NEXT: pand %xmm5, %xmm2 1571; SSE-NEXT: pandn %xmm0, %xmm5 1572; SSE-NEXT: por %xmm2, %xmm5 1573; SSE-NEXT: pand %xmm3, %xmm1 1574; SSE-NEXT: packuswb %xmm5, %xmm5 1575; SSE-NEXT: pandn %xmm5, %xmm3 1576; SSE-NEXT: por %xmm1, %xmm3 1577; SSE-NEXT: movdqa %xmm13, %xmm0 1578; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] 1579; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] 1580; 
SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,5] 1581; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3],xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] 1582; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,2,0,3] 1583; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7] 1584; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,0,65535,65535,0] 1585; SSE-NEXT: pand %xmm2, %xmm1 1586; SSE-NEXT: pandn %xmm0, %xmm2 1587; SSE-NEXT: por %xmm1, %xmm2 1588; SSE-NEXT: pand %xmm11, %xmm3 1589; SSE-NEXT: packuswb %xmm2, %xmm0 1590; SSE-NEXT: pandn %xmm0, %xmm11 1591; SSE-NEXT: por %xmm3, %xmm11 1592; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 1593; SSE-NEXT: movdqa %xmm7, %xmm0 1594; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] 1595; SSE-NEXT: pand %xmm5, %xmm0 1596; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,2,3] 1597; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] 1598; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] 1599; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] 1600; SSE-NEXT: packuswb %xmm1, %xmm0 1601; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] 1602; SSE-NEXT: movdqa %xmm3, %xmm1 1603; SSE-NEXT: pandn %xmm0, %xmm1 1604; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 1605; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,1,2,0] 1606; SSE-NEXT: pand %xmm5, %xmm0 1607; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] 1608; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] 1609; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[2,1,0,3,4,5,6,7] 1610; SSE-NEXT: packuswb %xmm2, %xmm2 1611; SSE-NEXT: pand %xmm3, %xmm2 1612; SSE-NEXT: por %xmm1, %xmm2 1613; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,0,65535,65535,0] 1614; SSE-NEXT: movdqa %xmm6, %xmm1 1615; SSE-NEXT: pand %xmm13, %xmm1 1616; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload 1617; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0] 1618; SSE-NEXT: pand %xmm0, %xmm2 1619; SSE-NEXT: por %xmm1, %xmm13 1620; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,2,1,3] 1621; SSE-NEXT: pand %xmm5, %xmm1 1622; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] 1623; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] 1624; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] 1625; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] 1626; SSE-NEXT: packuswb %xmm1, %xmm1 1627; SSE-NEXT: movdqa %xmm0, %xmm6 1628; SSE-NEXT: pandn %xmm1, %xmm6 1629; SSE-NEXT: por %xmm2, %xmm6 1630; SSE-NEXT: movdqa %xmm7, %xmm1 1631; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] 1632; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] 1633; SSE-NEXT: movdqa %xmm7, %xmm2 1634; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm1[0,0] 1635; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm1[2,3] 1636; SSE-NEXT: psrlq $48, %xmm1 1637; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1638; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 1639; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[3,1,2,3,4,5,6,7] 1640; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] 1641; SSE-NEXT: pshufhw 
{{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,7] 1642; SSE-NEXT: packuswb %xmm2, %xmm1 1643; SSE-NEXT: movdqa %xmm4, %xmm2 1644; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] 1645; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] 1646; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,5,5,5,5] 1647; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,0,65535,65535,65535] 1648; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] 1649; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7] 1650; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 1651; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,1,2,4,5,6,7] 1652; SSE-NEXT: pand %xmm2, %xmm4 1653; SSE-NEXT: pandn %xmm5, %xmm2 1654; SSE-NEXT: por %xmm4, %xmm2 1655; SSE-NEXT: packuswb %xmm2, %xmm2 1656; SSE-NEXT: pand %xmm3, %xmm2 1657; SSE-NEXT: pandn %xmm1, %xmm3 1658; SSE-NEXT: por %xmm3, %xmm2 1659; SSE-NEXT: movdqa %xmm13, %xmm1 1660; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] 1661; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,1,1] 1662; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] 1663; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15] 1664; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,0,65535,0,0] 1665; SSE-NEXT: pand %xmm3, %xmm1 1666; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm13[0,1,2,3,7,5,6,7] 1667; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] 1668; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,7,4] 1669; SSE-NEXT: pandn %xmm4, %xmm3 1670; SSE-NEXT: por %xmm1, %xmm3 1671; SSE-NEXT: pand %xmm0, %xmm2 1672; SSE-NEXT: packuswb %xmm3, %xmm1 1673; SSE-NEXT: pandn %xmm1, %xmm0 1674; SSE-NEXT: por %xmm2, %xmm0 1675; SSE-NEXT: movdqa %xmm12, (%rsi) 1676; SSE-NEXT: movdqa %xmm10, (%rdx) 1677; SSE-NEXT: movdqa %xmm8, (%rcx) 1678; SSE-NEXT: movdqa %xmm11, (%r8) 1679; SSE-NEXT: movdqa %xmm6, (%r9) 1680; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 1681; SSE-NEXT: movdqa %xmm0, (%rax) 1682; SSE-NEXT: retq 1683; 1684; AVX-LABEL: load_i8_stride6_vf16: 1685; AVX: # %bb.0: 1686; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 1687; AVX-NEXT: vmovdqa (%rdi), %xmm1 1688; AVX-NEXT: vmovdqa 16(%rdi), %xmm2 1689; AVX-NEXT: vmovdqa 32(%rdi), %xmm0 1690; AVX-NEXT: vmovdqa 48(%rdi), %xmm3 1691; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,4,10,u,u,u,u,u,u,u,u,u,u,u,u] 1692; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,u,u,0,6,12,u,u,u,u,u,u,u,u,u] 1693; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] 1694; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm2[2,8,14,u,u,u,u,u,u,u,u,u,u] 1695; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u] 1696; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5 1697; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0,1,2],xmm4[3,4,5],xmm5[6,7] 1698; AVX-NEXT: vmovdqa 80(%rdi), %xmm4 1699; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[4,10] 1700; AVX-NEXT: vmovdqa 64(%rdi), %xmm5 1701; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero 1702; AVX-NEXT: vpor %xmm7, %xmm8, %xmm7 1703; AVX-NEXT: vpmovsxdq {{.*#+}} xmm9 = [18446744073709551615,16777215] 1704; 
AVX-NEXT: vpblendvb %xmm9, %xmm6, %xmm7, %xmm6 1705; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,5,11,u,u,u,u,u,u,u,u,u,u,u,u] 1706; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[u,u,u,u,1,7,13,u,u,u,u,u,u,u,u,u] 1707; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] 1708; AVX-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm2[3,9,15,u,u,u,u,u,u,u,u,u,u] 1709; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u] 1710; AVX-NEXT: vpor %xmm8, %xmm10, %xmm8 1711; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4,5],xmm8[6,7] 1712; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[5,11] 1713; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero 1714; AVX-NEXT: vpor %xmm8, %xmm10, %xmm8 1715; AVX-NEXT: vpblendvb %xmm9, %xmm7, %xmm8, %xmm7 1716; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[2,8,14,u,u,u,u,u,u,u,u,u,u,u,u,u] 1717; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,0,6,12,u,u,u,u,u,u,u,u] 1718; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm10[0],xmm8[0] 1719; AVX-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u,u,u,u] 1720; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[2,8,14],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] 1721; AVX-NEXT: vpor %xmm10, %xmm11, %xmm10 1722; AVX-NEXT: vmovdqa {{.*#+}} xmm11 = [0,0,0,0,0,255,255,255,255,255,255,u,u,u,u,u] 1723; AVX-NEXT: vpblendvb %xmm11, %xmm8, %xmm10, %xmm8 1724; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero 1725; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm4[0,6,12] 1726; AVX-NEXT: vpor %xmm10, %xmm12, %xmm10 1727; AVX-NEXT: vpblendvb %xmm9, %xmm8, %xmm10, %xmm8 1728; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[3,9,15,u,u,u,u,u,u,u,u,u,u,u,u,u] 1729; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,1,7,13,u,u,u,u,u,u,u,u] 1730; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm12[0],xmm10[0] 1731; AVX-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u,u,u,u] 1732; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm1[3,9,15],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] 1733; AVX-NEXT: vpor %xmm12, %xmm13, %xmm12 1734; AVX-NEXT: vpblendvb %xmm11, %xmm10, %xmm12, %xmm10 1735; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm5[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero 1736; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm4[1,7,13] 1737; AVX-NEXT: vpor %xmm11, %xmm12, %xmm11 1738; AVX-NEXT: vpblendvb %xmm9, %xmm10, %xmm11, %xmm9 1739; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[4,10],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] 1740; AVX-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u,u,u,u] 1741; AVX-NEXT: vpor %xmm10, %xmm11, %xmm10 1742; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u] 1743; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14] 1744; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm12[1],xmm11[1] 1745; AVX-NEXT: vmovq {{.*#+}} xmm12 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] 1746; AVX-NEXT: vpblendvb %xmm12, %xmm10, %xmm11, %xmm10 1747; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[2,8,14] 1748; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,u,u,u,u,0,6,12],zero,zero,zero 1749; AVX-NEXT: vpor %xmm11, %xmm13, %xmm11 1750; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm11[5,6,7] 1751; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,11],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] 1752; AVX-NEXT: vpshufb {{.*#+}} xmm2 = 
zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u,u,u,u] 1753; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 1754; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,5,11,u,u,u,u,u,u] 1755; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,3,9,15] 1756; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1] 1757; AVX-NEXT: vpblendvb %xmm12, %xmm1, %xmm0, %xmm0 1758; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[3,9,15] 1759; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,u,u,u,u,u,u,1,7,13],zero,zero,zero 1760; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 1761; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] 1762; AVX-NEXT: vmovdqa %xmm6, (%rsi) 1763; AVX-NEXT: vmovdqa %xmm7, (%rdx) 1764; AVX-NEXT: vmovdqa %xmm8, (%rcx) 1765; AVX-NEXT: vmovdqa %xmm9, (%r8) 1766; AVX-NEXT: vmovdqa %xmm10, (%r9) 1767; AVX-NEXT: vmovdqa %xmm0, (%rax) 1768; AVX-NEXT: retq 1769; 1770; AVX2-LABEL: load_i8_stride6_vf16: 1771; AVX2: # %bb.0: 1772; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 1773; AVX2-NEXT: vmovdqa (%rdi), %ymm3 1774; AVX2-NEXT: vmovdqa 32(%rdi), %ymm4 1775; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] 1776; AVX2-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm5 1777; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[0,6,12],zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u] 1778; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 1779; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm6[2,8,14],zero,zero,xmm6[0,6,12,u,u,u,u,u] 1780; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm2 1781; AVX2-NEXT: vmovdqa 80(%rdi), %xmm0 1782; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[4,10] 1783; AVX2-NEXT: vmovdqa 64(%rdi), %xmm1 1784; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero 1785; AVX2-NEXT: vpor %xmm7, %xmm8, %xmm7 1786; AVX2-NEXT: vpmovsxdq {{.*#+}} xmm8 = [18446744073709551615,16777215] 1787; AVX2-NEXT: vpblendvb %xmm8, %xmm2, %xmm7, %xmm2 1788; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u] 1789; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,9,15],zero,zero,xmm6[1,7,13,u,u,u,u,u] 1790; AVX2-NEXT: vpor %xmm5, %xmm6, %xmm5 1791; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,11] 1792; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero 1793; AVX2-NEXT: vpor %xmm6, %xmm7, %xmm6 1794; AVX2-NEXT: vpblendvb %xmm8, %xmm5, %xmm6, %xmm5 1795; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] 1796; AVX2-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm6 1797; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7 1798; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[2,8,14,u,u,u,u,u] 1799; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[2,8,14],zero,zero,xmm6[0,6,12],zero,zero,zero,xmm6[u,u,u,u,u] 1800; AVX2-NEXT: vpor %xmm9, %xmm10, %xmm9 1801; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero 1802; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,6,12] 1803; AVX2-NEXT: vpor %xmm10, %xmm11, %xmm10 1804; AVX2-NEXT: vpblendvb %xmm8, %xmm9, %xmm10, %xmm9 1805; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[3,9,15,u,u,u,u,u] 1806; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[3,9,15],zero,zero,xmm6[1,7,13],zero,zero,zero,xmm6[u,u,u,u,u] 1807; AVX2-NEXT: vpor %xmm7, %xmm6, %xmm6 1808; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = 
xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero 1809; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,7,13] 1810; AVX2-NEXT: vpor %xmm7, %xmm10, %xmm7 1811; AVX2-NEXT: vpblendvb %xmm8, %xmm6, %xmm7, %xmm6 1812; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] 1813; AVX2-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3 1814; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 1815; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[4,10,u,u,u,u,u,u] 1816; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[4,10],zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u] 1817; AVX2-NEXT: vpor %xmm7, %xmm8, %xmm7 1818; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[2,8,14] 1819; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,u,u,u,u,0,6,12],zero,zero,zero 1820; AVX2-NEXT: vpor %xmm8, %xmm10, %xmm8 1821; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm8[5,6,7] 1822; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,7,13],zero,zero,zero,xmm4[5,11,u,u,u,u,u,u] 1823; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[5,11],zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u] 1824; AVX2-NEXT: vpor %xmm4, %xmm3, %xmm3 1825; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[3,9,15] 1826; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,1,7,13],zero,zero,zero 1827; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 1828; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7] 1829; AVX2-NEXT: vmovdqa %xmm2, (%rsi) 1830; AVX2-NEXT: vmovdqa %xmm5, (%rdx) 1831; AVX2-NEXT: vmovdqa %xmm9, (%rcx) 1832; AVX2-NEXT: vmovdqa %xmm6, (%r8) 1833; AVX2-NEXT: vmovdqa %xmm7, (%r9) 1834; AVX2-NEXT: vmovdqa %xmm0, (%rax) 1835; AVX2-NEXT: vzeroupper 1836; AVX2-NEXT: retq 1837; 1838; AVX2-FP-LABEL: load_i8_stride6_vf16: 1839; AVX2-FP: # %bb.0: 1840; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1841; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm3 1842; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm4 1843; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] 1844; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm5 1845; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[0,6,12],zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u] 1846; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm6 1847; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm6[2,8,14],zero,zero,xmm6[0,6,12,u,u,u,u,u] 1848; AVX2-FP-NEXT: vpor %xmm0, %xmm1, %xmm2 1849; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm0 1850; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[4,10] 1851; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm1 1852; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero 1853; AVX2-FP-NEXT: vpor %xmm7, %xmm8, %xmm7 1854; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} xmm8 = [18446744073709551615,16777215] 1855; AVX2-FP-NEXT: vpblendvb %xmm8, %xmm2, %xmm7, %xmm2 1856; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u] 1857; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,9,15],zero,zero,xmm6[1,7,13,u,u,u,u,u] 1858; AVX2-FP-NEXT: vpor %xmm5, %xmm6, %xmm5 1859; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,11] 1860; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero 1861; AVX2-FP-NEXT: vpor %xmm6, %xmm7, %xmm6 1862; AVX2-FP-NEXT: vpblendvb %xmm8, %xmm5, %xmm6, %xmm5 1863; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = 
[65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] 1864; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm6 1865; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm7 1866; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[2,8,14,u,u,u,u,u] 1867; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[2,8,14],zero,zero,xmm6[0,6,12],zero,zero,zero,xmm6[u,u,u,u,u] 1868; AVX2-FP-NEXT: vpor %xmm9, %xmm10, %xmm9 1869; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero 1870; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,6,12] 1871; AVX2-FP-NEXT: vpor %xmm10, %xmm11, %xmm10 1872; AVX2-FP-NEXT: vpblendvb %xmm8, %xmm9, %xmm10, %xmm9 1873; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[3,9,15,u,u,u,u,u] 1874; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[3,9,15],zero,zero,xmm6[1,7,13],zero,zero,zero,xmm6[u,u,u,u,u] 1875; AVX2-FP-NEXT: vpor %xmm7, %xmm6, %xmm6 1876; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero 1877; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,7,13] 1878; AVX2-FP-NEXT: vpor %xmm7, %xmm10, %xmm7 1879; AVX2-FP-NEXT: vpblendvb %xmm8, %xmm6, %xmm7, %xmm6 1880; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] 1881; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3 1882; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4 1883; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[4,10,u,u,u,u,u,u] 1884; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[4,10],zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u] 1885; AVX2-FP-NEXT: vpor %xmm7, %xmm8, %xmm7 1886; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[2,8,14] 1887; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,u,u,u,u,0,6,12],zero,zero,zero 1888; AVX2-FP-NEXT: vpor %xmm8, %xmm10, %xmm8 1889; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm8[5,6,7] 1890; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,7,13],zero,zero,zero,xmm4[5,11,u,u,u,u,u,u] 1891; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[5,11],zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u] 1892; AVX2-FP-NEXT: vpor %xmm4, %xmm3, %xmm3 1893; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[3,9,15] 1894; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,1,7,13],zero,zero,zero 1895; AVX2-FP-NEXT: vpor %xmm0, %xmm1, %xmm0 1896; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7] 1897; AVX2-FP-NEXT: vmovdqa %xmm2, (%rsi) 1898; AVX2-FP-NEXT: vmovdqa %xmm5, (%rdx) 1899; AVX2-FP-NEXT: vmovdqa %xmm9, (%rcx) 1900; AVX2-FP-NEXT: vmovdqa %xmm6, (%r8) 1901; AVX2-FP-NEXT: vmovdqa %xmm7, (%r9) 1902; AVX2-FP-NEXT: vmovdqa %xmm0, (%rax) 1903; AVX2-FP-NEXT: vzeroupper 1904; AVX2-FP-NEXT: retq 1905; 1906; AVX2-FCP-LABEL: load_i8_stride6_vf16: 1907; AVX2-FCP: # %bb.0: 1908; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 1909; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm3 1910; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm4 1911; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] 1912; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm5 1913; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[0,6,12],zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u] 1914; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 1915; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = 
zero,zero,zero,xmm6[2,8,14],zero,zero,xmm6[0,6,12,u,u,u,u,u] 1916; AVX2-FCP-NEXT: vpor %xmm0, %xmm1, %xmm2 1917; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm0 1918; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[4,10] 1919; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm1 1920; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero 1921; AVX2-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 1922; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} xmm8 = [18446744073709551615,16777215] 1923; AVX2-FCP-NEXT: vpblendvb %xmm8, %xmm2, %xmm7, %xmm2 1924; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u] 1925; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,9,15],zero,zero,xmm6[1,7,13,u,u,u,u,u] 1926; AVX2-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 1927; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,11] 1928; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero 1929; AVX2-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 1930; AVX2-FCP-NEXT: vpblendvb %xmm8, %xmm5, %xmm6, %xmm5 1931; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] 1932; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm6 1933; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 1934; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[2,8,14,u,u,u,u,u] 1935; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[2,8,14],zero,zero,xmm6[0,6,12],zero,zero,zero,xmm6[u,u,u,u,u] 1936; AVX2-FCP-NEXT: vpor %xmm9, %xmm10, %xmm9 1937; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero 1938; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,6,12] 1939; AVX2-FCP-NEXT: vpor %xmm10, %xmm11, %xmm10 1940; AVX2-FCP-NEXT: vpblendvb %xmm8, %xmm9, %xmm10, %xmm9 1941; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[3,9,15,u,u,u,u,u] 1942; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[3,9,15],zero,zero,xmm6[1,7,13],zero,zero,zero,xmm6[u,u,u,u,u] 1943; AVX2-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 1944; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero 1945; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,7,13] 1946; AVX2-FCP-NEXT: vpor %xmm7, %xmm10, %xmm7 1947; AVX2-FCP-NEXT: vpblendvb %xmm8, %xmm6, %xmm7, %xmm6 1948; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] 1949; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3 1950; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 1951; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[4,10,u,u,u,u,u,u] 1952; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[4,10],zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u] 1953; AVX2-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 1954; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[2,8,14] 1955; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,u,u,u,u,0,6,12],zero,zero,zero 1956; AVX2-FCP-NEXT: vpor %xmm8, %xmm10, %xmm8 1957; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm8[5,6,7] 1958; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,7,13],zero,zero,zero,xmm4[5,11,u,u,u,u,u,u] 1959; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[5,11],zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u] 1960; AVX2-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 1961; AVX2-FCP-NEXT: vpshufb 
{{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[3,9,15] 1962; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,1,7,13],zero,zero,zero 1963; AVX2-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 1964; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7] 1965; AVX2-FCP-NEXT: vmovdqa %xmm2, (%rsi) 1966; AVX2-FCP-NEXT: vmovdqa %xmm5, (%rdx) 1967; AVX2-FCP-NEXT: vmovdqa %xmm9, (%rcx) 1968; AVX2-FCP-NEXT: vmovdqa %xmm6, (%r8) 1969; AVX2-FCP-NEXT: vmovdqa %xmm7, (%r9) 1970; AVX2-FCP-NEXT: vmovdqa %xmm0, (%rax) 1971; AVX2-FCP-NEXT: vzeroupper 1972; AVX2-FCP-NEXT: retq 1973; 1974; AVX512-LABEL: load_i8_stride6_vf16: 1975; AVX512: # %bb.0: 1976; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 1977; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] 1978; AVX512-NEXT: vmovdqa (%rdi), %ymm3 1979; AVX512-NEXT: vmovdqa 32(%rdi), %ymm4 1980; AVX512-NEXT: vmovdqa %ymm0, %ymm5 1981; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm4 ^ (ymm5 & (ymm3 ^ ymm4)) 1982; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[0,6,12],zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u] 1983; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6 1984; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,8,14],zero,zero,xmm6[0,6,12,u,u,u,u,u] 1985; AVX512-NEXT: vpor %xmm1, %xmm2, %xmm7 1986; AVX512-NEXT: vmovdqa 80(%rdi), %xmm2 1987; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[4,10] 1988; AVX512-NEXT: vmovdqa 64(%rdi), %xmm1 1989; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero 1990; AVX512-NEXT: vpor %xmm8, %xmm9, %xmm8 1991; AVX512-NEXT: vpmovsxdq {{.*#+}} xmm9 = [18446744073709551615,16777215] 1992; AVX512-NEXT: vpternlogq {{.*#+}} xmm8 = xmm8 ^ (xmm9 & (xmm8 ^ xmm7)) 1993; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u] 1994; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,9,15],zero,zero,xmm6[1,7,13,u,u,u,u,u] 1995; AVX512-NEXT: vpor %xmm5, %xmm6, %xmm5 1996; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,11] 1997; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero 1998; AVX512-NEXT: vpor %xmm6, %xmm7, %xmm6 1999; AVX512-NEXT: vpternlogq {{.*#+}} xmm6 = xmm6 ^ (xmm9 & (xmm6 ^ xmm5)) 2000; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero 2001; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,6,12] 2002; AVX512-NEXT: vpor %xmm5, %xmm7, %xmm5 2003; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] 2004; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm3 ^ (ymm7 & (ymm4 ^ ymm3)) 2005; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm10 2006; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm10[4,10],zero,zero,zero,xmm10[2,8,14,u,u,u,u,u] 2007; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[2,8,14],zero,zero,xmm7[0,6,12],zero,zero,zero,xmm7[u,u,u,u,u] 2008; AVX512-NEXT: vpor %xmm11, %xmm12, %xmm11 2009; AVX512-NEXT: vpternlogq {{.*#+}} xmm11 = xmm5 ^ (xmm9 & (xmm11 ^ xmm5)) 2010; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero 2011; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,7,13] 2012; AVX512-NEXT: vpor %xmm5, %xmm12, %xmm5 2013; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[5,11],zero,zero,zero,xmm10[3,9,15,u,u,u,u,u] 2014; 
AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,9,15],zero,zero,xmm7[1,7,13],zero,zero,zero,xmm7[u,u,u,u,u] 2015; AVX512-NEXT: vpor %xmm7, %xmm10, %xmm7 2016; AVX512-NEXT: vpternlogq {{.*#+}} xmm7 = xmm5 ^ (xmm9 & (xmm7 ^ xmm5)) 2017; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,0,6,12,128,128,128,4,10,128,128,128,2,8,14] 2018; AVX512-NEXT: vpshufb %xmm5, %xmm2, %xmm9 2019; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm3 ^ (ymm0 & (ymm4 ^ ymm3)) 2020; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3 2021; AVX512-NEXT: vpshufb %xmm5, %xmm3, %xmm4 2022; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm9[5,6,7] 2023; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [4,10,128,128,128,2,8,14,128,128,0,6,12,128,128,128] 2024; AVX512-NEXT: vpshufb %xmm5, %xmm1, %xmm9 2025; AVX512-NEXT: vpshufb %xmm5, %xmm0, %xmm5 2026; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm9[5,6,7] 2027; AVX512-NEXT: vpor %xmm4, %xmm5, %xmm4 2028; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,1,7,13,128,128,128,5,11,128,128,128,3,9,15] 2029; AVX512-NEXT: vpshufb %xmm5, %xmm2, %xmm2 2030; AVX512-NEXT: vpshufb %xmm5, %xmm3, %xmm3 2031; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7] 2032; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [5,11,128,128,128,3,9,15,128,128,1,7,13,128,128,128] 2033; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1 2034; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0 2035; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] 2036; AVX512-NEXT: vpor %xmm2, %xmm0, %xmm0 2037; AVX512-NEXT: vmovdqa %xmm8, (%rsi) 2038; AVX512-NEXT: vmovdqa %xmm6, (%rdx) 2039; AVX512-NEXT: vmovdqa %xmm11, (%rcx) 2040; AVX512-NEXT: vmovdqa %xmm7, (%r8) 2041; AVX512-NEXT: vmovdqa %xmm4, (%r9) 2042; AVX512-NEXT: vmovdqa %xmm0, (%rax) 2043; AVX512-NEXT: vzeroupper 2044; AVX512-NEXT: retq 2045; 2046; AVX512-FCP-LABEL: load_i8_stride6_vf16: 2047; AVX512-FCP: # %bb.0: 2048; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 2049; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] 2050; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm3 2051; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm4 2052; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm5 2053; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm4 ^ (ymm5 & (ymm3 ^ ymm4)) 2054; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[0,6,12],zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u] 2055; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 2056; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,8,14],zero,zero,xmm6[0,6,12,u,u,u,u,u] 2057; AVX512-FCP-NEXT: vpor %xmm1, %xmm2, %xmm7 2058; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm2 2059; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[4,10] 2060; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm1 2061; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero 2062; AVX512-FCP-NEXT: vpor %xmm8, %xmm9, %xmm8 2063; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} xmm9 = [18446744073709551615,16777215] 2064; AVX512-FCP-NEXT: vpternlogq {{.*#+}} xmm8 = xmm8 ^ (xmm9 & (xmm8 ^ xmm7)) 2065; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u] 2066; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,9,15],zero,zero,xmm6[1,7,13,u,u,u,u,u] 2067; AVX512-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 2068; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,11] 2069; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero 2070; 
AVX512-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 2071; AVX512-FCP-NEXT: vpternlogq {{.*#+}} xmm6 = xmm6 ^ (xmm9 & (xmm6 ^ xmm5)) 2072; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero 2073; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,6,12] 2074; AVX512-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 2075; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] 2076; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm3 ^ (ymm7 & (ymm4 ^ ymm3)) 2077; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm10 2078; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm10[4,10],zero,zero,zero,xmm10[2,8,14,u,u,u,u,u] 2079; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[2,8,14],zero,zero,xmm7[0,6,12],zero,zero,zero,xmm7[u,u,u,u,u] 2080; AVX512-FCP-NEXT: vpor %xmm11, %xmm12, %xmm11 2081; AVX512-FCP-NEXT: vpternlogq {{.*#+}} xmm11 = xmm5 ^ (xmm9 & (xmm11 ^ xmm5)) 2082; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero 2083; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,7,13] 2084; AVX512-FCP-NEXT: vpor %xmm5, %xmm12, %xmm5 2085; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[5,11],zero,zero,zero,xmm10[3,9,15,u,u,u,u,u] 2086; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,9,15],zero,zero,xmm7[1,7,13],zero,zero,zero,xmm7[u,u,u,u,u] 2087; AVX512-FCP-NEXT: vpor %xmm7, %xmm10, %xmm7 2088; AVX512-FCP-NEXT: vpternlogq {{.*#+}} xmm7 = xmm5 ^ (xmm9 & (xmm7 ^ xmm5)) 2089; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,0,6,12,128,128,128,4,10,128,128,128,2,8,14] 2090; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm9 2091; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm3 ^ (ymm0 & (ymm4 ^ ymm3)) 2092; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3 2093; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm4 2094; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm9[5,6,7] 2095; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [4,10,128,128,128,2,8,14,128,128,0,6,12,128,128,128] 2096; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm9 2097; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm5 2098; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm9[5,6,7] 2099; AVX512-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 2100; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,1,7,13,128,128,128,5,11,128,128,128,3,9,15] 2101; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm2 2102; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm3 2103; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7] 2104; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [5,11,128,128,128,3,9,15,128,128,1,7,13,128,128,128] 2105; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 2106; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0 2107; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] 2108; AVX512-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0 2109; AVX512-FCP-NEXT: vmovdqa %xmm8, (%rsi) 2110; AVX512-FCP-NEXT: vmovdqa %xmm6, (%rdx) 2111; AVX512-FCP-NEXT: vmovdqa %xmm11, (%rcx) 2112; AVX512-FCP-NEXT: vmovdqa %xmm7, (%r8) 2113; AVX512-FCP-NEXT: vmovdqa %xmm4, (%r9) 2114; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rax) 2115; AVX512-FCP-NEXT: vzeroupper 2116; AVX512-FCP-NEXT: retq 2117; 2118; AVX512DQ-LABEL: load_i8_stride6_vf16: 2119; AVX512DQ: # %bb.0: 2120; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 2121; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] 2122; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm3 2123; 
AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm4 2124; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm5 2125; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm4 ^ (ymm5 & (ymm3 ^ ymm4)) 2126; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[0,6,12],zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u] 2127; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6 2128; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,8,14],zero,zero,xmm6[0,6,12,u,u,u,u,u] 2129; AVX512DQ-NEXT: vpor %xmm1, %xmm2, %xmm7 2130; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm2 2131; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[4,10] 2132; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm1 2133; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero 2134; AVX512DQ-NEXT: vpor %xmm8, %xmm9, %xmm8 2135; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} xmm9 = [18446744073709551615,16777215] 2136; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm8 = xmm8 ^ (xmm9 & (xmm8 ^ xmm7)) 2137; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u] 2138; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,9,15],zero,zero,xmm6[1,7,13,u,u,u,u,u] 2139; AVX512DQ-NEXT: vpor %xmm5, %xmm6, %xmm5 2140; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,11] 2141; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero 2142; AVX512DQ-NEXT: vpor %xmm6, %xmm7, %xmm6 2143; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm6 = xmm6 ^ (xmm9 & (xmm6 ^ xmm5)) 2144; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero 2145; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,6,12] 2146; AVX512DQ-NEXT: vpor %xmm5, %xmm7, %xmm5 2147; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] 2148; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ymm3 ^ (ymm7 & (ymm4 ^ ymm3)) 2149; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm10 2150; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm10[4,10],zero,zero,zero,xmm10[2,8,14,u,u,u,u,u] 2151; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[2,8,14],zero,zero,xmm7[0,6,12],zero,zero,zero,xmm7[u,u,u,u,u] 2152; AVX512DQ-NEXT: vpor %xmm11, %xmm12, %xmm11 2153; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm11 = xmm5 ^ (xmm9 & (xmm11 ^ xmm5)) 2154; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero 2155; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,7,13] 2156; AVX512DQ-NEXT: vpor %xmm5, %xmm12, %xmm5 2157; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[5,11],zero,zero,zero,xmm10[3,9,15,u,u,u,u,u] 2158; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,9,15],zero,zero,xmm7[1,7,13],zero,zero,zero,xmm7[u,u,u,u,u] 2159; AVX512DQ-NEXT: vpor %xmm7, %xmm10, %xmm7 2160; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm7 = xmm5 ^ (xmm9 & (xmm7 ^ xmm5)) 2161; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,0,6,12,128,128,128,4,10,128,128,128,2,8,14] 2162; AVX512DQ-NEXT: vpshufb %xmm5, %xmm2, %xmm9 2163; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm3 ^ (ymm0 & (ymm4 ^ ymm3)) 2164; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm3 2165; AVX512DQ-NEXT: vpshufb %xmm5, %xmm3, %xmm4 2166; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm9[5,6,7] 2167; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [4,10,128,128,128,2,8,14,128,128,0,6,12,128,128,128] 2168; AVX512DQ-NEXT: vpshufb %xmm5, %xmm1, %xmm9 2169; AVX512DQ-NEXT: vpshufb %xmm5, 
%xmm0, %xmm5 2170; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm9[5,6,7] 2171; AVX512DQ-NEXT: vpor %xmm4, %xmm5, %xmm4 2172; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,1,7,13,128,128,128,5,11,128,128,128,3,9,15] 2173; AVX512DQ-NEXT: vpshufb %xmm5, %xmm2, %xmm2 2174; AVX512DQ-NEXT: vpshufb %xmm5, %xmm3, %xmm3 2175; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7] 2176; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [5,11,128,128,128,3,9,15,128,128,1,7,13,128,128,128] 2177; AVX512DQ-NEXT: vpshufb %xmm3, %xmm1, %xmm1 2178; AVX512DQ-NEXT: vpshufb %xmm3, %xmm0, %xmm0 2179; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] 2180; AVX512DQ-NEXT: vpor %xmm2, %xmm0, %xmm0 2181; AVX512DQ-NEXT: vmovdqa %xmm8, (%rsi) 2182; AVX512DQ-NEXT: vmovdqa %xmm6, (%rdx) 2183; AVX512DQ-NEXT: vmovdqa %xmm11, (%rcx) 2184; AVX512DQ-NEXT: vmovdqa %xmm7, (%r8) 2185; AVX512DQ-NEXT: vmovdqa %xmm4, (%r9) 2186; AVX512DQ-NEXT: vmovdqa %xmm0, (%rax) 2187; AVX512DQ-NEXT: vzeroupper 2188; AVX512DQ-NEXT: retq 2189; 2190; AVX512DQ-FCP-LABEL: load_i8_stride6_vf16: 2191; AVX512DQ-FCP: # %bb.0: 2192; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 2193; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] 2194; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm3 2195; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm4 2196; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm5 2197; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm4 ^ (ymm5 & (ymm3 ^ ymm4)) 2198; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[0,6,12],zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u] 2199; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 2200; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,8,14],zero,zero,xmm6[0,6,12,u,u,u,u,u] 2201; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm2, %xmm7 2202; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm2 2203; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[4,10] 2204; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm1 2205; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero 2206; AVX512DQ-FCP-NEXT: vpor %xmm8, %xmm9, %xmm8 2207; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} xmm9 = [18446744073709551615,16777215] 2208; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} xmm8 = xmm8 ^ (xmm9 & (xmm8 ^ xmm7)) 2209; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u] 2210; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,9,15],zero,zero,xmm6[1,7,13,u,u,u,u,u] 2211; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 2212; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,11] 2213; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero 2214; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 2215; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} xmm6 = xmm6 ^ (xmm9 & (xmm6 ^ xmm5)) 2216; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero 2217; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,6,12] 2218; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 2219; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] 2220; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm3 ^ (ymm7 & (ymm4 ^ ymm3)) 2221; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm10 2222; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = 
zero,zero,zero,xmm10[4,10],zero,zero,zero,xmm10[2,8,14,u,u,u,u,u] 2223; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[2,8,14],zero,zero,xmm7[0,6,12],zero,zero,zero,xmm7[u,u,u,u,u] 2224; AVX512DQ-FCP-NEXT: vpor %xmm11, %xmm12, %xmm11 2225; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} xmm11 = xmm5 ^ (xmm9 & (xmm11 ^ xmm5)) 2226; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero 2227; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,7,13] 2228; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm12, %xmm5 2229; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[5,11],zero,zero,zero,xmm10[3,9,15,u,u,u,u,u] 2230; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,9,15],zero,zero,xmm7[1,7,13],zero,zero,zero,xmm7[u,u,u,u,u] 2231; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm10, %xmm7 2232; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} xmm7 = xmm5 ^ (xmm9 & (xmm7 ^ xmm5)) 2233; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,0,6,12,128,128,128,4,10,128,128,128,2,8,14] 2234; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm9 2235; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm3 ^ (ymm0 & (ymm4 ^ ymm3)) 2236; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3 2237; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm4 2238; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm9[5,6,7] 2239; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [4,10,128,128,128,2,8,14,128,128,0,6,12,128,128,128] 2240; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm9 2241; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm5 2242; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm9[5,6,7] 2243; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 2244; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,1,7,13,128,128,128,5,11,128,128,128,3,9,15] 2245; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm2 2246; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm3 2247; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7] 2248; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [5,11,128,128,128,3,9,15,128,128,1,7,13,128,128,128] 2249; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 2250; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0 2251; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] 2252; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0 2253; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, (%rsi) 2254; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, (%rdx) 2255; AVX512DQ-FCP-NEXT: vmovdqa %xmm11, (%rcx) 2256; AVX512DQ-FCP-NEXT: vmovdqa %xmm7, (%r8) 2257; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, (%r9) 2258; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rax) 2259; AVX512DQ-FCP-NEXT: vzeroupper 2260; AVX512DQ-FCP-NEXT: retq 2261; 2262; AVX512BW-LABEL: load_i8_stride6_vf16: 2263; AVX512BW: # %bb.0: 2264; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 2265; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 2266; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm0 2267; AVX512BW-NEXT: movw $18724, %r10w # imm = 0x4924 2268; AVX512BW-NEXT: kmovd %r10d, %k1 2269; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm2 {%k1} 2270; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10],zero,zero,zero,xmm2[u,u,u,u,u] 2271; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm4 2272; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm4[2,8,14],zero,zero,xmm4[0,6,12,u,u,u,u,u] 2273; AVX512BW-NEXT: vpor %xmm3, %xmm5, %xmm3 2274; AVX512BW-NEXT: vmovdqa 80(%rdi), %xmm5 2275; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[4,10] 2276; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm7 2277; AVX512BW-NEXT: 
vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero 2278; AVX512BW-NEXT: vpor %xmm6, %xmm8, %xmm6 2279; AVX512BW-NEXT: movw $-2048, %di # imm = 0xF800 2280; AVX512BW-NEXT: kmovd %edi, %k2 2281; AVX512BW-NEXT: vmovdqu8 %xmm6, %xmm3 {%k2} 2282; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11],zero,zero,zero,xmm2[u,u,u,u,u] 2283; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[3,9,15],zero,zero,xmm4[1,7,13,u,u,u,u,u] 2284; AVX512BW-NEXT: vpor %xmm2, %xmm4, %xmm2 2285; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,11] 2286; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero 2287; AVX512BW-NEXT: vpor %xmm4, %xmm6, %xmm4 2288; AVX512BW-NEXT: vmovdqu8 %xmm4, %xmm2 {%k2} 2289; AVX512BW-NEXT: movw $9362, %di # imm = 0x2492 2290; AVX512BW-NEXT: kmovd %edi, %k3 2291; AVX512BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm4 {%k3} 2292; AVX512BW-NEXT: vextracti128 $1, %ymm4, %xmm6 2293; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14,u,u,u,u,u] 2294; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[2,8,14],zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[u,u,u,u,u] 2295; AVX512BW-NEXT: vpor %xmm8, %xmm9, %xmm8 2296; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm7[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero 2297; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,6,12] 2298; AVX512BW-NEXT: vpor %xmm9, %xmm10, %xmm9 2299; AVX512BW-NEXT: vmovdqu8 %xmm9, %xmm8 {%k2} 2300; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[5,11],zero,zero,zero,xmm6[3,9,15,u,u,u,u,u] 2301; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,9,15],zero,zero,xmm4[1,7,13],zero,zero,zero,xmm4[u,u,u,u,u] 2302; AVX512BW-NEXT: vpor %xmm6, %xmm4, %xmm4 2303; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero 2304; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,7,13] 2305; AVX512BW-NEXT: vpor %xmm6, %xmm9, %xmm6 2306; AVX512BW-NEXT: vmovdqu8 %xmm6, %xmm4 {%k2} 2307; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,0,6,12,128,128,128,4,10,128,128,128,2,8,14] 2308; AVX512BW-NEXT: vpshufb %xmm6, %xmm5, %xmm9 2309; AVX512BW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} 2310; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 2311; AVX512BW-NEXT: vpshufb %xmm6, %xmm1, %xmm6 2312; AVX512BW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm9[5,6,7] 2313; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm9 = [4,10,128,128,128,2,8,14,128,128,0,6,12,128,128,128] 2314; AVX512BW-NEXT: vpshufb %xmm9, %xmm7, %xmm10 2315; AVX512BW-NEXT: vpshufb %xmm9, %xmm0, %xmm9 2316; AVX512BW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5,6,7] 2317; AVX512BW-NEXT: vpor %xmm6, %xmm9, %xmm6 2318; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,1,7,13,128,128,128,5,11,128,128,128,3,9,15] 2319; AVX512BW-NEXT: vpshufb %xmm9, %xmm5, %xmm5 2320; AVX512BW-NEXT: vpshufb %xmm9, %xmm1, %xmm1 2321; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] 2322; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [5,11,128,128,128,3,9,15,128,128,1,7,13,128,128,128] 2323; AVX512BW-NEXT: vpshufb %xmm5, %xmm7, %xmm7 2324; AVX512BW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 2325; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm7[5,6,7] 2326; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 2327; AVX512BW-NEXT: vmovdqa %xmm3, (%rsi) 2328; AVX512BW-NEXT: vmovdqa %xmm2, (%rdx) 2329; AVX512BW-NEXT: vmovdqa %xmm8, (%rcx) 2330; AVX512BW-NEXT: vmovdqa %xmm4, (%r8) 
2331; AVX512BW-NEXT: vmovdqa %xmm6, (%r9) 2332; AVX512BW-NEXT: vmovdqa %xmm0, (%rax) 2333; AVX512BW-NEXT: vzeroupper 2334; AVX512BW-NEXT: retq 2335; 2336; AVX512BW-FCP-LABEL: load_i8_stride6_vf16: 2337; AVX512BW-FCP: # %bb.0: 2338; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 2339; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 2340; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm0 2341; AVX512BW-FCP-NEXT: movw $18724, %r10w # imm = 0x4924 2342; AVX512BW-FCP-NEXT: kmovd %r10d, %k1 2343; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm2 {%k1} 2344; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10],zero,zero,zero,xmm2[u,u,u,u,u] 2345; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 2346; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm4[2,8,14],zero,zero,xmm4[0,6,12,u,u,u,u,u] 2347; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm5, %xmm3 2348; AVX512BW-FCP-NEXT: vmovdqa 80(%rdi), %xmm5 2349; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[4,10] 2350; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm7 2351; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero 2352; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm8, %xmm6 2353; AVX512BW-FCP-NEXT: movw $-2048, %di # imm = 0xF800 2354; AVX512BW-FCP-NEXT: kmovd %edi, %k2 2355; AVX512BW-FCP-NEXT: vmovdqu8 %xmm6, %xmm3 {%k2} 2356; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11],zero,zero,zero,xmm2[u,u,u,u,u] 2357; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[3,9,15],zero,zero,xmm4[1,7,13,u,u,u,u,u] 2358; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm4, %xmm2 2359; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,11] 2360; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero 2361; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm6, %xmm4 2362; AVX512BW-FCP-NEXT: vmovdqu8 %xmm4, %xmm2 {%k2} 2363; AVX512BW-FCP-NEXT: movw $9362, %di # imm = 0x2492 2364; AVX512BW-FCP-NEXT: kmovd %edi, %k3 2365; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm4 {%k3} 2366; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm6 2367; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14,u,u,u,u,u] 2368; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[2,8,14],zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[u,u,u,u,u] 2369; AVX512BW-FCP-NEXT: vpor %xmm8, %xmm9, %xmm8 2370; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm7[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero 2371; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,6,12] 2372; AVX512BW-FCP-NEXT: vpor %xmm9, %xmm10, %xmm9 2373; AVX512BW-FCP-NEXT: vmovdqu8 %xmm9, %xmm8 {%k2} 2374; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[5,11],zero,zero,zero,xmm6[3,9,15,u,u,u,u,u] 2375; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,9,15],zero,zero,xmm4[1,7,13],zero,zero,zero,xmm4[u,u,u,u,u] 2376; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm4, %xmm4 2377; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero 2378; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,7,13] 2379; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm9, %xmm6 2380; AVX512BW-FCP-NEXT: vmovdqu8 %xmm6, %xmm4 {%k2} 2381; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,0,6,12,128,128,128,4,10,128,128,128,2,8,14] 2382; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm9 2383; AVX512BW-FCP-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} 2384; AVX512BW-FCP-NEXT: 
vextracti128 $1, %ymm0, %xmm1 2385; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm6 2386; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm9[5,6,7] 2387; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,10,128,128,128,2,8,14,128,128,0,6,12,128,128,128] 2388; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm10 2389; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm9 2390; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5,6,7] 2391; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm9, %xmm6 2392; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,1,7,13,128,128,128,5,11,128,128,128,3,9,15] 2393; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm5, %xmm5 2394; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1 2395; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] 2396; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [5,11,128,128,128,3,9,15,128,128,1,7,13,128,128,128] 2397; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm7 2398; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm0 2399; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm7[5,6,7] 2400; AVX512BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 2401; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rsi) 2402; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%rdx) 2403; AVX512BW-FCP-NEXT: vmovdqa %xmm8, (%rcx) 2404; AVX512BW-FCP-NEXT: vmovdqa %xmm4, (%r8) 2405; AVX512BW-FCP-NEXT: vmovdqa %xmm6, (%r9) 2406; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rax) 2407; AVX512BW-FCP-NEXT: vzeroupper 2408; AVX512BW-FCP-NEXT: retq 2409; 2410; AVX512DQ-BW-LABEL: load_i8_stride6_vf16: 2411; AVX512DQ-BW: # %bb.0: 2412; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 2413; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1 2414; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm0 2415; AVX512DQ-BW-NEXT: movw $18724, %r10w # imm = 0x4924 2416; AVX512DQ-BW-NEXT: kmovd %r10d, %k1 2417; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm2 {%k1} 2418; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10],zero,zero,zero,xmm2[u,u,u,u,u] 2419; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm4 2420; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm4[2,8,14],zero,zero,xmm4[0,6,12,u,u,u,u,u] 2421; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm5, %xmm3 2422; AVX512DQ-BW-NEXT: vmovdqa 80(%rdi), %xmm5 2423; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[4,10] 2424; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %xmm7 2425; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero 2426; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm8, %xmm6 2427; AVX512DQ-BW-NEXT: movw $-2048, %di # imm = 0xF800 2428; AVX512DQ-BW-NEXT: kmovd %edi, %k2 2429; AVX512DQ-BW-NEXT: vmovdqu8 %xmm6, %xmm3 {%k2} 2430; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11],zero,zero,zero,xmm2[u,u,u,u,u] 2431; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[3,9,15],zero,zero,xmm4[1,7,13,u,u,u,u,u] 2432; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm4, %xmm2 2433; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,11] 2434; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero 2435; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm6, %xmm4 2436; AVX512DQ-BW-NEXT: vmovdqu8 %xmm4, %xmm2 {%k2} 2437; AVX512DQ-BW-NEXT: movw $9362, %di # imm = 0x2492 2438; AVX512DQ-BW-NEXT: kmovd %edi, %k3 2439; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm4 {%k3} 2440; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm4, %xmm6 2441; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = 
zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14,u,u,u,u,u] 2442; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[2,8,14],zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[u,u,u,u,u] 2443; AVX512DQ-BW-NEXT: vpor %xmm8, %xmm9, %xmm8 2444; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm7[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero 2445; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,6,12] 2446; AVX512DQ-BW-NEXT: vpor %xmm9, %xmm10, %xmm9 2447; AVX512DQ-BW-NEXT: vmovdqu8 %xmm9, %xmm8 {%k2} 2448; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[5,11],zero,zero,zero,xmm6[3,9,15,u,u,u,u,u] 2449; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,9,15],zero,zero,xmm4[1,7,13],zero,zero,zero,xmm4[u,u,u,u,u] 2450; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm4, %xmm4 2451; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero 2452; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,7,13] 2453; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm9, %xmm6 2454; AVX512DQ-BW-NEXT: vmovdqu8 %xmm6, %xmm4 {%k2} 2455; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,0,6,12,128,128,128,4,10,128,128,128,2,8,14] 2456; AVX512DQ-BW-NEXT: vpshufb %xmm6, %xmm5, %xmm9 2457; AVX512DQ-BW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} 2458; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm1 2459; AVX512DQ-BW-NEXT: vpshufb %xmm6, %xmm1, %xmm6 2460; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm9[5,6,7] 2461; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm9 = [4,10,128,128,128,2,8,14,128,128,0,6,12,128,128,128] 2462; AVX512DQ-BW-NEXT: vpshufb %xmm9, %xmm7, %xmm10 2463; AVX512DQ-BW-NEXT: vpshufb %xmm9, %xmm0, %xmm9 2464; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5,6,7] 2465; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm9, %xmm6 2466; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,1,7,13,128,128,128,5,11,128,128,128,3,9,15] 2467; AVX512DQ-BW-NEXT: vpshufb %xmm9, %xmm5, %xmm5 2468; AVX512DQ-BW-NEXT: vpshufb %xmm9, %xmm1, %xmm1 2469; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] 2470; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm5 = [5,11,128,128,128,3,9,15,128,128,1,7,13,128,128,128] 2471; AVX512DQ-BW-NEXT: vpshufb %xmm5, %xmm7, %xmm7 2472; AVX512DQ-BW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 2473; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm7[5,6,7] 2474; AVX512DQ-BW-NEXT: vpor %xmm1, %xmm0, %xmm0 2475; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rsi) 2476; AVX512DQ-BW-NEXT: vmovdqa %xmm2, (%rdx) 2477; AVX512DQ-BW-NEXT: vmovdqa %xmm8, (%rcx) 2478; AVX512DQ-BW-NEXT: vmovdqa %xmm4, (%r8) 2479; AVX512DQ-BW-NEXT: vmovdqa %xmm6, (%r9) 2480; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%rax) 2481; AVX512DQ-BW-NEXT: vzeroupper 2482; AVX512DQ-BW-NEXT: retq 2483; 2484; AVX512DQ-BW-FCP-LABEL: load_i8_stride6_vf16: 2485; AVX512DQ-BW-FCP: # %bb.0: 2486; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 2487; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 2488; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm0 2489; AVX512DQ-BW-FCP-NEXT: movw $18724, %r10w # imm = 0x4924 2490; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k1 2491; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm2 {%k1} 2492; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10],zero,zero,zero,xmm2[u,u,u,u,u] 2493; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 2494; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm4[2,8,14],zero,zero,xmm4[0,6,12,u,u,u,u,u] 2495; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm5, %xmm3 2496; 
AVX512DQ-BW-FCP-NEXT: vmovdqa 80(%rdi), %xmm5 2497; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[4,10] 2498; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm7 2499; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero 2500; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm8, %xmm6 2501; AVX512DQ-BW-FCP-NEXT: movw $-2048, %di # imm = 0xF800 2502; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2 2503; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm6, %xmm3 {%k2} 2504; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11],zero,zero,zero,xmm2[u,u,u,u,u] 2505; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[3,9,15],zero,zero,xmm4[1,7,13,u,u,u,u,u] 2506; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm4, %xmm2 2507; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,11] 2508; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero 2509; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm6, %xmm4 2510; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm4, %xmm2 {%k2} 2511; AVX512DQ-BW-FCP-NEXT: movw $9362, %di # imm = 0x2492 2512; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k3 2513; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm4 {%k3} 2514; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm6 2515; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14,u,u,u,u,u] 2516; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[2,8,14],zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[u,u,u,u,u] 2517; AVX512DQ-BW-FCP-NEXT: vpor %xmm8, %xmm9, %xmm8 2518; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm7[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero 2519; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,6,12] 2520; AVX512DQ-BW-FCP-NEXT: vpor %xmm9, %xmm10, %xmm9 2521; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm9, %xmm8 {%k2} 2522; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[5,11],zero,zero,zero,xmm6[3,9,15,u,u,u,u,u] 2523; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,9,15],zero,zero,xmm4[1,7,13],zero,zero,zero,xmm4[u,u,u,u,u] 2524; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm4, %xmm4 2525; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero 2526; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,7,13] 2527; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm9, %xmm6 2528; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm6, %xmm4 {%k2} 2529; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,0,6,12,128,128,128,4,10,128,128,128,2,8,14] 2530; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm9 2531; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} 2532; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 2533; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm6 2534; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm9[5,6,7] 2535; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,10,128,128,128,2,8,14,128,128,0,6,12,128,128,128] 2536; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm10 2537; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm9 2538; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5,6,7] 2539; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm9, %xmm6 2540; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,1,7,13,128,128,128,5,11,128,128,128,3,9,15] 2541; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm5, %xmm5 2542; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1 2543; AVX512DQ-BW-FCP-NEXT: 
vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [5,11,128,128,128,3,9,15,128,128,1,7,13,128,128,128]
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm7
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm0
; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm7[5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm8, (%rcx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm4, (%r8)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm6, (%r9)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
  %wide.vec = load <96 x i8>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <96 x i8> %wide.vec, <96 x i8> poison, <16 x i32> <i32 0, i32 6, i32 12, i32 18, i32 24, i32 30, i32 36, i32 42, i32 48, i32 54, i32 60, i32 66, i32 72, i32 78, i32 84, i32 90>
  %strided.vec1 = shufflevector <96 x i8> %wide.vec, <96 x i8> poison, <16 x i32> <i32 1, i32 7, i32 13, i32 19, i32 25, i32 31, i32 37, i32 43, i32 49, i32 55, i32 61, i32 67, i32 73, i32 79, i32 85, i32 91>
  %strided.vec2 = shufflevector <96 x i8> %wide.vec, <96 x i8> poison, <16 x i32> <i32 2, i32 8, i32 14, i32 20, i32 26, i32 32, i32 38, i32 44, i32 50, i32 56, i32 62, i32 68, i32 74, i32 80, i32 86, i32 92>
  %strided.vec3 = shufflevector <96 x i8> %wide.vec, <96 x i8> poison, <16 x i32> <i32 3, i32 9, i32 15, i32 21, i32 27, i32 33, i32 39, i32 45, i32 51, i32 57, i32 63, i32 69, i32 75, i32 81, i32 87, i32 93>
  %strided.vec4 = shufflevector <96 x i8> %wide.vec, <96 x i8> poison, <16 x i32> <i32 4, i32 10, i32 16, i32 22, i32 28, i32 34, i32 40, i32 46, i32 52, i32 58, i32 64, i32 70, i32 76, i32 82, i32 88, i32 94>
  %strided.vec5 = shufflevector <96 x i8> %wide.vec, <96 x i8> poison, <16 x i32> <i32 5, i32 11, i32 17, i32 23, i32 29, i32 35, i32 41, i32 47, i32 53, i32 59, i32 65, i32 71, i32 77, i32 83, i32 89, i32 95>
  store <16 x i8> %strided.vec0, ptr %out.vec0, align 64
  store <16 x i8> %strided.vec1, ptr %out.vec1, align 64
  store <16 x i8> %strided.vec2, ptr %out.vec2, align 64
  store <16 x i8> %strided.vec3, ptr %out.vec3, align 64
  store <16 x i8> %strided.vec4, ptr %out.vec4, align 64
  store <16 x i8> %strided.vec5, ptr %out.vec5, align 64
  ret void
}

define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind {
; SSE-LABEL: load_i8_stride6_vf32:
; SSE: # %bb.0:
; SSE-NEXT: subq $264, %rsp # imm = 0x108
; SSE-NEXT: movdqa 64(%rdi), %xmm7
; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 80(%rdi), %xmm9
; SSE-NEXT: movdqa (%rdi), %xmm12
; SSE-NEXT: movdqa 16(%rdi), %xmm14
; SSE-NEXT: movdqa 32(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 48(%rdi), %xmm15
; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,65535,65535,0,65535,65535]
; SSE-NEXT: movdqa %xmm10, %xmm0
; SSE-NEXT: pandn %xmm1, %xmm0
; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,0,65535,65535,0]
; SSE-NEXT: movdqa %xmm11, %xmm1
; SSE-NEXT: pandn %xmm15, %xmm1
; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm10, %xmm1
; SSE-NEXT:
pandn %xmm15, %xmm1 2594; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2595; SSE-NEXT: pand %xmm10, %xmm15 2596; SSE-NEXT: por %xmm0, %xmm15 2597; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] 2598; SSE-NEXT: movdqa %xmm15, %xmm0 2599; SSE-NEXT: pand %xmm6, %xmm0 2600; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7] 2601; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] 2602; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] 2603; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] 2604; SSE-NEXT: packuswb %xmm1, %xmm0 2605; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,0,65535,65535] 2606; SSE-NEXT: movdqa %xmm11, %xmm1 2607; SSE-NEXT: pandn %xmm14, %xmm1 2608; SSE-NEXT: movdqa %xmm12, %xmm8 2609; SSE-NEXT: pand %xmm11, %xmm8 2610; SSE-NEXT: por %xmm1, %xmm8 2611; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,2,1,3] 2612; SSE-NEXT: pand %xmm6, %xmm1 2613; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] 2614; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] 2615; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] 2616; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] 2617; SSE-NEXT: packuswb %xmm1, %xmm1 2618; SSE-NEXT: pand %xmm2, %xmm1 2619; SSE-NEXT: movdqa %xmm2, %xmm3 2620; SSE-NEXT: movdqa %xmm2, %xmm5 2621; SSE-NEXT: pandn %xmm0, %xmm3 2622; SSE-NEXT: por %xmm3, %xmm1 2623; SSE-NEXT: movdqa %xmm10, %xmm0 2624; SSE-NEXT: pandn %xmm9, %xmm0 2625; SSE-NEXT: pand %xmm10, %xmm7 2626; SSE-NEXT: por %xmm0, %xmm7 2627; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,1,2,0] 2628; SSE-NEXT: pand %xmm6, %xmm0 2629; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] 2630; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0] 2631; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] 2632; SSE-NEXT: packuswb %xmm0, %xmm0 2633; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] 2634; SSE-NEXT: movdqa %xmm3, %xmm2 2635; SSE-NEXT: pandn %xmm0, %xmm2 2636; SSE-NEXT: pand %xmm3, %xmm1 2637; SSE-NEXT: por %xmm1, %xmm2 2638; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2639; SSE-NEXT: movdqa 128(%rdi), %xmm1 2640; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2641; SSE-NEXT: movdqa %xmm10, %xmm0 2642; SSE-NEXT: pandn %xmm1, %xmm0 2643; SSE-NEXT: movdqa 144(%rdi), %xmm1 2644; SSE-NEXT: movdqa %xmm11, %xmm2 2645; SSE-NEXT: pandn %xmm1, %xmm2 2646; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2647; SSE-NEXT: movdqa %xmm10, %xmm2 2648; SSE-NEXT: pandn %xmm1, %xmm2 2649; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2650; SSE-NEXT: movdqa %xmm1, %xmm2 2651; SSE-NEXT: pand %xmm10, %xmm2 2652; SSE-NEXT: por %xmm0, %xmm2 2653; SSE-NEXT: movdqa %xmm2, %xmm0 2654; SSE-NEXT: pand %xmm6, %xmm0 2655; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,3,2,3,4,5,6,7] 2656; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] 2657; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] 2658; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] 2659; SSE-NEXT: packuswb %xmm3, %xmm0 2660; SSE-NEXT: movdqa %xmm5, %xmm6 2661; SSE-NEXT: pandn %xmm0, %xmm6 2662; SSE-NEXT: movdqa %xmm10, %xmm1 2663; SSE-NEXT: movdqa %xmm10, %xmm0 2664; SSE-NEXT: pandn %xmm12, %xmm0 2665; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2666; SSE-NEXT: movdqa 112(%rdi), %xmm0 2667; SSE-NEXT: movdqa %xmm11, %xmm3 2668; SSE-NEXT: pandn %xmm0, %xmm3 2669; SSE-NEXT: movdqa 160(%rdi), %xmm5 2670; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2671; SSE-NEXT: 
pand %xmm10, %xmm5 2672; SSE-NEXT: movdqa %xmm10, %xmm4 2673; SSE-NEXT: pandn %xmm14, %xmm4 2674; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2675; SSE-NEXT: pand %xmm10, %xmm12 2676; SSE-NEXT: movdqa %xmm11, %xmm4 2677; SSE-NEXT: pandn %xmm9, %xmm4 2678; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2679; SSE-NEXT: movdqa %xmm9, %xmm11 2680; SSE-NEXT: pand %xmm10, %xmm11 2681; SSE-NEXT: movdqa %xmm10, %xmm4 2682; SSE-NEXT: pandn %xmm0, %xmm4 2683; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2684; SSE-NEXT: movdqa 96(%rdi), %xmm13 2685; SSE-NEXT: movdqa %xmm13, %xmm4 2686; SSE-NEXT: pand %xmm10, %xmm4 2687; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2688; SSE-NEXT: movdqa 176(%rdi), %xmm4 2689; SSE-NEXT: movdqa %xmm4, %xmm10 2690; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2691; SSE-NEXT: pand %xmm1, %xmm10 2692; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2693; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 2694; SSE-NEXT: movdqa %xmm9, %xmm10 2695; SSE-NEXT: pand %xmm1, %xmm9 2696; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2697; SSE-NEXT: pand %xmm1, %xmm14 2698; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2699; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 2700; SSE-NEXT: movdqa %xmm14, %xmm9 2701; SSE-NEXT: pand %xmm1, %xmm14 2702; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2703; SSE-NEXT: pand %xmm1, %xmm0 2704; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2705; SSE-NEXT: movdqa %xmm1, %xmm14 2706; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2707; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill 2708; SSE-NEXT: pandn %xmm13, %xmm1 2709; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2710; SSE-NEXT: movdqa %xmm13, %xmm1 2711; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 2712; SSE-NEXT: por %xmm3, %xmm1 2713; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,1,3] 2714; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] 2715; SSE-NEXT: pand %xmm0, %xmm3 2716; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] 2717; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] 2718; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7] 2719; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] 2720; SSE-NEXT: packuswb %xmm3, %xmm3 2721; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 2722; SSE-NEXT: por %xmm6, %xmm3 2723; SSE-NEXT: pandn %xmm4, %xmm14 2724; SSE-NEXT: por %xmm14, %xmm5 2725; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[3,1,2,0] 2726; SSE-NEXT: pand %xmm0, %xmm4 2727; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] 2728; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,0] 2729; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5] 2730; SSE-NEXT: packuswb %xmm4, %xmm4 2731; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] 2732; SSE-NEXT: movdqa %xmm13, %xmm0 2733; SSE-NEXT: pandn %xmm4, %xmm0 2734; SSE-NEXT: pand %xmm13, %xmm3 2735; SSE-NEXT: por %xmm3, %xmm0 2736; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2737; SSE-NEXT: pxor %xmm4, %xmm4 2738; SSE-NEXT: movdqa %xmm15, %xmm3 2739; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] 2740; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = 
xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3],xmm15[4],xmm4[4],xmm15[5],xmm4[5],xmm15[6],xmm4[6],xmm15[7],xmm4[7] 2741; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm15[2,2,3,3] 2742; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1],xmm14[2],xmm3[2],xmm14[3],xmm3[3] 2743; SSE-NEXT: psrld $16, %xmm3 2744; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,0,3] 2745; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,7,6,7] 2746; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm3[2],xmm15[3],xmm3[3] 2747; SSE-NEXT: packuswb %xmm15, %xmm14 2748; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,0,0,0,65535,65535] 2749; SSE-NEXT: movdqa %xmm6, %xmm3 2750; SSE-NEXT: pandn %xmm14, %xmm3 2751; SSE-NEXT: movdqa %xmm8, %xmm14 2752; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm4[8],xmm14[9],xmm4[9],xmm14[10],xmm4[10],xmm14[11],xmm4[11],xmm14[12],xmm4[12],xmm14[13],xmm4[13],xmm14[14],xmm4[14],xmm14[15],xmm4[15] 2753; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[2,1,0,3] 2754; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[1,1,1,1,4,5,6,7] 2755; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,7,6,7] 2756; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,0,65535,0,0,65535,65535] 2757; SSE-NEXT: movdqa %xmm15, %xmm0 2758; SSE-NEXT: pandn %xmm14, %xmm0 2759; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] 2760; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] 2761; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3] 2762; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm8[1,3,2,0,4,5,6,7] 2763; SSE-NEXT: pand %xmm15, %xmm14 2764; SSE-NEXT: por %xmm0, %xmm14 2765; SSE-NEXT: packuswb %xmm14, %xmm14 2766; SSE-NEXT: pand %xmm6, %xmm14 2767; SSE-NEXT: por %xmm3, %xmm14 2768; SSE-NEXT: movdqa %xmm7, %xmm0 2769; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] 2770; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] 2771; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,65535,65535,0,65535] 2772; SSE-NEXT: movdqa %xmm8, %xmm3 2773; SSE-NEXT: pandn %xmm0, %xmm3 2774; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] 2775; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[3,1,2,3,4,5,6,7] 2776; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] 2777; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,4] 2778; SSE-NEXT: pand %xmm8, %xmm0 2779; SSE-NEXT: por %xmm3, %xmm0 2780; SSE-NEXT: packuswb %xmm0, %xmm0 2781; SSE-NEXT: movdqa %xmm13, %xmm3 2782; SSE-NEXT: pandn %xmm0, %xmm3 2783; SSE-NEXT: pand %xmm13, %xmm14 2784; SSE-NEXT: por %xmm14, %xmm3 2785; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2786; SSE-NEXT: movdqa %xmm2, %xmm0 2787; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] 2788; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] 2789; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,2,3,3] 2790; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] 2791; SSE-NEXT: psrld $16, %xmm0 2792; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] 2793; SSE-NEXT: 
pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7] 2794; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] 2795; SSE-NEXT: packuswb %xmm2, %xmm3 2796; SSE-NEXT: movdqa %xmm1, %xmm0 2797; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] 2798; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] 2799; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] 2800; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7] 2801; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] 2802; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] 2803; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] 2804; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,3,2,0,4,5,6,7] 2805; SSE-NEXT: pand %xmm15, %xmm1 2806; SSE-NEXT: pandn %xmm0, %xmm15 2807; SSE-NEXT: por %xmm1, %xmm15 2808; SSE-NEXT: packuswb %xmm15, %xmm15 2809; SSE-NEXT: pand %xmm6, %xmm15 2810; SSE-NEXT: pandn %xmm3, %xmm6 2811; SSE-NEXT: por %xmm6, %xmm15 2812; SSE-NEXT: movdqa %xmm5, %xmm0 2813; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] 2814; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] 2815; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] 2816; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[3,1,2,3,4,5,6,7] 2817; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] 2818; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,4] 2819; SSE-NEXT: pand %xmm8, %xmm1 2820; SSE-NEXT: pandn %xmm0, %xmm8 2821; SSE-NEXT: por %xmm1, %xmm8 2822; SSE-NEXT: packuswb %xmm8, %xmm0 2823; SSE-NEXT: movdqa %xmm13, %xmm1 2824; SSE-NEXT: pandn %xmm0, %xmm1 2825; SSE-NEXT: pand %xmm13, %xmm15 2826; SSE-NEXT: movdqa %xmm13, %xmm7 2827; SSE-NEXT: por %xmm15, %xmm1 2828; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2829; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,0] 2830; SSE-NEXT: pand %xmm5, %xmm10 2831; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload 2832; SSE-NEXT: movdqa %xmm10, %xmm0 2833; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,255,255,255,255,255] 2834; SSE-NEXT: pand %xmm15, %xmm0 2835; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,7,6,7] 2836; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 2837; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7] 2838; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] 2839; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] 2840; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,5,6] 2841; SSE-NEXT: packuswb %xmm1, %xmm2 2842; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload 2843; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[2,1,2,3,4,5,6,7] 2844; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] 2845; SSE-NEXT: pand %xmm15, %xmm0 2846; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] 2847; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] 2848; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,5,5,5,5] 2849; SSE-NEXT: packuswb %xmm1, %xmm1 2850; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] 2851; SSE-NEXT: movdqa %xmm0, %xmm3 2852; SSE-NEXT: pandn %xmm1, %xmm3 2853; SSE-NEXT: pand %xmm0, %xmm2 2854; 
SSE-NEXT: por %xmm2, %xmm3 2855; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 2856; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2857; SSE-NEXT: pandn %xmm14, %xmm1 2858; SSE-NEXT: por %xmm1, %xmm11 2859; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[0,3,2,3,4,5,6,7] 2860; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] 2861; SSE-NEXT: pand %xmm15, %xmm1 2862; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] 2863; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] 2864; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4] 2865; SSE-NEXT: packuswb %xmm1, %xmm1 2866; SSE-NEXT: movdqa %xmm13, %xmm2 2867; SSE-NEXT: pandn %xmm1, %xmm2 2868; SSE-NEXT: pand %xmm13, %xmm3 2869; SSE-NEXT: por %xmm3, %xmm2 2870; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2871; SSE-NEXT: pand %xmm5, %xmm9 2872; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 2873; SSE-NEXT: movdqa %xmm9, %xmm1 2874; SSE-NEXT: pand %xmm15, %xmm1 2875; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,7,6,7] 2876; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 2877; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,3,4,5,6,7] 2878; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] 2879; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] 2880; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,6] 2881; SSE-NEXT: packuswb %xmm2, %xmm1 2882; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 2883; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload 2884; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[2,1,2,3,4,5,6,7] 2885; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] 2886; SSE-NEXT: pand %xmm15, %xmm2 2887; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] 2888; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] 2889; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] 2890; SSE-NEXT: packuswb %xmm2, %xmm2 2891; SSE-NEXT: movdqa %xmm0, %xmm3 2892; SSE-NEXT: pandn %xmm2, %xmm3 2893; SSE-NEXT: pand %xmm0, %xmm1 2894; SSE-NEXT: por %xmm1, %xmm3 2895; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload 2896; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 2897; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 2898; SSE-NEXT: por %xmm1, %xmm8 2899; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[0,3,2,3,4,5,6,7] 2900; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] 2901; SSE-NEXT: pand %xmm15, %xmm1 2902; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] 2903; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] 2904; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4] 2905; SSE-NEXT: packuswb %xmm1, %xmm1 2906; SSE-NEXT: movdqa %xmm7, %xmm2 2907; SSE-NEXT: pandn %xmm1, %xmm2 2908; SSE-NEXT: pand %xmm7, %xmm3 2909; SSE-NEXT: por %xmm3, %xmm2 2910; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2911; SSE-NEXT: movdqa %xmm10, %xmm1 2912; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] 2913; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15] 2914; SSE-NEXT: movdqa %xmm10, %xmm2 2915; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0] 2916; SSE-NEXT: movaps %xmm1, %xmm3 2917; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[0,2] 2918; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm10[0,0] 2919; SSE-NEXT: 
shufps {{.*#+}} xmm1 = xmm1[2,0],xmm10[2,3] 2920; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,7,5,6,7] 2921; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] 2922; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] 2923; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] 2924; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7] 2925; SSE-NEXT: packuswb %xmm1, %xmm2 2926; SSE-NEXT: movdqa %xmm12, %xmm1 2927; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] 2928; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] 2929; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[3,1,2,1,4,5,6,7] 2930; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,0,65535,65535,65535,65535] 2931; SSE-NEXT: movdqa %xmm1, %xmm5 2932; SSE-NEXT: pandn %xmm3, %xmm5 2933; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm4[8],xmm12[9],xmm4[9],xmm12[10],xmm4[10],xmm12[11],xmm4[11],xmm12[12],xmm4[12],xmm12[13],xmm4[13],xmm12[14],xmm4[14],xmm12[15],xmm4[15] 2934; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,3,2,1] 2935; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7] 2936; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] 2937; SSE-NEXT: pand %xmm1, %xmm3 2938; SSE-NEXT: por %xmm5, %xmm3 2939; SSE-NEXT: packuswb %xmm3, %xmm3 2940; SSE-NEXT: movdqa %xmm0, %xmm5 2941; SSE-NEXT: pandn %xmm3, %xmm5 2942; SSE-NEXT: pand %xmm0, %xmm2 2943; SSE-NEXT: por %xmm2, %xmm5 2944; SSE-NEXT: movdqa %xmm11, %xmm2 2945; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] 2946; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] 2947; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,7,5,6,5] 2948; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,0,65535,65535,0] 2949; SSE-NEXT: movdqa %xmm2, %xmm6 2950; SSE-NEXT: pandn %xmm3, %xmm6 2951; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] 2952; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,2,0,3] 2953; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] 2954; SSE-NEXT: pand %xmm2, %xmm3 2955; SSE-NEXT: por %xmm6, %xmm3 2956; SSE-NEXT: packuswb %xmm3, %xmm3 2957; SSE-NEXT: movdqa %xmm7, %xmm6 2958; SSE-NEXT: pandn %xmm3, %xmm6 2959; SSE-NEXT: pand %xmm7, %xmm5 2960; SSE-NEXT: por %xmm5, %xmm6 2961; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2962; SSE-NEXT: movdqa %xmm9, %xmm3 2963; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] 2964; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm4[8],xmm9[9],xmm4[9],xmm9[10],xmm4[10],xmm9[11],xmm4[11],xmm9[12],xmm4[12],xmm9[13],xmm4[13],xmm9[14],xmm4[14],xmm9[15],xmm4[15] 2965; SSE-NEXT: movdqa %xmm9, %xmm5 2966; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[3,0] 2967; SSE-NEXT: movaps %xmm3, %xmm6 2968; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[0,2] 2969; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm9[0,0] 2970; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm9[2,3] 2971; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,7,5,6,7] 2972; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,2] 2973; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] 2974; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] 2975; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7] 2976; 
SSE-NEXT: packuswb %xmm3, %xmm5 2977; SSE-NEXT: movdqa %xmm13, %xmm3 2978; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] 2979; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] 2980; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,1,4,5,6,7] 2981; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm4[8],xmm13[9],xmm4[9],xmm13[10],xmm4[10],xmm13[11],xmm4[11],xmm13[12],xmm4[12],xmm13[13],xmm4[13],xmm13[14],xmm4[14],xmm13[15],xmm4[15] 2982; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm13[0,3,2,1] 2983; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7] 2984; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7] 2985; SSE-NEXT: pand %xmm1, %xmm6 2986; SSE-NEXT: pandn %xmm3, %xmm1 2987; SSE-NEXT: por %xmm6, %xmm1 2988; SSE-NEXT: pand %xmm0, %xmm5 2989; SSE-NEXT: packuswb %xmm1, %xmm1 2990; SSE-NEXT: pandn %xmm1, %xmm0 2991; SSE-NEXT: por %xmm5, %xmm0 2992; SSE-NEXT: movdqa %xmm8, %xmm1 2993; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] 2994; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] 2995; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,5] 2996; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] 2997; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,2,0,3] 2998; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] 2999; SSE-NEXT: pand %xmm2, %xmm3 3000; SSE-NEXT: pandn %xmm1, %xmm2 3001; SSE-NEXT: por %xmm3, %xmm2 3002; SSE-NEXT: movdqa %xmm7, %xmm13 3003; SSE-NEXT: pand %xmm7, %xmm0 3004; SSE-NEXT: packuswb %xmm2, %xmm1 3005; SSE-NEXT: pandn %xmm1, %xmm13 3006; SSE-NEXT: por %xmm0, %xmm13 3007; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 3008; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 3009; SSE-NEXT: movdqa %xmm7, %xmm0 3010; SSE-NEXT: pand %xmm15, %xmm0 3011; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,2,3] 3012; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] 3013; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] 3014; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] 3015; SSE-NEXT: packuswb %xmm1, %xmm0 3016; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] 3017; SSE-NEXT: movdqa %xmm2, %xmm1 3018; SSE-NEXT: pandn %xmm0, %xmm1 3019; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 3020; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 3021; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,1,2,0] 3022; SSE-NEXT: pand %xmm15, %xmm0 3023; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] 3024; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] 3025; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[2,1,0,3,4,5,6,7] 3026; SSE-NEXT: packuswb %xmm3, %xmm3 3027; SSE-NEXT: pand %xmm2, %xmm3 3028; SSE-NEXT: por %xmm1, %xmm3 3029; SSE-NEXT: movdqa %xmm14, %xmm11 3030; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,0,65535,65535,0] 3031; SSE-NEXT: pand %xmm12, %xmm11 3032; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload 3033; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,2,1,3] 3034; SSE-NEXT: pand %xmm15, %xmm0 3035; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] 3036; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 3037; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] 3038; SSE-NEXT: pshufhw 
{{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] 3039; SSE-NEXT: packuswb %xmm0, %xmm5 3040; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0] 3041; SSE-NEXT: movdqa %xmm0, %xmm8 3042; SSE-NEXT: pandn %xmm5, %xmm8 3043; SSE-NEXT: pand %xmm0, %xmm3 3044; SSE-NEXT: por %xmm3, %xmm8 3045; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 3046; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload 3047; SSE-NEXT: movdqa %xmm14, %xmm3 3048; SSE-NEXT: pand %xmm15, %xmm3 3049; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,1,2,3] 3050; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7] 3051; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] 3052; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] 3053; SSE-NEXT: packuswb %xmm5, %xmm3 3054; SSE-NEXT: movdqa %xmm2, %xmm5 3055; SSE-NEXT: pandn %xmm3, %xmm5 3056; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 3057; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 3058; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,1,2,0] 3059; SSE-NEXT: pand %xmm15, %xmm3 3060; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] 3061; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,0,3] 3062; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[2,1,0,3,4,5,6,7] 3063; SSE-NEXT: packuswb %xmm6, %xmm6 3064; SSE-NEXT: pand %xmm2, %xmm6 3065; SSE-NEXT: por %xmm5, %xmm6 3066; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 3067; SSE-NEXT: pand %xmm12, %xmm3 3068; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload 3069; SSE-NEXT: por %xmm3, %xmm12 3070; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,2,1,3] 3071; SSE-NEXT: pand %xmm15, %xmm3 3072; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7] 3073; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] 3074; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] 3075; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7] 3076; SSE-NEXT: packuswb %xmm3, %xmm5 3077; SSE-NEXT: movdqa %xmm0, %xmm3 3078; SSE-NEXT: pandn %xmm5, %xmm3 3079; SSE-NEXT: pand %xmm0, %xmm6 3080; SSE-NEXT: por %xmm6, %xmm3 3081; SSE-NEXT: movdqa %xmm7, %xmm5 3082; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] 3083; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] 3084; SSE-NEXT: movdqa %xmm7, %xmm6 3085; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm5[0,0] 3086; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm5[2,3] 3087; SSE-NEXT: psrlq $48, %xmm5 3088; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 3089; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] 3090; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[3,1,2,3,4,5,6,7] 3091; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] 3092; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,5,7] 3093; SSE-NEXT: packuswb %xmm6, %xmm5 3094; SSE-NEXT: movdqa %xmm2, %xmm6 3095; SSE-NEXT: pandn %xmm5, %xmm6 3096; SSE-NEXT: movdqa %xmm9, %xmm5 3097; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] 3098; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,2,3] 3099; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,5,5,5,5] 3100; SSE-NEXT: movdqa {{.*#+}} xmm5 = 
[65535,0,65535,65535,0,65535,65535,65535] 3101; SSE-NEXT: movdqa %xmm5, %xmm10 3102; SSE-NEXT: pandn %xmm7, %xmm10 3103; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3],xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] 3104; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm9[0,1,2,3,7,5,6,7] 3105; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] 3106; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm7[3,1,1,2,4,5,6,7] 3107; SSE-NEXT: pand %xmm5, %xmm9 3108; SSE-NEXT: por %xmm10, %xmm9 3109; SSE-NEXT: packuswb %xmm9, %xmm9 3110; SSE-NEXT: pand %xmm2, %xmm9 3111; SSE-NEXT: por %xmm6, %xmm9 3112; SSE-NEXT: movdqa %xmm11, %xmm6 3113; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm4[8],xmm11[9],xmm4[9],xmm11[10],xmm4[10],xmm11[11],xmm4[11],xmm11[12],xmm4[12],xmm11[13],xmm4[13],xmm11[14],xmm4[14],xmm11[15],xmm4[15] 3114; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm11[0,1,2,3,7,5,6,7] 3115; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,2,0] 3116; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,5,5,7,4] 3117; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,65535,0,65535,0,0] 3118; SSE-NEXT: movdqa %xmm7, %xmm11 3119; SSE-NEXT: pandn %xmm10, %xmm11 3120; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] 3121; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,1,1] 3122; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] 3123; SSE-NEXT: pand %xmm7, %xmm6 3124; SSE-NEXT: por %xmm6, %xmm11 3125; SSE-NEXT: packuswb %xmm11, %xmm10 3126; SSE-NEXT: movdqa %xmm0, %xmm6 3127; SSE-NEXT: pandn %xmm10, %xmm6 3128; SSE-NEXT: pand %xmm0, %xmm9 3129; SSE-NEXT: por %xmm9, %xmm6 3130; SSE-NEXT: movdqa %xmm14, %xmm11 3131; SSE-NEXT: movdqa %xmm14, %xmm9 3132; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm4[8],xmm9[9],xmm4[9],xmm9[10],xmm4[10],xmm9[11],xmm4[11],xmm9[12],xmm4[12],xmm9[13],xmm4[13],xmm9[14],xmm4[14],xmm9[15],xmm4[15] 3133; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] 3134; SSE-NEXT: movdqa %xmm11, %xmm10 3135; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm9[0,0] 3136; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm9[2,3] 3137; SSE-NEXT: psrlq $48, %xmm9 3138; SSE-NEXT: psrldq {{.*#+}} xmm10 = xmm10[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 3139; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] 3140; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm11[3,1,2,3,4,5,6,7] 3141; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,0,3] 3142; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,5,7] 3143; SSE-NEXT: packuswb %xmm10, %xmm9 3144; SSE-NEXT: movdqa %xmm1, %xmm10 3145; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15] 3146; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,2,3] 3147; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5] 3148; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] 3149; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm1[0,1,2,3,7,5,6,7] 3150; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3] 3151; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[3,1,1,2,4,5,6,7] 3152; SSE-NEXT: pand %xmm5, %xmm11 3153; SSE-NEXT: pandn 
%xmm10, %xmm5 3154; SSE-NEXT: por %xmm11, %xmm5 3155; SSE-NEXT: packuswb %xmm5, %xmm5 3156; SSE-NEXT: pand %xmm2, %xmm5 3157; SSE-NEXT: pandn %xmm9, %xmm2 3158; SSE-NEXT: por %xmm2, %xmm5 3159; SSE-NEXT: movdqa %xmm12, %xmm2 3160; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] 3161; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm4[8],xmm12[9],xmm4[9],xmm12[10],xmm4[10],xmm12[11],xmm4[11],xmm12[12],xmm4[12],xmm12[13],xmm4[13],xmm12[14],xmm4[14],xmm12[15],xmm4[15] 3162; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,1,1] 3163; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] 3164; SSE-NEXT: pand %xmm7, %xmm2 3165; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm12[0,1,2,3,7,5,6,7] 3166; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] 3167; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,7,4] 3168; SSE-NEXT: pandn %xmm4, %xmm7 3169; SSE-NEXT: por %xmm2, %xmm7 3170; SSE-NEXT: pand %xmm0, %xmm5 3171; SSE-NEXT: packuswb %xmm7, %xmm2 3172; SSE-NEXT: pandn %xmm2, %xmm0 3173; SSE-NEXT: por %xmm5, %xmm0 3174; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 3175; SSE-NEXT: movaps %xmm2, 16(%rsi) 3176; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 3177; SSE-NEXT: movaps %xmm2, (%rsi) 3178; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 3179; SSE-NEXT: movaps %xmm1, 16(%rdx) 3180; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 3181; SSE-NEXT: movaps %xmm1, (%rdx) 3182; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 3183; SSE-NEXT: movaps %xmm1, 16(%rcx) 3184; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 3185; SSE-NEXT: movaps %xmm1, (%rcx) 3186; SSE-NEXT: movdqa %xmm13, 16(%r8) 3187; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 3188; SSE-NEXT: movaps %xmm1, (%r8) 3189; SSE-NEXT: movdqa %xmm3, 16(%r9) 3190; SSE-NEXT: movdqa %xmm8, (%r9) 3191; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 3192; SSE-NEXT: movdqa %xmm0, 16(%rax) 3193; SSE-NEXT: movdqa %xmm6, (%rax) 3194; SSE-NEXT: addq $264, %rsp # imm = 0x108 3195; SSE-NEXT: retq 3196; 3197; AVX-LABEL: load_i8_stride6_vf32: 3198; AVX: # %bb.0: 3199; AVX-NEXT: subq $120, %rsp 3200; AVX-NEXT: vmovdqa (%rdi), %xmm9 3201; AVX-NEXT: vmovdqa 16(%rdi), %xmm7 3202; AVX-NEXT: vmovdqa 32(%rdi), %xmm6 3203; AVX-NEXT: vmovdqa 48(%rdi), %xmm5 3204; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[2,8,14,u,u,u,u,u,u,u,u,u,u,u,u,u] 3205; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,u,u,0,6,12,u,u,u,u,u,u,u,u] 3206; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] 3207; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm7[4,10,u,u,u,u,u,u,u,u,u,u,u] 3208; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[2,8,14],zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u] 3209; AVX-NEXT: vpor %xmm0, %xmm2, %xmm2 3210; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [0,0,0,0,0,255,255,255,255,255,255,u,u,u,u,u] 3211; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 3212; AVX-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3213; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[3,9,15,u,u,u,u,u,u,u,u,u,u,u,u,u] 3214; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,1,7,13,u,u,u,u,u,u,u,u] 3215; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 3216; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm7[5,11,u,u,u,u,u,u,u,u,u,u,u] 3217; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm9[3,9,15],zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u] 3218; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 3219; AVX-NEXT: 
vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 3220; AVX-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3221; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[4,10],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u] 3222; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm7[0,6,12,u,u,u,u,u,u,u,u,u,u,u] 3223; AVX-NEXT: vpor %xmm0, %xmm1, %xmm1 3224; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u] 3225; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14] 3226; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] 3227; AVX-NEXT: vmovq {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] 3228; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 3229; AVX-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3230; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[5,11],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u] 3231; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3232; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm7[1,7,13,u,u,u,u,u,u,u,u,u,u,u] 3233; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 3234; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3235; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,u,u,u,u,5,11,u,u,u,u,u,u] 3236; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3237; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,3,9,15] 3238; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] 3239; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 3240; AVX-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3241; AVX-NEXT: vmovq {{.*#+}} xmm8 = [128,128,128,2,8,14,0,0,0,0,0,0,0,0,0,0] 3242; AVX-NEXT: vmovdqa 112(%rdi), %xmm0 3243; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 3244; AVX-NEXT: vpshufb %xmm8, %xmm0, %xmm1 3245; AVX-NEXT: vmovq {{.*#+}} xmm2 = [0,6,12,128,128,128,0,0,0,0,0,0,0,0,0,0] 3246; AVX-NEXT: vmovdqa 96(%rdi), %xmm13 3247; AVX-NEXT: vpshufb %xmm2, %xmm13, %xmm3 3248; AVX-NEXT: vpor %xmm1, %xmm3, %xmm1 3249; AVX-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,128,128,128,4,10,0,0,0,128,128,128,4,10] 3250; AVX-NEXT: # xmm11 = mem[0,0] 3251; AVX-NEXT: vmovdqa 80(%rdi), %xmm12 3252; AVX-NEXT: vpshufb %xmm11, %xmm12, %xmm4 3253; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,0,2,8,14,128,128,0,0,0,2,8,14,128,128] 3254; AVX-NEXT: # xmm3 = mem[0,0] 3255; AVX-NEXT: vmovdqa 64(%rdi), %xmm14 3256; AVX-NEXT: vpshufb %xmm3, %xmm14, %xmm10 3257; AVX-NEXT: vpor %xmm4, %xmm10, %xmm4 3258; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 3259; AVX-NEXT: vmovd {{.*#+}} xmm15 = [0,0,4,10,0,0,0,0,0,0,0,0,0,0,0,0] 3260; AVX-NEXT: vpshufb %xmm15, %xmm6, %xmm4 3261; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,6,12,0,0,6,12,0,0,6,12,0,0,6,12,0] 3262; AVX-NEXT: vpshufb %xmm0, %xmm5, %xmm10 3263; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] 3264; AVX-NEXT: vpshufb %xmm8, %xmm7, %xmm8 3265; AVX-NEXT: vmovdqa %xmm7, %xmm10 3266; AVX-NEXT: vpshufb %xmm2, %xmm9, %xmm2 3267; AVX-NEXT: vpor %xmm2, %xmm8, %xmm2 3268; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4,5],xmm2[6,7] 3269; AVX-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] 3270; AVX-NEXT: vandnps %ymm1, %ymm4, %ymm1 3271; AVX-NEXT: vandps %ymm4, %ymm2, %ymm2 3272; AVX-NEXT: vorps %ymm1, %ymm2, %ymm8 3273; AVX-NEXT: vmovdqa 128(%rdi), %xmm6 3274; AVX-NEXT: vpshufb %xmm15, %xmm6, %xmm1 3275; AVX-NEXT: vmovdqa 144(%rdi), %xmm5 3276; AVX-NEXT: vpshufb %xmm0, %xmm5, %xmm0 3277; AVX-NEXT: vpunpckldq {{.*#+}} xmm15 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1] 3278; AVX-NEXT: vmovdqa 176(%rdi), %xmm4 3279; AVX-NEXT: vpshufb %xmm11, %xmm4, %xmm11 3280; AVX-NEXT: vmovdqa 160(%rdi), %xmm2 3281; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm3 3282; AVX-NEXT: vpor %xmm3, %xmm11, %xmm11 3283; AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] 3284; AVX-NEXT: vpblendvb %xmm3, %xmm15, %xmm11, %xmm15 3285; AVX-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] 3286; AVX-NEXT: vandps %ymm11, %ymm8, %ymm8 3287; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 3288; AVX-NEXT: vandnps %ymm15, %ymm11, %ymm15 3289; AVX-NEXT: vorps %ymm15, %ymm8, %ymm0 3290; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3291; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 3292; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm9[3,9,15,u,u,u,u,u,u,u,u,u,u] 3293; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[1,7,13],zero,zero,zero,xmm13[u,u,u,u,u,u,u,u,u,u] 3294; AVX-NEXT: vpor %xmm0, %xmm15, %xmm1 3295; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm12[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[5,11] 3296; AVX-NEXT: vmovdqa %xmm14, %xmm7 3297; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero 3298; AVX-NEXT: vpor %xmm0, %xmm15, %xmm0 3299; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 3300; AVX-NEXT: vmovd {{.*#+}} xmm8 = [0,0,5,11,0,0,0,0,0,0,0,0,0,0,0,0] 3301; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 3302; AVX-NEXT: vpshufb %xmm8, %xmm1, %xmm15 3303; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,7,13,0,1,7,13,0,1,7,13,0,1,7,13,0] 3304; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 3305; AVX-NEXT: vpshufb %xmm1, %xmm14, %xmm14 3306; AVX-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] 3307; AVX-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm10[3,9,15,u,u,u,u,u,u,u,u,u,u] 3308; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 3309; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[1,7,13],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] 3310; AVX-NEXT: vpor %xmm15, %xmm10, %xmm10 3311; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm14[3,4,5],xmm10[6,7] 3312; AVX-NEXT: vmovaps {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] 3313; AVX-NEXT: vandnps %ymm0, %ymm14, %ymm0 3314; AVX-NEXT: vandps %ymm14, %ymm10, %ymm10 3315; AVX-NEXT: vorps %ymm0, %ymm10, %ymm0 3316; AVX-NEXT: vpshufb %xmm8, %xmm6, %xmm8 3317; AVX-NEXT: vpshufb %xmm1, %xmm5, %xmm1 3318; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] 3319; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[5,11] 3320; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero 3321; AVX-NEXT: vpor %xmm8, %xmm10, %xmm8 3322; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm8, %xmm1 3323; AVX-NEXT: vandps %ymm0, %ymm11, %ymm0 3324; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 3325; AVX-NEXT: vandnps %ymm1, %ymm11, %ymm1 3326; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0 3327; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3328; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm9[4,10,u,u,u,u,u,u,u,u,u,u,u] 3329; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[2,8,14],zero,zero,xmm13[u,u,u,u,u,u,u,u,u,u,u] 3330; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 3331; AVX-NEXT: vmovddup {{.*#+}} xmm1 = 
[0,0,0,4,10,128,128,128,0,0,0,4,10,128,128,128] 3332; AVX-NEXT: # xmm1 = mem[0,0] 3333; AVX-NEXT: vpshufb %xmm1, %xmm7, %xmm10 3334; AVX-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,128,128,0,6,12,0,0,0,128,128,0,6,12] 3335; AVX-NEXT: # xmm11 = mem[0,0] 3336; AVX-NEXT: vpshufb %xmm11, %xmm12, %xmm14 3337; AVX-NEXT: vpor %xmm10, %xmm14, %xmm10 3338; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm10, %ymm0 3339; AVX-NEXT: vandnps %ymm0, %ymm3, %ymm0 3340; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload 3341; AVX-NEXT: vorps %ymm0, %ymm10, %ymm0 3342; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[2,8,14,u,u,u,u,u,u,u,u,u,u,u,u,u] 3343; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[u,u,u,u,u,0,6,12,u,u,u,u,u,u,u,u] 3344; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm14[0],xmm10[0] 3345; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm1 3346; AVX-NEXT: vpshufb %xmm11, %xmm4, %xmm11 3347; AVX-NEXT: vpor %xmm1, %xmm11, %xmm1 3348; AVX-NEXT: vpblendvb %xmm3, %xmm10, %xmm1, %xmm1 3349; AVX-NEXT: vmovaps {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] 3350; AVX-NEXT: vandps %ymm0, %ymm10, %ymm0 3351; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 3352; AVX-NEXT: vandnps %ymm1, %ymm10, %ymm1 3353; AVX-NEXT: vorps %ymm1, %ymm0, %ymm11 3354; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm9[5,11,u,u,u,u,u,u,u,u,u,u,u] 3355; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[3,9,15],zero,zero,xmm13[u,u,u,u,u,u,u,u,u,u,u] 3356; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 3357; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,0,5,11,128,128,128,0,0,0,5,11,128,128,128] 3358; AVX-NEXT: # xmm1 = mem[0,0] 3359; AVX-NEXT: vpshufb %xmm1, %xmm7, %xmm14 3360; AVX-NEXT: vmovddup {{.*#+}} xmm15 = [0,0,0,128,128,1,7,13,0,0,0,128,128,1,7,13] 3361; AVX-NEXT: # xmm15 = mem[0,0] 3362; AVX-NEXT: vpshufb %xmm15, %xmm12, %xmm8 3363; AVX-NEXT: vpor %xmm14, %xmm8, %xmm8 3364; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm8, %ymm0 3365; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[3,9,15,u,u,u,u,u,u,u,u,u,u,u,u,u] 3366; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[u,u,u,u,u,1,7,13,u,u,u,u,u,u,u,u] 3367; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm14[0],xmm8[0] 3368; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm1 3369; AVX-NEXT: vpshufb %xmm15, %xmm4, %xmm14 3370; AVX-NEXT: vpor %xmm1, %xmm14, %xmm1 3371; AVX-NEXT: vpblendvb %xmm3, %xmm8, %xmm1, %xmm1 3372; AVX-NEXT: vandnps %ymm0, %ymm3, %ymm0 3373; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 3374; AVX-NEXT: vorps %ymm0, %ymm3, %ymm0 3375; AVX-NEXT: vandps %ymm0, %ymm10, %ymm0 3376; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 3377; AVX-NEXT: vandnps %ymm1, %ymm10, %ymm1 3378; AVX-NEXT: vorps %ymm1, %ymm0, %ymm3 3379; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm13[4,10],zero,zero,zero,xmm13[u,u,u,u,u,u,u,u,u,u,u] 3380; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm9[0,6,12,u,u,u,u,u,u,u,u,u,u,u] 3381; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 3382; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,128,128,128,2,8,14,0,0,128,128,128,2,8,14] 3383; AVX-NEXT: # xmm1 = mem[0,0] 3384; AVX-NEXT: vpshufb %xmm1, %xmm12, %xmm8 3385; AVX-NEXT: vmovddup {{.*#+}} xmm14 = [0,0,0,6,12,128,128,128,0,0,0,6,12,128,128,128] 3386; AVX-NEXT: # xmm14 = mem[0,0] 3387; AVX-NEXT: vpshufb %xmm14, %xmm7, %xmm15 3388; AVX-NEXT: vpor %xmm8, %xmm15, %xmm8 3389; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm8, %ymm0 3390; AVX-NEXT: vmovaps {{.*#+}} ymm8 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] 3391; AVX-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, 
%ymm15 # 32-byte Folded Reload 3392; AVX-NEXT: vandps %ymm0, %ymm8, %ymm0 3393; AVX-NEXT: vorps %ymm0, %ymm15, %ymm0 3394; AVX-NEXT: vpshufb %xmm1, %xmm4, %xmm1 3395; AVX-NEXT: vpshufb %xmm14, %xmm2, %xmm14 3396; AVX-NEXT: vpor %xmm1, %xmm14, %xmm1 3397; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u] 3398; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14] 3399; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm15[1],xmm14[1] 3400; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0,1,2,3,4],xmm1[5,6,7] 3401; AVX-NEXT: vandps %ymm0, %ymm10, %ymm0 3402; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 3403; AVX-NEXT: vandnps %ymm1, %ymm10, %ymm1 3404; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0 3405; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[5,11],zero,zero,zero,xmm13[u,u,u,u,u,u,u,u,u,u,u] 3406; AVX-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[1,7,13,u,u,u,u,u,u,u,u,u,u,u] 3407; AVX-NEXT: vpor %xmm1, %xmm9, %xmm1 3408; AVX-NEXT: vmovddup {{.*#+}} xmm9 = [0,0,128,128,128,3,9,15,0,0,128,128,128,3,9,15] 3409; AVX-NEXT: # xmm9 = mem[0,0] 3410; AVX-NEXT: vpshufb %xmm9, %xmm12, %xmm12 3411; AVX-NEXT: vmovddup {{.*#+}} xmm13 = [0,0,1,7,13,128,128,128,0,0,1,7,13,128,128,128] 3412; AVX-NEXT: # xmm13 = mem[0,0] 3413; AVX-NEXT: vpshufb %xmm13, %xmm7, %xmm7 3414; AVX-NEXT: vpor %xmm7, %xmm12, %xmm7 3415; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1 3416; AVX-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm7 # 32-byte Folded Reload 3417; AVX-NEXT: vandps %ymm1, %ymm8, %ymm1 3418; AVX-NEXT: vorps %ymm7, %ymm1, %ymm1 3419; AVX-NEXT: vpshufb %xmm9, %xmm4, %xmm4 3420; AVX-NEXT: vpshufb %xmm13, %xmm2, %xmm2 3421; AVX-NEXT: vpor %xmm4, %xmm2, %xmm2 3422; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,u,u,5,11,u,u,u,u,u,u] 3423; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,3,9,15] 3424; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm5[1],xmm4[1] 3425; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3,4],xmm2[5,6,7] 3426; AVX-NEXT: vandps %ymm1, %ymm10, %ymm1 3427; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 3428; AVX-NEXT: vandnps %ymm2, %ymm10, %ymm2 3429; AVX-NEXT: vorps %ymm2, %ymm1, %ymm1 3430; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 3431; AVX-NEXT: vmovaps %ymm2, (%rsi) 3432; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 3433; AVX-NEXT: vmovaps %ymm2, (%rdx) 3434; AVX-NEXT: vmovaps %ymm11, (%rcx) 3435; AVX-NEXT: vmovaps %ymm3, (%r8) 3436; AVX-NEXT: vmovaps %ymm0, (%r9) 3437; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 3438; AVX-NEXT: vmovaps %ymm1, (%rax) 3439; AVX-NEXT: addq $120, %rsp 3440; AVX-NEXT: vzeroupper 3441; AVX-NEXT: retq 3442; 3443; AVX2-LABEL: load_i8_stride6_vf32: 3444; AVX2: # %bb.0: 3445; AVX2-NEXT: vmovdqa 160(%rdi), %ymm4 3446; AVX2-NEXT: vmovdqa (%rdi), %ymm2 3447; AVX2-NEXT: vmovdqa 32(%rdi), %ymm3 3448; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0 3449; AVX2-NEXT: vmovdqa 96(%rdi), %ymm1 3450; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] 3451; AVX2-NEXT: vpblendvb %ymm8, %ymm2, %ymm3, %ymm9 3452; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm9[0,6,12],zero,zero,zero,xmm9[4,10],zero,zero,zero,xmm9[u,u,u,u,u] 3453; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm10 3454; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm10[2,8,14],zero,zero,xmm10[0,6,12,u,u,u,u,u] 3455; AVX2-NEXT: vpor %xmm5, %xmm6, %xmm11 3456; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,0,65535,0,0,65535,65535,0,0,65535,0,0,65535,0] 3457; AVX2-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm0[0,1],ymm1[0,1] 3458; 
AVX2-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm1[2,3] 3459; AVX2-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm1 3460; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] 3461; AVX2-NEXT: vpmovsxdq {{.*#+}} xmm7 = [18446744073709551615,16777215] 3462; AVX2-NEXT: vpblendvb %ymm7, %ymm11, %ymm0, %ymm0 3463; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[1,7,13],zero,zero,zero,xmm9[5,11],zero,zero,zero,xmm9[u,u,u,u,u] 3464; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[3,9,15],zero,zero,xmm10[1,7,13,u,u,u,u,u] 3465; AVX2-NEXT: vpor %xmm9, %xmm10, %xmm9 3466; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] 3467; AVX2-NEXT: vpblendvb %ymm7, %ymm9, %ymm1, %ymm1 3468; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] 3469; AVX2-NEXT: vpblendvb %ymm11, %ymm3, %ymm2, %ymm9 3470; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm10 3471; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm10[4,10],zero,zero,zero,xmm10[2,8,14,u,u,u,u,u] 3472; AVX2-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[2,8,14],zero,zero,xmm9[0,6,12],zero,zero,zero,xmm9[u,u,u,u,u] 3473; AVX2-NEXT: vpor %xmm12, %xmm13, %xmm12 3474; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0] 3475; AVX2-NEXT: vpblendvb %ymm13, %ymm6, %ymm5, %ymm13 3476; AVX2-NEXT: vpshufb {{.*#+}} ymm14 = ymm13[u,u,u,u,u,u,u,u,u,u,u,4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u] 3477; AVX2-NEXT: vpblendvb %ymm7, %ymm12, %ymm14, %ymm12 3478; AVX2-NEXT: vmovdqa 128(%rdi), %ymm14 3479; AVX2-NEXT: vpblendvb %ymm8, %ymm14, %ymm4, %ymm8 3480; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[5,11],zero,zero,zero,xmm10[3,9,15,u,u,u,u,u] 3481; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[3,9,15],zero,zero,xmm9[1,7,13],zero,zero,zero,xmm9[u,u,u,u,u] 3482; AVX2-NEXT: vpor %xmm10, %xmm9, %xmm9 3483; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,0,6,12],zero,zero,zero,xmm8[4,10],zero,zero,zero 3484; AVX2-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u] 3485; AVX2-NEXT: vpblendvb %ymm7, %ymm9, %ymm13, %ymm13 3486; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm9 3487; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u],zero,zero,zero,xmm9[2,8,14],zero,zero,xmm9[0,6,12] 3488; AVX2-NEXT: vpor %xmm7, %xmm10, %xmm7 3489; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 3490; AVX2-NEXT: vpmovsxwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10 3491; AVX2-NEXT: vpblendvb %ymm10, %ymm12, %ymm7, %ymm7 3492; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] 3493; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,1,7,13],zero,zero,zero,xmm8[5,11],zero,zero,zero 3494; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u],zero,zero,zero,xmm9[3,9,15],zero,zero,xmm9[1,7,13] 3495; AVX2-NEXT: vpor %xmm8, %xmm9, %xmm8 3496; AVX2-NEXT: vpblendvb %ymm12, %ymm4, %ymm14, %ymm9 3497; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 3498; AVX2-NEXT: vpblendvb %ymm10, %ymm13, %ymm8, %ymm8 3499; AVX2-NEXT: vpblendvb %ymm11, %ymm4, %ymm14, %ymm4 3500; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm11 3501; AVX2-NEXT: vpshufb {{.*#+}} xmm13 = xmm11[u,u,u,u,u],zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14] 3502; AVX2-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u,2,8,14],zero,zero,xmm4[0,6,12],zero,zero,zero 3503; AVX2-NEXT: vpor %xmm13, %xmm14, %xmm13 3504; AVX2-NEXT: vpblendvb %ymm12, %ymm3, %ymm2, %ymm2 3505; 
AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 3506; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,65535,65535,0,0,65535,0,0,65535,0] 3507; AVX2-NEXT: vpblendvb %ymm12, %ymm6, %ymm5, %ymm5 3508; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10,u,u,u,u,u,u] 3509; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[4,10],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[u,u,u,u,u,u] 3510; AVX2-NEXT: vpor %xmm6, %xmm12, %xmm6 3511; AVX2-NEXT: vpshufb {{.*#+}} ymm12 = ymm5[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] 3512; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm12[5,6,7] 3513; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] 3514; AVX2-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm12 3515; AVX2-NEXT: vpblendvb %ymm10, %ymm6, %ymm12, %ymm6 3516; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u],zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15] 3517; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,3,9,15],zero,zero,xmm4[1,7,13],zero,zero,zero 3518; AVX2-NEXT: vpor %xmm4, %xmm11, %xmm4 3519; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11,u,u,u,u,u,u] 3520; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,11],zero,zero,zero,xmm2[3,9,15],zero,zero,xmm2[u,u,u,u,u,u] 3521; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 3522; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] 3523; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] 3524; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] 3525; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm3 3526; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 3527; AVX2-NEXT: vpblendvb %ymm10, %ymm2, %ymm4, %ymm2 3528; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u],zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10] 3529; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm9[u,u,u,u,u,u,4,10],zero,zero,zero,xmm9[2,8,14],zero,zero 3530; AVX2-NEXT: vpor %xmm4, %xmm5, %xmm4 3531; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 3532; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0,1,2],ymm4[3,4,5,6,7],ymm0[8,9,10],ymm4[11,12,13,14,15] 3533; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] 3534; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11] 3535; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[u,u,u,u,u,u,5,11],zero,zero,zero,xmm9[3,9,15],zero,zero 3536; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3 3537; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 3538; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15] 3539; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] 3540; AVX2-NEXT: vmovdqa %ymm0, (%rsi) 3541; AVX2-NEXT: vmovdqa %ymm1, (%rdx) 3542; AVX2-NEXT: vmovdqa %ymm7, (%rcx) 3543; AVX2-NEXT: vmovdqa %ymm8, (%r8) 3544; AVX2-NEXT: vmovdqa %ymm6, (%r9) 3545; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 3546; AVX2-NEXT: vmovdqa %ymm2, (%rax) 3547; AVX2-NEXT: vzeroupper 3548; AVX2-NEXT: retq 3549; 3550; AVX2-FP-LABEL: load_i8_stride6_vf32: 3551; AVX2-FP: # %bb.0: 3552; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm4 3553; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm2 3554; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm3 3555; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm0 3556; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm1 3557; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] 3558; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm2, %ymm3, %ymm9 3559; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = 
xmm9[0,6,12],zero,zero,zero,xmm9[4,10],zero,zero,zero,xmm9[u,u,u,u,u] 3560; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm10 3561; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm10[2,8,14],zero,zero,xmm10[0,6,12,u,u,u,u,u] 3562; AVX2-FP-NEXT: vpor %xmm5, %xmm6, %xmm11 3563; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,0,65535,0,0,65535,65535,0,0,65535,0,0,65535,0] 3564; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm0[0,1],ymm1[0,1] 3565; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm1[2,3] 3566; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm1 3567; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] 3568; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} xmm7 = [18446744073709551615,16777215] 3569; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm11, %ymm0, %ymm0 3570; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[1,7,13],zero,zero,zero,xmm9[5,11],zero,zero,zero,xmm9[u,u,u,u,u] 3571; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[3,9,15],zero,zero,xmm10[1,7,13,u,u,u,u,u] 3572; AVX2-FP-NEXT: vpor %xmm9, %xmm10, %xmm9 3573; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] 3574; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm9, %ymm1, %ymm1 3575; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] 3576; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm3, %ymm2, %ymm9 3577; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm10 3578; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm10[4,10],zero,zero,zero,xmm10[2,8,14,u,u,u,u,u] 3579; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[2,8,14],zero,zero,xmm9[0,6,12],zero,zero,zero,xmm9[u,u,u,u,u] 3580; AVX2-FP-NEXT: vpor %xmm12, %xmm13, %xmm12 3581; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0] 3582; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm6, %ymm5, %ymm13 3583; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm14 = ymm13[u,u,u,u,u,u,u,u,u,u,u,4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u] 3584; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm12, %ymm14, %ymm12 3585; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm14 3586; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm14, %ymm4, %ymm8 3587; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[5,11],zero,zero,zero,xmm10[3,9,15,u,u,u,u,u] 3588; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[3,9,15],zero,zero,xmm9[1,7,13],zero,zero,zero,xmm9[u,u,u,u,u] 3589; AVX2-FP-NEXT: vpor %xmm10, %xmm9, %xmm9 3590; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,0,6,12],zero,zero,zero,xmm8[4,10],zero,zero,zero 3591; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u] 3592; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm9, %ymm13, %ymm13 3593; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm9 3594; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u],zero,zero,zero,xmm9[2,8,14],zero,zero,xmm9[0,6,12] 3595; AVX2-FP-NEXT: vpor %xmm7, %xmm10, %xmm7 3596; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 3597; AVX2-FP-NEXT: vpmovsxwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10 3598; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm12, %ymm7, %ymm7 3599; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] 3600; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,1,7,13],zero,zero,zero,xmm8[5,11],zero,zero,zero 3601; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u],zero,zero,zero,xmm9[3,9,15],zero,zero,xmm9[1,7,13] 3602; AVX2-FP-NEXT: vpor %xmm8, %xmm9, %xmm8 3603; AVX2-FP-NEXT: vpblendvb 
%ymm12, %ymm4, %ymm14, %ymm9 3604; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 3605; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm13, %ymm8, %ymm8 3606; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm4, %ymm14, %ymm4 3607; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm11 3608; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm13 = xmm11[u,u,u,u,u],zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14] 3609; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u,2,8,14],zero,zero,xmm4[0,6,12],zero,zero,zero 3610; AVX2-FP-NEXT: vpor %xmm13, %xmm14, %xmm13 3611; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm3, %ymm2, %ymm2 3612; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 3613; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,65535,65535,0,0,65535,0,0,65535,0] 3614; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm6, %ymm5, %ymm5 3615; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10,u,u,u,u,u,u] 3616; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[4,10],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[u,u,u,u,u,u] 3617; AVX2-FP-NEXT: vpor %xmm6, %xmm12, %xmm6 3618; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm12 = ymm5[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] 3619; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm12[5,6,7] 3620; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] 3621; AVX2-FP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm12 3622; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm6, %ymm12, %ymm6 3623; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u],zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15] 3624; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,3,9,15],zero,zero,xmm4[1,7,13],zero,zero,zero 3625; AVX2-FP-NEXT: vpor %xmm4, %xmm11, %xmm4 3626; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11,u,u,u,u,u,u] 3627; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,11],zero,zero,zero,xmm2[3,9,15],zero,zero,xmm2[u,u,u,u,u,u] 3628; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2 3629; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] 3630; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] 3631; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] 3632; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm3 3633; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 3634; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm2, %ymm4, %ymm2 3635; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u],zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10] 3636; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm9[u,u,u,u,u,u,4,10],zero,zero,zero,xmm9[2,8,14],zero,zero 3637; AVX2-FP-NEXT: vpor %xmm4, %xmm5, %xmm4 3638; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 3639; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0,1,2],ymm4[3,4,5,6,7],ymm0[8,9,10],ymm4[11,12,13,14,15] 3640; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] 3641; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11] 3642; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[u,u,u,u,u,u,5,11],zero,zero,zero,xmm9[3,9,15],zero,zero 3643; AVX2-FP-NEXT: vpor %xmm3, %xmm4, %xmm3 3644; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 3645; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15] 3646; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] 3647; AVX2-FP-NEXT: vmovdqa %ymm0, (%rsi) 3648; AVX2-FP-NEXT: vmovdqa %ymm1, (%rdx) 3649; AVX2-FP-NEXT: vmovdqa %ymm7, (%rcx) 3650; AVX2-FP-NEXT: 
vmovdqa %ymm8, (%r8) 3651; AVX2-FP-NEXT: vmovdqa %ymm6, (%r9) 3652; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 3653; AVX2-FP-NEXT: vmovdqa %ymm2, (%rax) 3654; AVX2-FP-NEXT: vzeroupper 3655; AVX2-FP-NEXT: retq 3656; 3657; AVX2-FCP-LABEL: load_i8_stride6_vf32: 3658; AVX2-FCP: # %bb.0: 3659; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm4 3660; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm2 3661; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 3662; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 3663; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 3664; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] 3665; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm2, %ymm3, %ymm9 3666; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm9[0,6,12],zero,zero,zero,xmm9[4,10],zero,zero,zero,xmm9[u,u,u,u,u] 3667; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 3668; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm10[2,8,14],zero,zero,xmm10[0,6,12,u,u,u,u,u] 3669; AVX2-FCP-NEXT: vpor %xmm5, %xmm6, %xmm11 3670; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,0,65535,0,0,65535,65535,0,0,65535,0,0,65535,0] 3671; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm0[0,1],ymm1[0,1] 3672; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm1[2,3] 3673; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm1 3674; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] 3675; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} xmm7 = [18446744073709551615,16777215] 3676; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm11, %ymm0, %ymm0 3677; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[1,7,13],zero,zero,zero,xmm9[5,11],zero,zero,zero,xmm9[u,u,u,u,u] 3678; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[3,9,15],zero,zero,xmm10[1,7,13,u,u,u,u,u] 3679; AVX2-FCP-NEXT: vpor %xmm9, %xmm10, %xmm9 3680; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] 3681; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm9, %ymm1, %ymm1 3682; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] 3683; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm3, %ymm2, %ymm9 3684; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 3685; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm10[4,10],zero,zero,zero,xmm10[2,8,14,u,u,u,u,u] 3686; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[2,8,14],zero,zero,xmm9[0,6,12],zero,zero,zero,xmm9[u,u,u,u,u] 3687; AVX2-FCP-NEXT: vpor %xmm12, %xmm13, %xmm12 3688; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0] 3689; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm6, %ymm5, %ymm13 3690; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm13[u,u,u,u,u,u,u,u,u,u,u,4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u] 3691; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm12, %ymm14, %ymm12 3692; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm14 3693; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm14, %ymm4, %ymm8 3694; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[5,11],zero,zero,zero,xmm10[3,9,15,u,u,u,u,u] 3695; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[3,9,15],zero,zero,xmm9[1,7,13],zero,zero,zero,xmm9[u,u,u,u,u] 3696; AVX2-FCP-NEXT: vpor %xmm10, %xmm9, %xmm9 3697; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,0,6,12],zero,zero,zero,xmm8[4,10],zero,zero,zero 3698; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u] 3699; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm9, %ymm13, %ymm13 3700; AVX2-FCP-NEXT: vextracti128 $1, 
%ymm8, %xmm9 3701; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u],zero,zero,zero,xmm9[2,8,14],zero,zero,xmm9[0,6,12] 3702; AVX2-FCP-NEXT: vpor %xmm7, %xmm10, %xmm7 3703; AVX2-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 3704; AVX2-FCP-NEXT: vpmovsxwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10 3705; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm12, %ymm7, %ymm7 3706; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] 3707; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,1,7,13],zero,zero,zero,xmm8[5,11],zero,zero,zero 3708; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u],zero,zero,zero,xmm9[3,9,15],zero,zero,xmm9[1,7,13] 3709; AVX2-FCP-NEXT: vpor %xmm8, %xmm9, %xmm8 3710; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm4, %ymm14, %ymm9 3711; AVX2-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 3712; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm13, %ymm8, %ymm8 3713; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm4, %ymm14, %ymm4 3714; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm11 3715; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm11[u,u,u,u,u],zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14] 3716; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u,2,8,14],zero,zero,xmm4[0,6,12],zero,zero,zero 3717; AVX2-FCP-NEXT: vpor %xmm13, %xmm14, %xmm13 3718; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm3, %ymm2, %ymm2 3719; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 3720; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,65535,65535,0,0,65535,0,0,65535,0] 3721; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm6, %ymm5, %ymm5 3722; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10,u,u,u,u,u,u] 3723; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[4,10],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[u,u,u,u,u,u] 3724; AVX2-FCP-NEXT: vpor %xmm6, %xmm12, %xmm6 3725; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm5[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] 3726; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm12[5,6,7] 3727; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] 3728; AVX2-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm12 3729; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm6, %ymm12, %ymm6 3730; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u],zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15] 3731; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,3,9,15],zero,zero,xmm4[1,7,13],zero,zero,zero 3732; AVX2-FCP-NEXT: vpor %xmm4, %xmm11, %xmm4 3733; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11,u,u,u,u,u,u] 3734; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,11],zero,zero,zero,xmm2[3,9,15],zero,zero,xmm2[u,u,u,u,u,u] 3735; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 3736; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] 3737; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] 3738; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] 3739; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm3 3740; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 3741; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm2, %ymm4, %ymm2 3742; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u],zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10] 3743; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm9[u,u,u,u,u,u,4,10],zero,zero,zero,xmm9[2,8,14],zero,zero 3744; AVX2-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 3745; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 3746; AVX2-FCP-NEXT: vpblendw 
{{.*#+}} ymm4 = ymm0[0,1,2],ymm4[3,4,5,6,7],ymm0[8,9,10],ymm4[11,12,13,14,15] 3747; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] 3748; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11] 3749; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[u,u,u,u,u,u,5,11],zero,zero,zero,xmm9[3,9,15],zero,zero 3750; AVX2-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 3751; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 3752; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15] 3753; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] 3754; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rsi) 3755; AVX2-FCP-NEXT: vmovdqa %ymm1, (%rdx) 3756; AVX2-FCP-NEXT: vmovdqa %ymm7, (%rcx) 3757; AVX2-FCP-NEXT: vmovdqa %ymm8, (%r8) 3758; AVX2-FCP-NEXT: vmovdqa %ymm6, (%r9) 3759; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 3760; AVX2-FCP-NEXT: vmovdqa %ymm2, (%rax) 3761; AVX2-FCP-NEXT: vzeroupper 3762; AVX2-FCP-NEXT: retq 3763; 3764; AVX512-LABEL: load_i8_stride6_vf32: 3765; AVX512: # %bb.0: 3766; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 3767; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] 3768; AVX512-NEXT: vmovdqa64 (%rdi), %ymm17 3769; AVX512-NEXT: vmovdqa 32(%rdi), %ymm3 3770; AVX512-NEXT: vmovdqa 64(%rdi), %ymm1 3771; AVX512-NEXT: vmovdqa 128(%rdi), %ymm6 3772; AVX512-NEXT: vmovdqa %ymm0, %ymm7 3773; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm3 ^ (ymm7 & (ymm17 ^ ymm3)) 3774; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[0,6,12],zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[u,u,u,u,u] 3775; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm8 3776; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm8[2,8,14],zero,zero,xmm8[0,6,12,u,u,u,u,u] 3777; AVX512-NEXT: vpor %xmm4, %xmm5, %xmm4 3778; AVX512-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[2,3],mem[2,3] 3779; AVX512-NEXT: vinserti128 $1, 96(%rdi), %ymm1, %ymm1 3780; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0] 3781; AVX512-NEXT: vmovdqa %ymm9, %ymm10 3782; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm5 ^ (ymm10 & (ymm1 ^ ymm5)) 3783; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 3784; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] 3785; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm4 & ymm16) 3786; AVX512-NEXT: vmovdqa 160(%rdi), %ymm13 3787; AVX512-NEXT: vmovdqa %ymm0, %ymm14 3788; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = ymm6 ^ (ymm14 & (ymm13 ^ ymm6)) 3789; AVX512-NEXT: vextracti128 $1, %ymm14, %xmm15 3790; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[0,6,12],zero,zero,zero,xmm15[4,10] 3791; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm14[u,u,u,u,u,u,4,10],zero,zero,zero,xmm14[2,8,14],zero,zero 3792; AVX512-NEXT: vpor %xmm4, %xmm12, %xmm4 3793; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 3794; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0,1,2],ymm4[3,4,5,6,7],ymm11[8,9,10],ymm4[11,12,13,14,15] 3795; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm4[4,5,6,7] 3796; AVX512-NEXT: vmovdqa64 %ymm2, %ymm18 3797; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[1,7,13],zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[u,u,u,u,u] 3798; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = 
zero,zero,zero,xmm8[3,9,15],zero,zero,xmm8[1,7,13,u,u,u,u,u] 3799; AVX512-NEXT: vpor %xmm7, %xmm8, %xmm7 3800; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 3801; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm7 & ymm16) 3802; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[1,7,13],zero,zero,zero,xmm15[5,11] 3803; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm14[u,u,u,u,u,u,5,11],zero,zero,zero,xmm14[3,9,15],zero,zero 3804; AVX512-NEXT: vpor %xmm7, %xmm10, %xmm7 3805; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 3806; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7],ymm8[8,9,10],ymm7[11,12,13,14,15] 3807; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] 3808; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] 3809; AVX512-NEXT: vmovdqa %ymm8, %ymm10 3810; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm17 ^ (ymm10 & (ymm3 ^ ymm17)) 3811; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm11 3812; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14,u,u,u,u,u] 3813; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[2,8,14],zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[u,u,u,u,u] 3814; AVX512-NEXT: vpor %xmm12, %xmm14, %xmm12 3815; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm1 ^ (ymm9 & (ymm5 ^ ymm1)) 3816; AVX512-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u] 3817; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] 3818; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm12 & ymm16) 3819; AVX512-NEXT: vmovdqa %ymm0, %ymm12 3820; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm13 ^ (ymm12 & (ymm6 ^ ymm13)) 3821; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,0,6,12],zero,zero,zero,xmm12[4,10],zero,zero,zero 3822; AVX512-NEXT: vextracti128 $1, %ymm12, %xmm2 3823; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[0,6,12] 3824; AVX512-NEXT: vpor %xmm4, %xmm15, %xmm4 3825; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 3826; AVX512-NEXT: vpmovsxwd {{.*#+}} ymm15 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0] 3827; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm15 & (ymm4 ^ ymm14)) 3828; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15,u,u,u,u,u] 3829; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[3,9,15],zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[u,u,u,u,u] 3830; AVX512-NEXT: vpor %xmm11, %xmm10, %xmm10 3831; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u] 3832; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm10 & ymm16) 3833; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,1,7,13],zero,zero,zero,xmm12[5,11],zero,zero,zero 3834; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[3,9,15],zero,zero,xmm2[1,7,13] 3835; AVX512-NEXT: vpor %xmm2, %xmm10, %xmm2 3836; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 3837; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm15 & (ymm2 ^ ymm9)) 3838; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm6 ^ (ymm8 & (ymm13 ^ ymm6)) 3839; AVX512-NEXT: 
vextracti128 $1, %ymm8, %xmm6 3840; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14] 3841; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,2,8,14],zero,zero,xmm8[0,6,12],zero,zero,zero 3842; AVX512-NEXT: vpor %xmm9, %xmm10, %xmm9 3843; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 3844; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm17 ^ (ymm0 & (ymm3 ^ ymm17)) 3845; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3 3846; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10,u,u,u,u,u,u] 3847; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14],zero,zero,xmm0[u,u,u,u,u,u] 3848; AVX512-NEXT: vpor %xmm10, %xmm11, %xmm10 3849; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm5)) 3850; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] 3851; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm5[5,6,7] 3852; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] 3853; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm9 ^ (ymm15 & (ymm5 ^ ymm9)) 3854; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[5,11],zero,zero,zero,xmm6[3,9,15] 3855; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,3,9,15],zero,zero,xmm8[1,7,13],zero,zero,zero 3856; AVX512-NEXT: vpor %xmm6, %xmm8, %xmm6 3857; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 3858; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11,u,u,u,u,u,u] 3859; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15],zero,zero,xmm0[u,u,u,u,u,u] 3860; AVX512-NEXT: vpor %xmm3, %xmm0, %xmm0 3861; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] 3862; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] 3863; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 3864; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm6 ^ (ymm15 & (ymm0 ^ ymm6)) 3865; AVX512-NEXT: vmovdqa64 %ymm18, (%rsi) 3866; AVX512-NEXT: vmovdqa %ymm7, (%rdx) 3867; AVX512-NEXT: vmovdqa %ymm4, (%rcx) 3868; AVX512-NEXT: vmovdqa %ymm2, (%r8) 3869; AVX512-NEXT: vmovdqa %ymm5, (%r9) 3870; AVX512-NEXT: vmovdqa %ymm0, (%rax) 3871; AVX512-NEXT: vzeroupper 3872; AVX512-NEXT: retq 3873; 3874; AVX512-FCP-LABEL: load_i8_stride6_vf32: 3875; AVX512-FCP: # %bb.0: 3876; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 3877; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] 3878; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm17 3879; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 3880; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm1 3881; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 3882; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm7 3883; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm3 ^ (ymm7 & (ymm17 ^ ymm3)) 3884; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[0,6,12],zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[u,u,u,u,u] 3885; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 3886; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm8[2,8,14],zero,zero,xmm8[0,6,12,u,u,u,u,u] 3887; AVX512-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 3888; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[2,3],mem[2,3] 3889; AVX512-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm1, %ymm1 3890; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0] 3891; AVX512-FCP-NEXT: 
vmovdqa %ymm9, %ymm10 3892; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm5 ^ (ymm10 & (ymm1 ^ ymm5)) 3893; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 3894; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] 3895; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm4 & ymm16) 3896; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm13 3897; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm14 3898; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm6 ^ (ymm14 & (ymm13 ^ ymm6)) 3899; AVX512-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 3900; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[0,6,12],zero,zero,zero,xmm15[4,10] 3901; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm14[u,u,u,u,u,u,4,10],zero,zero,zero,xmm14[2,8,14],zero,zero 3902; AVX512-FCP-NEXT: vpor %xmm4, %xmm12, %xmm4 3903; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 3904; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0,1,2],ymm4[3,4,5,6,7],ymm11[8,9,10],ymm4[11,12,13,14,15] 3905; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm4[4,5,6,7] 3906; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm18 3907; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[1,7,13],zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[u,u,u,u,u] 3908; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[3,9,15],zero,zero,xmm8[1,7,13,u,u,u,u,u] 3909; AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 3910; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 3911; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm7 & ymm16) 3912; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[1,7,13],zero,zero,zero,xmm15[5,11] 3913; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm14[u,u,u,u,u,u,5,11],zero,zero,zero,xmm14[3,9,15],zero,zero 3914; AVX512-FCP-NEXT: vpor %xmm7, %xmm10, %xmm7 3915; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 3916; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7],ymm8[8,9,10],ymm7[11,12,13,14,15] 3917; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] 3918; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] 3919; AVX512-FCP-NEXT: vmovdqa %ymm8, %ymm10 3920; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm17 ^ (ymm10 & (ymm3 ^ ymm17)) 3921; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11 3922; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14,u,u,u,u,u] 3923; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[2,8,14],zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[u,u,u,u,u] 3924; AVX512-FCP-NEXT: vpor %xmm12, %xmm14, %xmm12 3925; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm1 ^ (ymm9 & (ymm5 ^ ymm1)) 3926; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u] 3927; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] 3928; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm12 & ymm16) 3929; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm12 3930; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm13 ^ 
(ymm12 & (ymm6 ^ ymm13)) 3931; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,0,6,12],zero,zero,zero,xmm12[4,10],zero,zero,zero 3932; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm2 3933; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[0,6,12] 3934; AVX512-FCP-NEXT: vpor %xmm4, %xmm15, %xmm4 3935; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 3936; AVX512-FCP-NEXT: vpmovsxwd {{.*#+}} ymm15 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0] 3937; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm15 & (ymm4 ^ ymm14)) 3938; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15,u,u,u,u,u] 3939; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[3,9,15],zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[u,u,u,u,u] 3940; AVX512-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10 3941; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u] 3942; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm10 & ymm16) 3943; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,1,7,13],zero,zero,zero,xmm12[5,11],zero,zero,zero 3944; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[3,9,15],zero,zero,xmm2[1,7,13] 3945; AVX512-FCP-NEXT: vpor %xmm2, %xmm10, %xmm2 3946; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 3947; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm15 & (ymm2 ^ ymm9)) 3948; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm6 ^ (ymm8 & (ymm13 ^ ymm6)) 3949; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm6 3950; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14] 3951; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,2,8,14],zero,zero,xmm8[0,6,12],zero,zero,zero 3952; AVX512-FCP-NEXT: vpor %xmm9, %xmm10, %xmm9 3953; AVX512-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 3954; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm17 ^ (ymm0 & (ymm3 ^ ymm17)) 3955; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3 3956; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10,u,u,u,u,u,u] 3957; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14],zero,zero,xmm0[u,u,u,u,u,u] 3958; AVX512-FCP-NEXT: vpor %xmm10, %xmm11, %xmm10 3959; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm5)) 3960; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] 3961; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm5[5,6,7] 3962; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] 3963; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm9 ^ (ymm15 & (ymm5 ^ ymm9)) 3964; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[5,11],zero,zero,zero,xmm6[3,9,15] 3965; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,3,9,15],zero,zero,xmm8[1,7,13],zero,zero,zero 3966; AVX512-FCP-NEXT: vpor %xmm6, %xmm8, %xmm6 3967; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 3968; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11,u,u,u,u,u,u] 3969; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15],zero,zero,xmm0[u,u,u,u,u,u] 3970; AVX512-FCP-NEXT: vpor %xmm3, %xmm0, %xmm0 3971; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = 
ymm1[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] 3972; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] 3973; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 3974; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm6 ^ (ymm15 & (ymm0 ^ ymm6)) 3975; AVX512-FCP-NEXT: vmovdqa64 %ymm18, (%rsi) 3976; AVX512-FCP-NEXT: vmovdqa %ymm7, (%rdx) 3977; AVX512-FCP-NEXT: vmovdqa %ymm4, (%rcx) 3978; AVX512-FCP-NEXT: vmovdqa %ymm2, (%r8) 3979; AVX512-FCP-NEXT: vmovdqa %ymm5, (%r9) 3980; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rax) 3981; AVX512-FCP-NEXT: vzeroupper 3982; AVX512-FCP-NEXT: retq 3983; 3984; AVX512DQ-LABEL: load_i8_stride6_vf32: 3985; AVX512DQ: # %bb.0: 3986; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 3987; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] 3988; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm17 3989; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm3 3990; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm1 3991; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm6 3992; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm7 3993; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ymm3 ^ (ymm7 & (ymm17 ^ ymm3)) 3994; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[0,6,12],zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[u,u,u,u,u] 3995; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm8 3996; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm8[2,8,14],zero,zero,xmm8[0,6,12,u,u,u,u,u] 3997; AVX512DQ-NEXT: vpor %xmm4, %xmm5, %xmm4 3998; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[2,3],mem[2,3] 3999; AVX512DQ-NEXT: vinserti128 $1, 96(%rdi), %ymm1, %ymm1 4000; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0] 4001; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm10 4002; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm5 ^ (ymm10 & (ymm1 ^ ymm5)) 4003; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 4004; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] 4005; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm4 & ymm16) 4006; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm13 4007; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm14 4008; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = ymm6 ^ (ymm14 & (ymm13 ^ ymm6)) 4009; AVX512DQ-NEXT: vextracti128 $1, %ymm14, %xmm15 4010; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[0,6,12],zero,zero,zero,xmm15[4,10] 4011; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm14[u,u,u,u,u,u,4,10],zero,zero,zero,xmm14[2,8,14],zero,zero 4012; AVX512DQ-NEXT: vpor %xmm4, %xmm12, %xmm4 4013; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 4014; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0,1,2],ymm4[3,4,5,6,7],ymm11[8,9,10],ymm4[11,12,13,14,15] 4015; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm4[4,5,6,7] 4016; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm18 4017; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[1,7,13],zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[u,u,u,u,u] 4018; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[3,9,15],zero,zero,xmm8[1,7,13,u,u,u,u,u] 4019; AVX512DQ-NEXT: vpor %xmm7, %xmm8, %xmm7 4020; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 4021; 
AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm7 & ymm16) 4022; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[1,7,13],zero,zero,zero,xmm15[5,11] 4023; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm14[u,u,u,u,u,u,5,11],zero,zero,zero,xmm14[3,9,15],zero,zero 4024; AVX512DQ-NEXT: vpor %xmm7, %xmm10, %xmm7 4025; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 4026; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7],ymm8[8,9,10],ymm7[11,12,13,14,15] 4027; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] 4028; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] 4029; AVX512DQ-NEXT: vmovdqa %ymm8, %ymm10 4030; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm17 ^ (ymm10 & (ymm3 ^ ymm17)) 4031; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm11 4032; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14,u,u,u,u,u] 4033; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[2,8,14],zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[u,u,u,u,u] 4034; AVX512DQ-NEXT: vpor %xmm12, %xmm14, %xmm12 4035; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm1 ^ (ymm9 & (ymm5 ^ ymm1)) 4036; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u] 4037; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] 4038; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm12 & ymm16) 4039; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm12 4040; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm13 ^ (ymm12 & (ymm6 ^ ymm13)) 4041; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,0,6,12],zero,zero,zero,xmm12[4,10],zero,zero,zero 4042; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm2 4043; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[0,6,12] 4044; AVX512DQ-NEXT: vpor %xmm4, %xmm15, %xmm4 4045; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 4046; AVX512DQ-NEXT: vpmovsxwd {{.*#+}} ymm15 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0] 4047; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm15 & (ymm4 ^ ymm14)) 4048; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15,u,u,u,u,u] 4049; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[3,9,15],zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[u,u,u,u,u] 4050; AVX512DQ-NEXT: vpor %xmm11, %xmm10, %xmm10 4051; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u] 4052; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm10 & ymm16) 4053; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,1,7,13],zero,zero,zero,xmm12[5,11],zero,zero,zero 4054; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[3,9,15],zero,zero,xmm2[1,7,13] 4055; AVX512DQ-NEXT: vpor %xmm2, %xmm10, %xmm2 4056; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 4057; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm15 & (ymm2 ^ ymm9)) 4058; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm6 ^ (ymm8 & (ymm13 ^ ymm6)) 4059; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm6 4060; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14] 4061; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = 
xmm8[u,u,u,u,u,2,8,14],zero,zero,xmm8[0,6,12],zero,zero,zero 4062; AVX512DQ-NEXT: vpor %xmm9, %xmm10, %xmm9 4063; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 4064; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm17 ^ (ymm0 & (ymm3 ^ ymm17)) 4065; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm3 4066; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10,u,u,u,u,u,u] 4067; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14],zero,zero,xmm0[u,u,u,u,u,u] 4068; AVX512DQ-NEXT: vpor %xmm10, %xmm11, %xmm10 4069; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm5)) 4070; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] 4071; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm5[5,6,7] 4072; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] 4073; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm9 ^ (ymm15 & (ymm5 ^ ymm9)) 4074; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[5,11],zero,zero,zero,xmm6[3,9,15] 4075; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,3,9,15],zero,zero,xmm8[1,7,13],zero,zero,zero 4076; AVX512DQ-NEXT: vpor %xmm6, %xmm8, %xmm6 4077; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 4078; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11,u,u,u,u,u,u] 4079; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15],zero,zero,xmm0[u,u,u,u,u,u] 4080; AVX512DQ-NEXT: vpor %xmm3, %xmm0, %xmm0 4081; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] 4082; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] 4083; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 4084; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm6 ^ (ymm15 & (ymm0 ^ ymm6)) 4085; AVX512DQ-NEXT: vmovdqa64 %ymm18, (%rsi) 4086; AVX512DQ-NEXT: vmovdqa %ymm7, (%rdx) 4087; AVX512DQ-NEXT: vmovdqa %ymm4, (%rcx) 4088; AVX512DQ-NEXT: vmovdqa %ymm2, (%r8) 4089; AVX512DQ-NEXT: vmovdqa %ymm5, (%r9) 4090; AVX512DQ-NEXT: vmovdqa %ymm0, (%rax) 4091; AVX512DQ-NEXT: vzeroupper 4092; AVX512DQ-NEXT: retq 4093; 4094; AVX512DQ-FCP-LABEL: load_i8_stride6_vf32: 4095; AVX512DQ-FCP: # %bb.0: 4096; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 4097; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] 4098; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm17 4099; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 4100; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm1 4101; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 4102; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm7 4103; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm3 ^ (ymm7 & (ymm17 ^ ymm3)) 4104; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[0,6,12],zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[u,u,u,u,u] 4105; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 4106; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm8[2,8,14],zero,zero,xmm8[0,6,12,u,u,u,u,u] 4107; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 4108; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[2,3],mem[2,3] 4109; AVX512DQ-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm1, %ymm1 4110; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0] 4111; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm10 4112; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm5 ^ (ymm10 & 
(ymm1 ^ ymm5)) 4113; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 4114; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] 4115; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm4 & ymm16) 4116; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm13 4117; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm14 4118; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm6 ^ (ymm14 & (ymm13 ^ ymm6)) 4119; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 4120; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[0,6,12],zero,zero,zero,xmm15[4,10] 4121; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm14[u,u,u,u,u,u,4,10],zero,zero,zero,xmm14[2,8,14],zero,zero 4122; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm12, %xmm4 4123; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 4124; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0,1,2],ymm4[3,4,5,6,7],ymm11[8,9,10],ymm4[11,12,13,14,15] 4125; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm4[4,5,6,7] 4126; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm18 4127; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[1,7,13],zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[u,u,u,u,u] 4128; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[3,9,15],zero,zero,xmm8[1,7,13,u,u,u,u,u] 4129; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 4130; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 4131; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm7 & ymm16) 4132; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[1,7,13],zero,zero,zero,xmm15[5,11] 4133; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm14[u,u,u,u,u,u,5,11],zero,zero,zero,xmm14[3,9,15],zero,zero 4134; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm10, %xmm7 4135; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 4136; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7],ymm8[8,9,10],ymm7[11,12,13,14,15] 4137; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] 4138; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] 4139; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, %ymm10 4140; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm17 ^ (ymm10 & (ymm3 ^ ymm17)) 4141; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11 4142; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14,u,u,u,u,u] 4143; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[2,8,14],zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[u,u,u,u,u] 4144; AVX512DQ-FCP-NEXT: vpor %xmm12, %xmm14, %xmm12 4145; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm1 ^ (ymm9 & (ymm5 ^ ymm1)) 4146; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u] 4147; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] 4148; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm12 & ymm16) 4149; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm12 4150; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm13 ^ (ymm12 & 
(ymm6 ^ ymm13)) 4151; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,0,6,12],zero,zero,zero,xmm12[4,10],zero,zero,zero 4152; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm2 4153; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[0,6,12] 4154; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm15, %xmm4 4155; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 4156; AVX512DQ-FCP-NEXT: vpmovsxwd {{.*#+}} ymm15 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0] 4157; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm15 & (ymm4 ^ ymm14)) 4158; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15,u,u,u,u,u] 4159; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[3,9,15],zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[u,u,u,u,u] 4160; AVX512DQ-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10 4161; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u] 4162; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm10 & ymm16) 4163; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,1,7,13],zero,zero,zero,xmm12[5,11],zero,zero,zero 4164; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[3,9,15],zero,zero,xmm2[1,7,13] 4165; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm10, %xmm2 4166; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 4167; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm15 & (ymm2 ^ ymm9)) 4168; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm6 ^ (ymm8 & (ymm13 ^ ymm6)) 4169; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm6 4170; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14] 4171; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,2,8,14],zero,zero,xmm8[0,6,12],zero,zero,zero 4172; AVX512DQ-FCP-NEXT: vpor %xmm9, %xmm10, %xmm9 4173; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 4174; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm17 ^ (ymm0 & (ymm3 ^ ymm17)) 4175; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3 4176; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10,u,u,u,u,u,u] 4177; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14],zero,zero,xmm0[u,u,u,u,u,u] 4178; AVX512DQ-FCP-NEXT: vpor %xmm10, %xmm11, %xmm10 4179; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm5)) 4180; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] 4181; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm5[5,6,7] 4182; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] 4183; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm9 ^ (ymm15 & (ymm5 ^ ymm9)) 4184; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[5,11],zero,zero,zero,xmm6[3,9,15] 4185; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,3,9,15],zero,zero,xmm8[1,7,13],zero,zero,zero 4186; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm8, %xmm6 4187; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 4188; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11,u,u,u,u,u,u] 4189; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15],zero,zero,xmm0[u,u,u,u,u,u] 4190; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm0, %xmm0 4191; 
AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] 4192; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] 4193; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 4194; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm6 ^ (ymm15 & (ymm0 ^ ymm6)) 4195; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, (%rsi) 4196; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, (%rdx) 4197; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, (%rcx) 4198; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%r8) 4199; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, (%r9) 4200; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rax) 4201; AVX512DQ-FCP-NEXT: vzeroupper 4202; AVX512DQ-FCP-NEXT: retq 4203; 4204; AVX512BW-LABEL: load_i8_stride6_vf32: 4205; AVX512BW: # %bb.0: 4206; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 4207; AVX512BW-NEXT: vmovdqa (%rdi), %ymm4 4208; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm0 4209; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm3 4210; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm2 4211; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],mem[2,3] 4212; AVX512BW-NEXT: vinserti128 $1, 96(%rdi), %ymm3, %ymm8 4213; AVX512BW-NEXT: movw $-28124, %r10w # imm = 0x9224 4214; AVX512BW-NEXT: kmovd %r10d, %k2 4215; AVX512BW-NEXT: vpblendmw %ymm1, %ymm8, %ymm6 {%k2} 4216; AVX512BW-NEXT: movw $18724, %r10w # imm = 0x4924 4217; AVX512BW-NEXT: kmovd %r10d, %k1 4218; AVX512BW-NEXT: vpblendmw %ymm0, %ymm4, %ymm7 {%k1} 4219; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[0,6,12],zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[u,u,u,u,u] 4220; AVX512BW-NEXT: vextracti128 $1, %ymm7, %xmm9 4221; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm9[2,8,14],zero,zero,xmm9[0,6,12,u,u,u,u,u] 4222; AVX512BW-NEXT: vpor %xmm3, %xmm5, %xmm5 4223; AVX512BW-NEXT: movl $4192256, %r10d # imm = 0x3FF800 4224; AVX512BW-NEXT: kmovd %r10d, %k3 4225; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 {%k3} = ymm6[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] 4226; AVX512BW-NEXT: vmovdqa 160(%rdi), %ymm3 4227; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm10 {%k1} 4228; AVX512BW-NEXT: vextracti128 $1, %ymm10, %xmm11 4229; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,u,u,u],zero,zero,xmm11[0,6,12],zero,zero,zero,xmm11[4,10] 4230; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm10[u,u,u,u,u,u,4,10],zero,zero,zero,xmm10[2,8,14],zero,zero 4231; AVX512BW-NEXT: vpor %xmm12, %xmm13, %xmm12 4232; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 4233; AVX512BW-NEXT: vpblendw {{.*#+}} ymm12 = ymm5[0,1,2],ymm12[3,4,5,6,7],ymm5[8,9,10],ymm12[11,12,13,14,15] 4234; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5,6,7] 4235; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[1,7,13],zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[u,u,u,u,u] 4236; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[3,9,15],zero,zero,xmm9[1,7,13,u,u,u,u,u] 4237; AVX512BW-NEXT: vpor %xmm7, %xmm9, %xmm7 4238; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 {%k3} = ymm6[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] 4239; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[u,u,u,u,u,u],zero,zero,xmm11[1,7,13],zero,zero,zero,xmm11[5,11] 4240; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,u,u,u,5,11],zero,zero,zero,xmm10[3,9,15],zero,zero 4241; AVX512BW-NEXT: vpor %xmm6, %xmm9, %xmm6 4242; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 4243; AVX512BW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15] 4244; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = 
ymm7[0,1,2,3],ymm6[4,5,6,7] 4245; AVX512BW-NEXT: vpblendmw %ymm8, %ymm1, %ymm9 {%k2} 4246; AVX512BW-NEXT: movw $9362, %di # imm = 0x2492 4247; AVX512BW-NEXT: kmovd %edi, %k3 4248; AVX512BW-NEXT: vpblendmw %ymm4, %ymm0, %ymm10 {%k3} 4249; AVX512BW-NEXT: vextracti128 $1, %ymm10, %xmm11 4250; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14,u,u,u,u,u] 4251; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[2,8,14],zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[u,u,u,u,u] 4252; AVX512BW-NEXT: vpor %xmm7, %xmm12, %xmm7 4253; AVX512BW-NEXT: movl $2095104, %edi # imm = 0x1FF800 4254; AVX512BW-NEXT: kmovd %edi, %k4 4255; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,u,u,4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u] 4256; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm12 {%k1} 4257; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,u,u,0,6,12],zero,zero,zero,xmm12[4,10],zero,zero,zero 4258; AVX512BW-NEXT: vextracti128 $1, %ymm12, %xmm14 4259; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u],zero,zero,zero,xmm14[2,8,14],zero,zero,xmm14[0,6,12] 4260; AVX512BW-NEXT: vpor %xmm13, %xmm15, %xmm13 4261; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 4262; AVX512BW-NEXT: movl $-2097152, %edi # imm = 0xFFE00000 4263; AVX512BW-NEXT: kmovd %edi, %k2 4264; AVX512BW-NEXT: vmovdqu8 %ymm13, %ymm7 {%k2} 4265; AVX512BW-NEXT: movw $9289, %di # imm = 0x2449 4266; AVX512BW-NEXT: kmovd %edi, %k5 4267; AVX512BW-NEXT: vmovdqu16 %ymm8, %ymm1 {%k5} 4268; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15,u,u,u,u,u] 4269; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[3,9,15],zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[u,u,u,u,u] 4270; AVX512BW-NEXT: vpor %xmm8, %xmm10, %xmm8 4271; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,u,u,5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u] 4272; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[u,u,u,u,u,1,7,13],zero,zero,zero,xmm12[5,11],zero,zero,zero 4273; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm14[u,u,u,u,u],zero,zero,zero,xmm14[3,9,15],zero,zero,xmm14[1,7,13] 4274; AVX512BW-NEXT: vpor %xmm9, %xmm10, %xmm9 4275; AVX512BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 4276; AVX512BW-NEXT: vmovdqu8 %ymm9, %ymm8 {%k2} 4277; AVX512BW-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] 4278; AVX512BW-NEXT: vmovdqu16 %ymm4, %ymm0 {%k1} 4279; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm4 4280; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[4,10,u,u,u,u,u,u] 4281; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14],zero,zero,xmm0[u,u,u,u,u,u] 4282; AVX512BW-NEXT: vpor %xmm10, %xmm11, %xmm10 4283; AVX512BW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm9[5,6,7] 4284; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] 4285; AVX512BW-NEXT: vmovdqu16 %ymm2, %ymm3 {%k3} 4286; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm2 4287; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[4,10],zero,zero,zero,xmm2[2,8,14] 4288; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[u,u,u,u,u,2,8,14],zero,zero,xmm3[0,6,12],zero,zero,zero 4289; AVX512BW-NEXT: vpor %xmm10, %xmm11, %xmm10 4290; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 4291; AVX512BW-NEXT: vmovdqu8 %ymm10, %ymm9 {%k2} 4292; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] 4293; 
AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,7,13],zero,zero,zero,xmm4[5,11,u,u,u,u,u,u] 4294; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15],zero,zero,xmm0[u,u,u,u,u,u] 4295; AVX512BW-NEXT: vpor %xmm4, %xmm0, %xmm0 4296; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] 4297; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 4298; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[5,11],zero,zero,zero,xmm2[3,9,15] 4299; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,3,9,15],zero,zero,xmm3[1,7,13],zero,zero,zero 4300; AVX512BW-NEXT: vpor %xmm1, %xmm2, %xmm1 4301; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 4302; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} 4303; AVX512BW-NEXT: vmovdqa %ymm5, (%rsi) 4304; AVX512BW-NEXT: vmovdqa %ymm6, (%rdx) 4305; AVX512BW-NEXT: vmovdqa %ymm7, (%rcx) 4306; AVX512BW-NEXT: vmovdqa %ymm8, (%r8) 4307; AVX512BW-NEXT: vmovdqa %ymm9, (%r9) 4308; AVX512BW-NEXT: vmovdqa %ymm0, (%rax) 4309; AVX512BW-NEXT: vzeroupper 4310; AVX512BW-NEXT: retq 4311; 4312; AVX512BW-FCP-LABEL: load_i8_stride6_vf32: 4313; AVX512BW-FCP: # %bb.0: 4314; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 4315; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm4 4316; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm0 4317; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm3 4318; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 4319; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],mem[2,3] 4320; AVX512BW-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm3, %ymm8 4321; AVX512BW-FCP-NEXT: movw $-28124, %r10w # imm = 0x9224 4322; AVX512BW-FCP-NEXT: kmovd %r10d, %k2 4323; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm8, %ymm6 {%k2} 4324; AVX512BW-FCP-NEXT: movw $18724, %r10w # imm = 0x4924 4325; AVX512BW-FCP-NEXT: kmovd %r10d, %k1 4326; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm4, %ymm7 {%k1} 4327; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[0,6,12],zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[u,u,u,u,u] 4328; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm9 4329; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm9[2,8,14],zero,zero,xmm9[0,6,12,u,u,u,u,u] 4330; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm5, %xmm5 4331; AVX512BW-FCP-NEXT: movl $4192256, %r10d # imm = 0x3FF800 4332; AVX512BW-FCP-NEXT: kmovd %r10d, %k3 4333; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 {%k3} = ymm6[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] 4334; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm3 4335; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm10 {%k1} 4336; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11 4337; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,u,u,u],zero,zero,xmm11[0,6,12],zero,zero,zero,xmm11[4,10] 4338; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm10[u,u,u,u,u,u,4,10],zero,zero,zero,xmm10[2,8,14],zero,zero 4339; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm13, %xmm12 4340; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 4341; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm5[0,1,2],ymm12[3,4,5,6,7],ymm5[8,9,10],ymm12[11,12,13,14,15] 4342; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5,6,7] 4343; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[1,7,13],zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[u,u,u,u,u] 4344; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[3,9,15],zero,zero,xmm9[1,7,13,u,u,u,u,u] 4345; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm9, %xmm7 4346; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 {%k3} = 
ymm6[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] 4347; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[u,u,u,u,u,u],zero,zero,xmm11[1,7,13],zero,zero,zero,xmm11[5,11] 4348; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,u,u,u,5,11],zero,zero,zero,xmm10[3,9,15],zero,zero 4349; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm9, %xmm6 4350; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 4351; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15] 4352; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] 4353; AVX512BW-FCP-NEXT: vpblendmw %ymm8, %ymm1, %ymm9 {%k2} 4354; AVX512BW-FCP-NEXT: movw $9362, %di # imm = 0x2492 4355; AVX512BW-FCP-NEXT: kmovd %edi, %k3 4356; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm0, %ymm10 {%k3} 4357; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11 4358; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14,u,u,u,u,u] 4359; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[2,8,14],zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[u,u,u,u,u] 4360; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm12, %xmm7 4361; AVX512BW-FCP-NEXT: movl $2095104, %edi # imm = 0x1FF800 4362; AVX512BW-FCP-NEXT: kmovd %edi, %k4 4363; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,u,u,4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u] 4364; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm12 {%k1} 4365; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,u,u,0,6,12],zero,zero,zero,xmm12[4,10],zero,zero,zero 4366; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm14 4367; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u],zero,zero,zero,xmm14[2,8,14],zero,zero,xmm14[0,6,12] 4368; AVX512BW-FCP-NEXT: vpor %xmm13, %xmm15, %xmm13 4369; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 4370; AVX512BW-FCP-NEXT: movl $-2097152, %edi # imm = 0xFFE00000 4371; AVX512BW-FCP-NEXT: kmovd %edi, %k2 4372; AVX512BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm7 {%k2} 4373; AVX512BW-FCP-NEXT: movw $9289, %di # imm = 0x2449 4374; AVX512BW-FCP-NEXT: kmovd %edi, %k5 4375; AVX512BW-FCP-NEXT: vmovdqu16 %ymm8, %ymm1 {%k5} 4376; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15,u,u,u,u,u] 4377; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[3,9,15],zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[u,u,u,u,u] 4378; AVX512BW-FCP-NEXT: vpor %xmm8, %xmm10, %xmm8 4379; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,u,u,5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u] 4380; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[u,u,u,u,u,1,7,13],zero,zero,zero,xmm12[5,11],zero,zero,zero 4381; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm14[u,u,u,u,u],zero,zero,zero,xmm14[3,9,15],zero,zero,xmm14[1,7,13] 4382; AVX512BW-FCP-NEXT: vpor %xmm9, %xmm10, %xmm9 4383; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 4384; AVX512BW-FCP-NEXT: vmovdqu8 %ymm9, %ymm8 {%k2} 4385; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] 4386; AVX512BW-FCP-NEXT: vmovdqu16 %ymm4, %ymm0 {%k1} 4387; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4 4388; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[4,10,u,u,u,u,u,u] 4389; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14],zero,zero,xmm0[u,u,u,u,u,u] 4390; AVX512BW-FCP-NEXT: vpor %xmm10, %xmm11, %xmm10 4391; AVX512BW-FCP-NEXT: 
vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm9[5,6,7] 4392; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] 4393; AVX512BW-FCP-NEXT: vmovdqu16 %ymm2, %ymm3 {%k3} 4394; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm2 4395; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[4,10],zero,zero,zero,xmm2[2,8,14] 4396; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[u,u,u,u,u,2,8,14],zero,zero,xmm3[0,6,12],zero,zero,zero 4397; AVX512BW-FCP-NEXT: vpor %xmm10, %xmm11, %xmm10 4398; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 4399; AVX512BW-FCP-NEXT: vmovdqu8 %ymm10, %ymm9 {%k2} 4400; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] 4401; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,7,13],zero,zero,zero,xmm4[5,11,u,u,u,u,u,u] 4402; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15],zero,zero,xmm0[u,u,u,u,u,u] 4403; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm0, %xmm0 4404; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] 4405; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 4406; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[5,11],zero,zero,zero,xmm2[3,9,15] 4407; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,3,9,15],zero,zero,xmm3[1,7,13],zero,zero,zero 4408; AVX512BW-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 4409; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 4410; AVX512BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} 4411; AVX512BW-FCP-NEXT: vmovdqa %ymm5, (%rsi) 4412; AVX512BW-FCP-NEXT: vmovdqa %ymm6, (%rdx) 4413; AVX512BW-FCP-NEXT: vmovdqa %ymm7, (%rcx) 4414; AVX512BW-FCP-NEXT: vmovdqa %ymm8, (%r8) 4415; AVX512BW-FCP-NEXT: vmovdqa %ymm9, (%r9) 4416; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rax) 4417; AVX512BW-FCP-NEXT: vzeroupper 4418; AVX512BW-FCP-NEXT: retq 4419; 4420; AVX512DQ-BW-LABEL: load_i8_stride6_vf32: 4421; AVX512DQ-BW: # %bb.0: 4422; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 4423; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm4 4424; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm0 4425; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %ymm3 4426; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm2 4427; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],mem[2,3] 4428; AVX512DQ-BW-NEXT: vinserti128 $1, 96(%rdi), %ymm3, %ymm8 4429; AVX512DQ-BW-NEXT: movw $-28124, %r10w # imm = 0x9224 4430; AVX512DQ-BW-NEXT: kmovd %r10d, %k2 4431; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm8, %ymm6 {%k2} 4432; AVX512DQ-BW-NEXT: movw $18724, %r10w # imm = 0x4924 4433; AVX512DQ-BW-NEXT: kmovd %r10d, %k1 4434; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm4, %ymm7 {%k1} 4435; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[0,6,12],zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[u,u,u,u,u] 4436; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm7, %xmm9 4437; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm9[2,8,14],zero,zero,xmm9[0,6,12,u,u,u,u,u] 4438; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm5, %xmm5 4439; AVX512DQ-BW-NEXT: movl $4192256, %r10d # imm = 0x3FF800 4440; AVX512DQ-BW-NEXT: kmovd %r10d, %k3 4441; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 {%k3} = ymm6[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] 4442; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %ymm3 4443; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm10 {%k1} 4444; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm10, %xmm11 4445; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = 
xmm11[u,u,u,u,u,u],zero,zero,xmm11[0,6,12],zero,zero,zero,xmm11[4,10] 4446; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm10[u,u,u,u,u,u,4,10],zero,zero,zero,xmm10[2,8,14],zero,zero 4447; AVX512DQ-BW-NEXT: vpor %xmm12, %xmm13, %xmm12 4448; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 4449; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm12 = ymm5[0,1,2],ymm12[3,4,5,6,7],ymm5[8,9,10],ymm12[11,12,13,14,15] 4450; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5,6,7] 4451; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[1,7,13],zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[u,u,u,u,u] 4452; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[3,9,15],zero,zero,xmm9[1,7,13,u,u,u,u,u] 4453; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm9, %xmm7 4454; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 {%k3} = ymm6[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] 4455; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[u,u,u,u,u,u],zero,zero,xmm11[1,7,13],zero,zero,zero,xmm11[5,11] 4456; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,u,u,u,5,11],zero,zero,zero,xmm10[3,9,15],zero,zero 4457; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm9, %xmm6 4458; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 4459; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15] 4460; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] 4461; AVX512DQ-BW-NEXT: vpblendmw %ymm8, %ymm1, %ymm9 {%k2} 4462; AVX512DQ-BW-NEXT: movw $9362, %di # imm = 0x2492 4463; AVX512DQ-BW-NEXT: kmovd %edi, %k3 4464; AVX512DQ-BW-NEXT: vpblendmw %ymm4, %ymm0, %ymm10 {%k3} 4465; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm10, %xmm11 4466; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14,u,u,u,u,u] 4467; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[2,8,14],zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[u,u,u,u,u] 4468; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm12, %xmm7 4469; AVX512DQ-BW-NEXT: movl $2095104, %edi # imm = 0x1FF800 4470; AVX512DQ-BW-NEXT: kmovd %edi, %k4 4471; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,u,u,4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u] 4472; AVX512DQ-BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm12 {%k1} 4473; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,u,u,0,6,12],zero,zero,zero,xmm12[4,10],zero,zero,zero 4474; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm12, %xmm14 4475; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u],zero,zero,zero,xmm14[2,8,14],zero,zero,xmm14[0,6,12] 4476; AVX512DQ-BW-NEXT: vpor %xmm13, %xmm15, %xmm13 4477; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 4478; AVX512DQ-BW-NEXT: movl $-2097152, %edi # imm = 0xFFE00000 4479; AVX512DQ-BW-NEXT: kmovd %edi, %k2 4480; AVX512DQ-BW-NEXT: vmovdqu8 %ymm13, %ymm7 {%k2} 4481; AVX512DQ-BW-NEXT: movw $9289, %di # imm = 0x2449 4482; AVX512DQ-BW-NEXT: kmovd %edi, %k5 4483; AVX512DQ-BW-NEXT: vmovdqu16 %ymm8, %ymm1 {%k5} 4484; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15,u,u,u,u,u] 4485; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[3,9,15],zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[u,u,u,u,u] 4486; AVX512DQ-BW-NEXT: vpor %xmm8, %xmm10, %xmm8 4487; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,u,u,5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u] 4488; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[u,u,u,u,u,1,7,13],zero,zero,zero,xmm12[5,11],zero,zero,zero 4489; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 
= xmm14[u,u,u,u,u],zero,zero,zero,xmm14[3,9,15],zero,zero,xmm14[1,7,13] 4490; AVX512DQ-BW-NEXT: vpor %xmm9, %xmm10, %xmm9 4491; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 4492; AVX512DQ-BW-NEXT: vmovdqu8 %ymm9, %ymm8 {%k2} 4493; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] 4494; AVX512DQ-BW-NEXT: vmovdqu16 %ymm4, %ymm0 {%k1} 4495; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm4 4496; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[4,10,u,u,u,u,u,u] 4497; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14],zero,zero,xmm0[u,u,u,u,u,u] 4498; AVX512DQ-BW-NEXT: vpor %xmm10, %xmm11, %xmm10 4499; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm9[5,6,7] 4500; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] 4501; AVX512DQ-BW-NEXT: vmovdqu16 %ymm2, %ymm3 {%k3} 4502; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm2 4503; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[4,10],zero,zero,zero,xmm2[2,8,14] 4504; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[u,u,u,u,u,2,8,14],zero,zero,xmm3[0,6,12],zero,zero,zero 4505; AVX512DQ-BW-NEXT: vpor %xmm10, %xmm11, %xmm10 4506; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 4507; AVX512DQ-BW-NEXT: vmovdqu8 %ymm10, %ymm9 {%k2} 4508; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] 4509; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,7,13],zero,zero,zero,xmm4[5,11,u,u,u,u,u,u] 4510; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15],zero,zero,xmm0[u,u,u,u,u,u] 4511; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm0, %xmm0 4512; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] 4513; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 4514; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[5,11],zero,zero,zero,xmm2[3,9,15] 4515; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,3,9,15],zero,zero,xmm3[1,7,13],zero,zero,zero 4516; AVX512DQ-BW-NEXT: vpor %xmm1, %xmm2, %xmm1 4517; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 4518; AVX512DQ-BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} 4519; AVX512DQ-BW-NEXT: vmovdqa %ymm5, (%rsi) 4520; AVX512DQ-BW-NEXT: vmovdqa %ymm6, (%rdx) 4521; AVX512DQ-BW-NEXT: vmovdqa %ymm7, (%rcx) 4522; AVX512DQ-BW-NEXT: vmovdqa %ymm8, (%r8) 4523; AVX512DQ-BW-NEXT: vmovdqa %ymm9, (%r9) 4524; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rax) 4525; AVX512DQ-BW-NEXT: vzeroupper 4526; AVX512DQ-BW-NEXT: retq 4527; 4528; AVX512DQ-BW-FCP-LABEL: load_i8_stride6_vf32: 4529; AVX512DQ-BW-FCP: # %bb.0: 4530; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 4531; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm4 4532; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm0 4533; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm3 4534; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 4535; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],mem[2,3] 4536; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm3, %ymm8 4537; AVX512DQ-BW-FCP-NEXT: movw $-28124, %r10w # imm = 0x9224 4538; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k2 4539; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm8, %ymm6 {%k2} 4540; AVX512DQ-BW-FCP-NEXT: movw $18724, %r10w # imm = 0x4924 4541; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k1 4542; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm4, %ymm7 {%k1} 4543; AVX512DQ-BW-FCP-NEXT: vpshufb 
{{.*#+}} xmm3 = xmm7[0,6,12],zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[u,u,u,u,u] 4544; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm9 4545; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm9[2,8,14],zero,zero,xmm9[0,6,12,u,u,u,u,u] 4546; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm5, %xmm5 4547; AVX512DQ-BW-FCP-NEXT: movl $4192256, %r10d # imm = 0x3FF800 4548; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k3 4549; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 {%k3} = ymm6[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] 4550; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm3 4551; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm10 {%k1} 4552; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11 4553; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,u,u,u],zero,zero,xmm11[0,6,12],zero,zero,zero,xmm11[4,10] 4554; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm10[u,u,u,u,u,u,4,10],zero,zero,zero,xmm10[2,8,14],zero,zero 4555; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm13, %xmm12 4556; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 4557; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm5[0,1,2],ymm12[3,4,5,6,7],ymm5[8,9,10],ymm12[11,12,13,14,15] 4558; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5,6,7] 4559; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[1,7,13],zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[u,u,u,u,u] 4560; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[3,9,15],zero,zero,xmm9[1,7,13,u,u,u,u,u] 4561; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm9, %xmm7 4562; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 {%k3} = ymm6[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] 4563; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[u,u,u,u,u,u],zero,zero,xmm11[1,7,13],zero,zero,zero,xmm11[5,11] 4564; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,u,u,u,5,11],zero,zero,zero,xmm10[3,9,15],zero,zero 4565; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm9, %xmm6 4566; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 4567; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15] 4568; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] 4569; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm8, %ymm1, %ymm9 {%k2} 4570; AVX512DQ-BW-FCP-NEXT: movw $9362, %di # imm = 0x2492 4571; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k3 4572; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm0, %ymm10 {%k3} 4573; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11 4574; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14,u,u,u,u,u] 4575; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[2,8,14],zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[u,u,u,u,u] 4576; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm12, %xmm7 4577; AVX512DQ-BW-FCP-NEXT: movl $2095104, %edi # imm = 0x1FF800 4578; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k4 4579; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,u,u,4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u] 4580; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm12 {%k1} 4581; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,u,u,0,6,12],zero,zero,zero,xmm12[4,10],zero,zero,zero 4582; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm14 4583; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u],zero,zero,zero,xmm14[2,8,14],zero,zero,xmm14[0,6,12] 4584; AVX512DQ-BW-FCP-NEXT: vpor %xmm13, %xmm15, %xmm13 4585; AVX512DQ-BW-FCP-NEXT: 
vinserti128 $1, %xmm13, %ymm0, %ymm13 4586; AVX512DQ-BW-FCP-NEXT: movl $-2097152, %edi # imm = 0xFFE00000 4587; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2 4588; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm7 {%k2} 4589; AVX512DQ-BW-FCP-NEXT: movw $9289, %di # imm = 0x2449 4590; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k5 4591; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm8, %ymm1 {%k5} 4592; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15,u,u,u,u,u] 4593; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[3,9,15],zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[u,u,u,u,u] 4594; AVX512DQ-BW-FCP-NEXT: vpor %xmm8, %xmm10, %xmm8 4595; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,u,u,5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u] 4596; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[u,u,u,u,u,1,7,13],zero,zero,zero,xmm12[5,11],zero,zero,zero 4597; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm14[u,u,u,u,u],zero,zero,zero,xmm14[3,9,15],zero,zero,xmm14[1,7,13] 4598; AVX512DQ-BW-FCP-NEXT: vpor %xmm9, %xmm10, %xmm9 4599; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 4600; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm9, %ymm8 {%k2} 4601; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] 4602; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm4, %ymm0 {%k1} 4603; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4 4604; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[4,10,u,u,u,u,u,u] 4605; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14],zero,zero,xmm0[u,u,u,u,u,u] 4606; AVX512DQ-BW-FCP-NEXT: vpor %xmm10, %xmm11, %xmm10 4607; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm9[5,6,7] 4608; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] 4609; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm2, %ymm3 {%k3} 4610; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm2 4611; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[4,10],zero,zero,zero,xmm2[2,8,14] 4612; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[u,u,u,u,u,2,8,14],zero,zero,xmm3[0,6,12],zero,zero,zero 4613; AVX512DQ-BW-FCP-NEXT: vpor %xmm10, %xmm11, %xmm10 4614; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 4615; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm10, %ymm9 {%k2} 4616; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] 4617; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,7,13],zero,zero,zero,xmm4[5,11,u,u,u,u,u,u] 4618; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15],zero,zero,xmm0[u,u,u,u,u,u] 4619; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm0, %xmm0 4620; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] 4621; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 4622; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[5,11],zero,zero,zero,xmm2[3,9,15] 4623; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,3,9,15],zero,zero,xmm3[1,7,13],zero,zero,zero 4624; AVX512DQ-BW-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 4625; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 4626; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} 4627; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm5, (%rsi) 4628; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, (%rdx) 4629; AVX512DQ-BW-FCP-NEXT: 
; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm7, (%rcx)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm8, (%r8)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm9, (%r9)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm0, (%rax)
; AVX512DQ-BW-FCP-NEXT:    vzeroupper
; AVX512DQ-BW-FCP-NEXT:    retq
  %wide.vec = load <192 x i8>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <192 x i8> %wide.vec, <192 x i8> poison, <32 x i32> <i32 0, i32 6, i32 12, i32 18, i32 24, i32 30, i32 36, i32 42, i32 48, i32 54, i32 60, i32 66, i32 72, i32 78, i32 84, i32 90, i32 96, i32 102, i32 108, i32 114, i32 120, i32 126, i32 132, i32 138, i32 144, i32 150, i32 156, i32 162, i32 168, i32 174, i32 180, i32 186>
  %strided.vec1 = shufflevector <192 x i8> %wide.vec, <192 x i8> poison, <32 x i32> <i32 1, i32 7, i32 13, i32 19, i32 25, i32 31, i32 37, i32 43, i32 49, i32 55, i32 61, i32 67, i32 73, i32 79, i32 85, i32 91, i32 97, i32 103, i32 109, i32 115, i32 121, i32 127, i32 133, i32 139, i32 145, i32 151, i32 157, i32 163, i32 169, i32 175, i32 181, i32 187>
  %strided.vec2 = shufflevector <192 x i8> %wide.vec, <192 x i8> poison, <32 x i32> <i32 2, i32 8, i32 14, i32 20, i32 26, i32 32, i32 38, i32 44, i32 50, i32 56, i32 62, i32 68, i32 74, i32 80, i32 86, i32 92, i32 98, i32 104, i32 110, i32 116, i32 122, i32 128, i32 134, i32 140, i32 146, i32 152, i32 158, i32 164, i32 170, i32 176, i32 182, i32 188>
  %strided.vec3 = shufflevector <192 x i8> %wide.vec, <192 x i8> poison, <32 x i32> <i32 3, i32 9, i32 15, i32 21, i32 27, i32 33, i32 39, i32 45, i32 51, i32 57, i32 63, i32 69, i32 75, i32 81, i32 87, i32 93, i32 99, i32 105, i32 111, i32 117, i32 123, i32 129, i32 135, i32 141, i32 147, i32 153, i32 159, i32 165, i32 171, i32 177, i32 183, i32 189>
  %strided.vec4 = shufflevector <192 x i8> %wide.vec, <192 x i8> poison, <32 x i32> <i32 4, i32 10, i32 16, i32 22, i32 28, i32 34, i32 40, i32 46, i32 52, i32 58, i32 64, i32 70, i32 76, i32 82, i32 88, i32 94, i32 100, i32 106, i32 112, i32 118, i32 124, i32 130, i32 136, i32 142, i32 148, i32 154, i32 160, i32 166, i32 172, i32 178, i32 184, i32 190>
  %strided.vec5 = shufflevector <192 x i8> %wide.vec, <192 x i8> poison, <32 x i32> <i32 5, i32 11, i32 17, i32 23, i32 29, i32 35, i32 41, i32 47, i32 53, i32 59, i32 65, i32 71, i32 77, i32 83, i32 89, i32 95, i32 101, i32 107, i32 113, i32 119, i32 125, i32 131, i32 137, i32 143, i32 149, i32 155, i32 161, i32 167, i32 173, i32 179, i32 185, i32 191>
  store <32 x i8> %strided.vec0, ptr %out.vec0, align 64
  store <32 x i8> %strided.vec1, ptr %out.vec1, align 64
  store <32 x i8> %strided.vec2, ptr %out.vec2, align 64
  store <32 x i8> %strided.vec3, ptr %out.vec3, align 64
  store <32 x i8> %strided.vec4, ptr %out.vec4, align 64
  store <32 x i8> %strided.vec5, ptr %out.vec5, align 64
  ret void
}

define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind {
; SSE-LABEL: load_i8_stride6_vf64:
; SSE:       # %bb.0:
; SSE-NEXT:    subq $792, %rsp # imm = 0x318
; SSE-NEXT:    movdqa 64(%rdi), %xmm4
; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 80(%rdi), %xmm5
; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa (%rdi), %xmm7
; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa 16(%rdi), %xmm6
; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movdqa
32(%rdi), %xmm2 4664; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4665; SSE-NEXT: movdqa 48(%rdi), %xmm0 4666; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,0,65535,65535,0,65535,65535] 4667; SSE-NEXT: movdqa %xmm13, %xmm1 4668; SSE-NEXT: pandn %xmm2, %xmm1 4669; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,0,65535,65535,0] 4670; SSE-NEXT: movdqa %xmm3, %xmm2 4671; SSE-NEXT: pandn %xmm0, %xmm2 4672; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4673; SSE-NEXT: movdqa %xmm13, %xmm2 4674; SSE-NEXT: pandn %xmm0, %xmm2 4675; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4676; SSE-NEXT: pand %xmm13, %xmm0 4677; SSE-NEXT: por %xmm1, %xmm0 4678; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4679; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255] 4680; SSE-NEXT: pand %xmm10, %xmm0 4681; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7] 4682; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] 4683; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] 4684; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] 4685; SSE-NEXT: packuswb %xmm1, %xmm0 4686; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,0,0,65535,65535] 4687; SSE-NEXT: movdqa %xmm3, %xmm1 4688; SSE-NEXT: pandn %xmm6, %xmm1 4689; SSE-NEXT: movdqa %xmm7, %xmm2 4690; SSE-NEXT: pand %xmm3, %xmm2 4691; SSE-NEXT: por %xmm1, %xmm2 4692; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4693; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,1,3] 4694; SSE-NEXT: pand %xmm10, %xmm1 4695; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] 4696; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] 4697; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] 4698; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] 4699; SSE-NEXT: packuswb %xmm1, %xmm1 4700; SSE-NEXT: pand %xmm8, %xmm1 4701; SSE-NEXT: movdqa %xmm8, %xmm2 4702; SSE-NEXT: pandn %xmm0, %xmm2 4703; SSE-NEXT: por %xmm2, %xmm1 4704; SSE-NEXT: movdqa %xmm13, %xmm0 4705; SSE-NEXT: pandn %xmm5, %xmm0 4706; SSE-NEXT: pand %xmm13, %xmm4 4707; SSE-NEXT: por %xmm0, %xmm4 4708; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4709; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,1,2,0] 4710; SSE-NEXT: pand %xmm10, %xmm0 4711; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] 4712; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0] 4713; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] 4714; SSE-NEXT: packuswb %xmm0, %xmm0 4715; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] 4716; SSE-NEXT: movdqa %xmm4, %xmm2 4717; SSE-NEXT: pandn %xmm0, %xmm2 4718; SSE-NEXT: pand %xmm4, %xmm1 4719; SSE-NEXT: por %xmm1, %xmm2 4720; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4721; SSE-NEXT: movdqa 320(%rdi), %xmm1 4722; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4723; SSE-NEXT: movdqa %xmm13, %xmm0 4724; SSE-NEXT: pandn %xmm1, %xmm0 4725; SSE-NEXT: movdqa 336(%rdi), %xmm12 4726; SSE-NEXT: movdqa %xmm3, %xmm1 4727; SSE-NEXT: pandn %xmm12, %xmm1 4728; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4729; SSE-NEXT: movdqa %xmm13, %xmm1 4730; SSE-NEXT: pandn %xmm12, %xmm1 4731; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4732; SSE-NEXT: pand %xmm13, %xmm12 4733; SSE-NEXT: por %xmm0, %xmm12 4734; SSE-NEXT: movdqa %xmm12, %xmm0 4735; SSE-NEXT: pand %xmm10, %xmm0 4736; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7] 4737; SSE-NEXT: pshufd {{.*#+}} xmm1 = 
xmm1[0,3,2,3] 4738; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] 4739; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] 4740; SSE-NEXT: packuswb %xmm1, %xmm0 4741; SSE-NEXT: movdqa %xmm8, %xmm1 4742; SSE-NEXT: pandn %xmm0, %xmm1 4743; SSE-NEXT: movdqa 304(%rdi), %xmm2 4744; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4745; SSE-NEXT: movdqa %xmm3, %xmm7 4746; SSE-NEXT: movdqa %xmm3, %xmm0 4747; SSE-NEXT: pandn %xmm2, %xmm0 4748; SSE-NEXT: movdqa 288(%rdi), %xmm6 4749; SSE-NEXT: movdqa %xmm6, %xmm2 4750; SSE-NEXT: pand %xmm3, %xmm2 4751; SSE-NEXT: por %xmm0, %xmm2 4752; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4753; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,1,3] 4754; SSE-NEXT: pand %xmm10, %xmm0 4755; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] 4756; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 4757; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] 4758; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] 4759; SSE-NEXT: packuswb %xmm0, %xmm0 4760; SSE-NEXT: pand %xmm8, %xmm0 4761; SSE-NEXT: por %xmm1, %xmm0 4762; SSE-NEXT: movdqa 368(%rdi), %xmm1 4763; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4764; SSE-NEXT: movdqa %xmm13, %xmm2 4765; SSE-NEXT: pandn %xmm1, %xmm2 4766; SSE-NEXT: movdqa 352(%rdi), %xmm3 4767; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4768; SSE-NEXT: pand %xmm13, %xmm3 4769; SSE-NEXT: por %xmm2, %xmm3 4770; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4771; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,1,2,0] 4772; SSE-NEXT: pand %xmm10, %xmm2 4773; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] 4774; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,0] 4775; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] 4776; SSE-NEXT: packuswb %xmm2, %xmm2 4777; SSE-NEXT: movdqa %xmm4, %xmm3 4778; SSE-NEXT: pandn %xmm2, %xmm3 4779; SSE-NEXT: pand %xmm4, %xmm0 4780; SSE-NEXT: movdqa %xmm4, %xmm9 4781; SSE-NEXT: por %xmm0, %xmm3 4782; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4783; SSE-NEXT: movdqa 224(%rdi), %xmm1 4784; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4785; SSE-NEXT: movdqa %xmm13, %xmm0 4786; SSE-NEXT: pandn %xmm1, %xmm0 4787; SSE-NEXT: movdqa 240(%rdi), %xmm11 4788; SSE-NEXT: movdqa %xmm7, %xmm2 4789; SSE-NEXT: pandn %xmm11, %xmm2 4790; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4791; SSE-NEXT: movdqa %xmm13, %xmm2 4792; SSE-NEXT: pandn %xmm11, %xmm2 4793; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4794; SSE-NEXT: pand %xmm13, %xmm11 4795; SSE-NEXT: por %xmm0, %xmm11 4796; SSE-NEXT: movdqa %xmm11, %xmm0 4797; SSE-NEXT: pand %xmm10, %xmm0 4798; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,3,2,3,4,5,6,7] 4799; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] 4800; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] 4801; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] 4802; SSE-NEXT: packuswb %xmm2, %xmm0 4803; SSE-NEXT: movdqa %xmm8, %xmm2 4804; SSE-NEXT: pandn %xmm0, %xmm2 4805; SSE-NEXT: movdqa 208(%rdi), %xmm1 4806; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4807; SSE-NEXT: movdqa %xmm7, %xmm0 4808; SSE-NEXT: pandn %xmm1, %xmm0 4809; SSE-NEXT: movdqa 192(%rdi), %xmm3 4810; SSE-NEXT: movdqa %xmm3, %xmm1 4811; SSE-NEXT: pand %xmm7, %xmm1 4812; SSE-NEXT: por %xmm0, %xmm1 4813; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4814; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,1,3] 4815; SSE-NEXT: movdqa %xmm10, 
%xmm1 4816; SSE-NEXT: pand %xmm10, %xmm0 4817; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] 4818; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 4819; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] 4820; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] 4821; SSE-NEXT: packuswb %xmm0, %xmm0 4822; SSE-NEXT: pand %xmm8, %xmm0 4823; SSE-NEXT: movdqa %xmm8, %xmm10 4824; SSE-NEXT: por %xmm2, %xmm0 4825; SSE-NEXT: movdqa 272(%rdi), %xmm14 4826; SSE-NEXT: movdqa %xmm13, %xmm2 4827; SSE-NEXT: pandn %xmm14, %xmm2 4828; SSE-NEXT: movdqa 256(%rdi), %xmm15 4829; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4830; SSE-NEXT: pand %xmm13, %xmm15 4831; SSE-NEXT: por %xmm2, %xmm15 4832; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[3,1,2,0] 4833; SSE-NEXT: pand %xmm1, %xmm2 4834; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] 4835; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,0] 4836; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] 4837; SSE-NEXT: packuswb %xmm2, %xmm2 4838; SSE-NEXT: pandn %xmm2, %xmm4 4839; SSE-NEXT: pand %xmm9, %xmm0 4840; SSE-NEXT: por %xmm0, %xmm4 4841; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4842; SSE-NEXT: movdqa 128(%rdi), %xmm2 4843; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill 4844; SSE-NEXT: movdqa %xmm13, %xmm0 4845; SSE-NEXT: pandn %xmm2, %xmm0 4846; SSE-NEXT: movdqa 144(%rdi), %xmm9 4847; SSE-NEXT: movdqa %xmm7, %xmm4 4848; SSE-NEXT: pandn %xmm9, %xmm4 4849; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4850; SSE-NEXT: movdqa %xmm13, %xmm4 4851; SSE-NEXT: pandn %xmm9, %xmm4 4852; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4853; SSE-NEXT: pand %xmm13, %xmm9 4854; SSE-NEXT: por %xmm0, %xmm9 4855; SSE-NEXT: movdqa %xmm9, %xmm0 4856; SSE-NEXT: pand %xmm1, %xmm0 4857; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[0,3,2,3,4,5,6,7] 4858; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] 4859; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] 4860; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] 4861; SSE-NEXT: packuswb %xmm5, %xmm0 4862; SSE-NEXT: pandn %xmm0, %xmm10 4863; SSE-NEXT: movdqa %xmm13, %xmm0 4864; SSE-NEXT: movdqa %xmm13, %xmm2 4865; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4866; SSE-NEXT: pandn %xmm1, %xmm2 4867; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4868; SSE-NEXT: movdqa %xmm13, %xmm2 4869; SSE-NEXT: pandn %xmm6, %xmm2 4870; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4871; SSE-NEXT: movdqa %xmm6, %xmm5 4872; SSE-NEXT: movdqa %xmm13, %xmm2 4873; SSE-NEXT: pandn %xmm3, %xmm2 4874; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4875; SSE-NEXT: movdqa %xmm3, %xmm4 4876; SSE-NEXT: movdqa 112(%rdi), %xmm6 4877; SSE-NEXT: movdqa %xmm7, %xmm2 4878; SSE-NEXT: movdqa %xmm7, %xmm8 4879; SSE-NEXT: pandn %xmm6, %xmm8 4880; SSE-NEXT: movdqa 160(%rdi), %xmm7 4881; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4882; SSE-NEXT: pand %xmm13, %xmm7 4883; SSE-NEXT: movdqa %xmm13, %xmm3 4884; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 4885; SSE-NEXT: pandn %xmm13, %xmm3 4886; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4887; SSE-NEXT: pand %xmm0, %xmm1 4888; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4889; SSE-NEXT: movdqa %xmm2, %xmm3 4890; SSE-NEXT: movdqa %xmm2, %xmm1 4891; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 4892; SSE-NEXT: pandn %xmm2, %xmm3 4893; SSE-NEXT: movdqa 
%xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4894; SSE-NEXT: pand %xmm0, %xmm2 4895; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4896; SSE-NEXT: movdqa %xmm0, %xmm2 4897; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 4898; SSE-NEXT: pandn %xmm3, %xmm2 4899; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4900; SSE-NEXT: pand %xmm0, %xmm5 4901; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4902; SSE-NEXT: movdqa %xmm1, %xmm5 4903; SSE-NEXT: movdqa %xmm1, %xmm2 4904; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4905; SSE-NEXT: pandn %xmm1, %xmm2 4906; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4907; SSE-NEXT: pand %xmm0, %xmm1 4908; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4909; SSE-NEXT: movdqa %xmm0, %xmm2 4910; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4911; SSE-NEXT: pandn %xmm1, %xmm2 4912; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4913; SSE-NEXT: pand %xmm0, %xmm4 4914; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4915; SSE-NEXT: pandn %xmm14, %xmm5 4916; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4917; SSE-NEXT: pand %xmm0, %xmm14 4918; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4919; SSE-NEXT: movdqa %xmm0, %xmm2 4920; SSE-NEXT: pandn %xmm6, %xmm2 4921; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4922; SSE-NEXT: movdqa 96(%rdi), %xmm4 4923; SSE-NEXT: movdqa %xmm4, %xmm2 4924; SSE-NEXT: pand %xmm0, %xmm2 4925; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4926; SSE-NEXT: movdqa 176(%rdi), %xmm14 4927; SSE-NEXT: movdqa %xmm14, %xmm2 4928; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4929; SSE-NEXT: pand %xmm0, %xmm2 4930; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4931; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 4932; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4933; SSE-NEXT: pand %xmm0, %xmm2 4934; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4935; SSE-NEXT: pand %xmm0, %xmm13 4936; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4937; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 4938; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4939; SSE-NEXT: pand %xmm0, %xmm2 4940; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4941; SSE-NEXT: pand %xmm0, %xmm3 4942; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4943; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 4944; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4945; SSE-NEXT: pand %xmm0, %xmm2 4946; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4947; SSE-NEXT: pand %xmm0, %xmm1 4948; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4949; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload 4950; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4951; SSE-NEXT: pand %xmm0, %xmm1 4952; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill 4953; SSE-NEXT: movdqa %xmm0, %xmm1 4954; SSE-NEXT: pand %xmm0, %xmm6 4955; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4956; SSE-NEXT: movdqa %xmm0, %xmm13 4957; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4958; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4959; SSE-NEXT: movdqa 
%xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4960; SSE-NEXT: pandn %xmm4, %xmm1 4961; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4962; SSE-NEXT: movdqa %xmm4, %xmm3 4963; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 4964; SSE-NEXT: por %xmm8, %xmm3 4965; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,2,1,3] 4966; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] 4967; SSE-NEXT: pand %xmm1, %xmm5 4968; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7] 4969; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,1,3] 4970; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,1,4,5,6,7] 4971; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] 4972; SSE-NEXT: packuswb %xmm5, %xmm5 4973; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,0,0,65535,65535] 4974; SSE-NEXT: pand %xmm8, %xmm5 4975; SSE-NEXT: por %xmm10, %xmm5 4976; SSE-NEXT: pandn %xmm14, %xmm0 4977; SSE-NEXT: por %xmm0, %xmm7 4978; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,1,2,0] 4979; SSE-NEXT: pand %xmm1, %xmm0 4980; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] 4981; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0] 4982; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] 4983; SSE-NEXT: packuswb %xmm0, %xmm0 4984; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] 4985; SSE-NEXT: movdqa %xmm10, %xmm1 4986; SSE-NEXT: pandn %xmm0, %xmm1 4987; SSE-NEXT: pand %xmm10, %xmm5 4988; SSE-NEXT: por %xmm5, %xmm1 4989; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4990; SSE-NEXT: pxor %xmm5, %xmm5 4991; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 4992; SSE-NEXT: movdqa %xmm1, %xmm0 4993; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] 4994; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] 4995; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,2,3,3] 4996; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] 4997; SSE-NEXT: psrld $16, %xmm0 4998; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[0,1,0,3] 4999; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,7,6,7] 5000; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm0[2],xmm14[3],xmm0[3] 5001; SSE-NEXT: packuswb %xmm14, %xmm4 5002; SSE-NEXT: movdqa %xmm8, %xmm1 5003; SSE-NEXT: pandn %xmm4, %xmm1 5004; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 5005; SSE-NEXT: movdqa %xmm2, %xmm4 5006; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] 5007; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,0,3] 5008; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7] 5009; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] 5010; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,0,0,65535,65535] 5011; SSE-NEXT: movdqa %xmm0, %xmm14 5012; SSE-NEXT: pandn %xmm4, %xmm14 5013; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] 5014; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[3,1,2,3,4,5,6,7] 5015; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] 5016; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,3,2,0,4,5,6,7] 5017; SSE-NEXT: pand %xmm0, %xmm4 5018; SSE-NEXT: por 
%xmm14, %xmm4 5019; SSE-NEXT: packuswb %xmm4, %xmm4 5020; SSE-NEXT: pand %xmm8, %xmm4 5021; SSE-NEXT: por %xmm1, %xmm4 5022; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 5023; SSE-NEXT: movdqa %xmm6, %xmm1 5024; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] 5025; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm1[0,1,2,3,5,5,5,5] 5026; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,0,65535] 5027; SSE-NEXT: movdqa %xmm2, %xmm1 5028; SSE-NEXT: pandn %xmm14, %xmm1 5029; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] 5030; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm6[3,1,2,3,4,5,6,7] 5031; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,1,0,3] 5032; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,7,6,4] 5033; SSE-NEXT: pand %xmm2, %xmm14 5034; SSE-NEXT: por %xmm1, %xmm14 5035; SSE-NEXT: packuswb %xmm14, %xmm1 5036; SSE-NEXT: movdqa %xmm10, %xmm14 5037; SSE-NEXT: pandn %xmm1, %xmm14 5038; SSE-NEXT: pand %xmm10, %xmm4 5039; SSE-NEXT: por %xmm4, %xmm14 5040; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5041; SSE-NEXT: movdqa %xmm12, %xmm1 5042; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] 5043; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3],xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7] 5044; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[2,2,3,3] 5045; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] 5046; SSE-NEXT: psrld $16, %xmm1 5047; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,0,3] 5048; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,7,6,7] 5049; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3] 5050; SSE-NEXT: packuswb %xmm12, %xmm4 5051; SSE-NEXT: movdqa %xmm8, %xmm14 5052; SSE-NEXT: movdqa %xmm8, %xmm1 5053; SSE-NEXT: pandn %xmm4, %xmm1 5054; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 5055; SSE-NEXT: movdqa %xmm6, %xmm4 5056; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] 5057; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,0,3] 5058; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7] 5059; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] 5060; SSE-NEXT: movdqa %xmm0, %xmm12 5061; SSE-NEXT: pandn %xmm4, %xmm12 5062; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] 5063; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[3,1,2,3,4,5,6,7] 5064; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] 5065; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,3,2,0,4,5,6,7] 5066; SSE-NEXT: pand %xmm0, %xmm4 5067; SSE-NEXT: por %xmm12, %xmm4 5068; SSE-NEXT: packuswb %xmm4, %xmm4 5069; SSE-NEXT: pand %xmm8, %xmm4 5070; SSE-NEXT: por %xmm1, %xmm4 5071; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 5072; SSE-NEXT: movdqa %xmm6, %xmm1 5073; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] 5074; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] 5075; SSE-NEXT: movdqa %xmm2, %xmm12 5076; SSE-NEXT: pandn %xmm1, %xmm12 5077; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] 5078; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[3,1,2,3,4,5,6,7] 5079; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] 5080; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,4] 5081; SSE-NEXT: pand %xmm2, %xmm1 5082; SSE-NEXT: por %xmm12, %xmm1 5083; SSE-NEXT: packuswb %xmm1, %xmm1 5084; SSE-NEXT: movdqa %xmm10, %xmm12 5085; SSE-NEXT: pandn %xmm1, %xmm12 5086; SSE-NEXT: pand %xmm10, %xmm4 5087; SSE-NEXT: por %xmm4, %xmm12 5088; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5089; SSE-NEXT: movdqa %xmm11, %xmm1 5090; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] 5091; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3],xmm11[4],xmm5[4],xmm11[5],xmm5[5],xmm11[6],xmm5[6],xmm11[7],xmm5[7] 5092; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[2,2,3,3] 5093; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] 5094; SSE-NEXT: psrld $16, %xmm1 5095; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[0,1,0,3] 5096; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7] 5097; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm1[2],xmm8[3],xmm1[3] 5098; SSE-NEXT: packuswb %xmm8, %xmm4 5099; SSE-NEXT: movdqa %xmm14, %xmm1 5100; SSE-NEXT: pandn %xmm4, %xmm1 5101; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 5102; SSE-NEXT: movdqa %xmm6, %xmm4 5103; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] 5104; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,0,3] 5105; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7] 5106; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] 5107; SSE-NEXT: movdqa %xmm0, %xmm8 5108; SSE-NEXT: pandn %xmm4, %xmm8 5109; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] 5110; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[3,1,2,3,4,5,6,7] 5111; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] 5112; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,3,2,0,4,5,6,7] 5113; SSE-NEXT: pand %xmm0, %xmm4 5114; SSE-NEXT: por %xmm8, %xmm4 5115; SSE-NEXT: packuswb %xmm4, %xmm4 5116; SSE-NEXT: pand %xmm14, %xmm4 5117; SSE-NEXT: por %xmm1, %xmm4 5118; SSE-NEXT: movdqa %xmm15, %xmm1 5119; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] 5120; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] 5121; SSE-NEXT: movdqa %xmm2, %xmm8 5122; SSE-NEXT: pandn %xmm1, %xmm8 5123; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm5[8],xmm15[9],xmm5[9],xmm15[10],xmm5[10],xmm15[11],xmm5[11],xmm15[12],xmm5[12],xmm15[13],xmm5[13],xmm15[14],xmm5[14],xmm15[15],xmm5[15] 5124; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm15[3,1,2,3,4,5,6,7] 5125; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] 5126; SSE-NEXT: 
pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,4] 5127; SSE-NEXT: pand %xmm2, %xmm1 5128; SSE-NEXT: por %xmm8, %xmm1 5129; SSE-NEXT: packuswb %xmm1, %xmm1 5130; SSE-NEXT: movdqa %xmm10, %xmm8 5131; SSE-NEXT: pandn %xmm1, %xmm8 5132; SSE-NEXT: pand %xmm10, %xmm4 5133; SSE-NEXT: por %xmm4, %xmm8 5134; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5135; SSE-NEXT: movdqa %xmm9, %xmm1 5136; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] 5137; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] 5138; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[2,2,3,3] 5139; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] 5140; SSE-NEXT: psrld $16, %xmm1 5141; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[0,1,0,3] 5142; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7] 5143; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] 5144; SSE-NEXT: packuswb %xmm6, %xmm4 5145; SSE-NEXT: movdqa %xmm3, %xmm1 5146; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] 5147; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] 5148; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7] 5149; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7] 5150; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] 5151; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] 5152; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] 5153; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,3,2,0,4,5,6,7] 5154; SSE-NEXT: pand %xmm0, %xmm3 5155; SSE-NEXT: pandn %xmm1, %xmm0 5156; SSE-NEXT: por %xmm3, %xmm0 5157; SSE-NEXT: packuswb %xmm0, %xmm0 5158; SSE-NEXT: movdqa %xmm14, %xmm1 5159; SSE-NEXT: pand %xmm14, %xmm0 5160; SSE-NEXT: pandn %xmm4, %xmm1 5161; SSE-NEXT: por %xmm1, %xmm0 5162; SSE-NEXT: movdqa %xmm7, %xmm1 5163; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] 5164; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] 5165; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] 5166; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[3,1,2,3,4,5,6,7] 5167; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] 5168; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,4] 5169; SSE-NEXT: pand %xmm2, %xmm3 5170; SSE-NEXT: pandn %xmm1, %xmm2 5171; SSE-NEXT: por %xmm3, %xmm2 5172; SSE-NEXT: packuswb %xmm2, %xmm1 5173; SSE-NEXT: movdqa %xmm10, %xmm2 5174; SSE-NEXT: pandn %xmm1, %xmm2 5175; SSE-NEXT: pand %xmm10, %xmm0 5176; SSE-NEXT: por %xmm0, %xmm2 5177; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5178; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,0,65535,65535,0] 5179; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 5180; SSE-NEXT: pand %xmm7, %xmm4 5181; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 5182; SSE-NEXT: movdqa %xmm4, %xmm0 5183; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,255,255,255,255] 5184; SSE-NEXT: 
pand %xmm12, %xmm0 5185; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,7,6,7] 5186; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 5187; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7] 5188; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] 5189; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] 5190; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] 5191; SSE-NEXT: packuswb %xmm1, %xmm0 5192; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 5193; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload 5194; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[2,1,2,3,4,5,6,7] 5195; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] 5196; SSE-NEXT: pand %xmm12, %xmm1 5197; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] 5198; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] 5199; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,5,5,5,5] 5200; SSE-NEXT: packuswb %xmm2, %xmm2 5201; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] 5202; SSE-NEXT: movdqa %xmm5, %xmm3 5203; SSE-NEXT: pandn %xmm2, %xmm3 5204; SSE-NEXT: pand %xmm5, %xmm0 5205; SSE-NEXT: por %xmm0, %xmm3 5206; SSE-NEXT: movdqa %xmm13, %xmm0 5207; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 5208; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5209; SSE-NEXT: por %xmm0, %xmm1 5210; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5211; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,3,2,3,4,5,6,7] 5212; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] 5213; SSE-NEXT: pand %xmm12, %xmm0 5214; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] 5215; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] 5216; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] 5217; SSE-NEXT: packuswb %xmm0, %xmm0 5218; SSE-NEXT: movdqa %xmm10, %xmm2 5219; SSE-NEXT: pandn %xmm0, %xmm2 5220; SSE-NEXT: pand %xmm10, %xmm3 5221; SSE-NEXT: por %xmm3, %xmm2 5222; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5223; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 5224; SSE-NEXT: pand %xmm7, %xmm13 5225; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload 5226; SSE-NEXT: movdqa %xmm13, %xmm0 5227; SSE-NEXT: pand %xmm12, %xmm0 5228; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,7,6,7] 5229; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 5230; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,3,4,5,6,7] 5231; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] 5232; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] 5233; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] 5234; SSE-NEXT: packuswb %xmm2, %xmm0 5235; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 5236; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload 5237; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[2,1,2,3,4,5,6,7] 5238; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] 5239; SSE-NEXT: pand %xmm12, %xmm2 5240; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] 5241; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] 5242; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] 5243; SSE-NEXT: packuswb %xmm2, %xmm2 5244; SSE-NEXT: movdqa %xmm5, %xmm3 5245; SSE-NEXT: pandn %xmm2, %xmm3 5246; SSE-NEXT: pand %xmm5, %xmm0 5247; SSE-NEXT: por %xmm0, %xmm3 5248; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5249; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 5250; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm11 # 16-byte Reload 5251; SSE-NEXT: por %xmm0, %xmm11 5252; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,3,2,3,4,5,6,7] 5253; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] 5254; SSE-NEXT: pand %xmm12, %xmm0 5255; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] 5256; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] 5257; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] 5258; SSE-NEXT: packuswb %xmm0, %xmm0 5259; SSE-NEXT: movdqa %xmm10, %xmm2 5260; SSE-NEXT: pandn %xmm0, %xmm2 5261; SSE-NEXT: pand %xmm10, %xmm3 5262; SSE-NEXT: movdqa %xmm10, %xmm9 5263; SSE-NEXT: por %xmm3, %xmm2 5264; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5265; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 5266; SSE-NEXT: pand %xmm7, %xmm10 5267; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload 5268; SSE-NEXT: movdqa %xmm10, %xmm0 5269; SSE-NEXT: pand %xmm12, %xmm0 5270; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,7,6,7] 5271; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 5272; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,3,4,5,6,7] 5273; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] 5274; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] 5275; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] 5276; SSE-NEXT: packuswb %xmm2, %xmm0 5277; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5278; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 5279; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5280; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,1,2,3,4,5,6,7] 5281; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] 5282; SSE-NEXT: pand %xmm12, %xmm2 5283; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] 5284; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] 5285; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] 5286; SSE-NEXT: packuswb %xmm2, %xmm2 5287; SSE-NEXT: movdqa %xmm5, %xmm3 5288; SSE-NEXT: pandn %xmm2, %xmm3 5289; SSE-NEXT: pand %xmm5, %xmm0 5290; SSE-NEXT: por %xmm0, %xmm3 5291; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5292; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 5293; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 5294; SSE-NEXT: por %xmm0, %xmm8 5295; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,3,2,3,4,5,6,7] 5296; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] 5297; SSE-NEXT: pand %xmm12, %xmm0 5298; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] 5299; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] 5300; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] 5301; SSE-NEXT: packuswb %xmm0, %xmm0 5302; SSE-NEXT: movdqa %xmm9, %xmm2 5303; SSE-NEXT: pandn %xmm0, %xmm2 5304; SSE-NEXT: pand %xmm9, %xmm3 5305; SSE-NEXT: movdqa %xmm9, %xmm1 5306; SSE-NEXT: por %xmm3, %xmm2 5307; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5308; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5309; SSE-NEXT: pand %xmm7, %xmm0 5310; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 5311; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5312; SSE-NEXT: pand %xmm12, %xmm0 5313; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,7,6,7] 5314; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 5315; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,3,4,5,6,7] 5316; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] 5317; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] 5318; SSE-NEXT: pshufhw {{.*#+}} xmm0 = 
xmm0[0,1,2,3,4,4,5,6] 5319; SSE-NEXT: packuswb %xmm2, %xmm0 5320; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 5321; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 5322; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm15[2,1,2,3,4,5,6,7] 5323; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] 5324; SSE-NEXT: pand %xmm12, %xmm2 5325; SSE-NEXT: movdqa %xmm12, %xmm9 5326; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] 5327; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] 5328; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] 5329; SSE-NEXT: packuswb %xmm2, %xmm2 5330; SSE-NEXT: movdqa %xmm5, %xmm3 5331; SSE-NEXT: pandn %xmm2, %xmm3 5332; SSE-NEXT: pand %xmm5, %xmm0 5333; SSE-NEXT: por %xmm0, %xmm3 5334; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5335; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 5336; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 5337; SSE-NEXT: por %xmm0, %xmm12 5338; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,3,2,3,4,5,6,7] 5339; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] 5340; SSE-NEXT: pand %xmm9, %xmm0 5341; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] 5342; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] 5343; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] 5344; SSE-NEXT: packuswb %xmm0, %xmm0 5345; SSE-NEXT: movdqa %xmm1, %xmm9 5346; SSE-NEXT: movdqa %xmm1, %xmm2 5347; SSE-NEXT: pandn %xmm0, %xmm2 5348; SSE-NEXT: pand %xmm1, %xmm3 5349; SSE-NEXT: por %xmm3, %xmm2 5350; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5351; SSE-NEXT: movdqa %xmm4, %xmm0 5352; SSE-NEXT: pxor %xmm1, %xmm1 5353; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 5354; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] 5355; SSE-NEXT: pxor %xmm7, %xmm7 5356; SSE-NEXT: movdqa %xmm4, %xmm2 5357; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[3,0] 5358; SSE-NEXT: movaps %xmm0, %xmm3 5359; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[0,2] 5360; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm4[0,0] 5361; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[2,3] 5362; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,7,5,6,7] 5363; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] 5364; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] 5365; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] 5366; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] 5367; SSE-NEXT: packuswb %xmm0, %xmm2 5368; SSE-NEXT: movdqa %xmm6, %xmm0 5369; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] 5370; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] 5371; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] 5372; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,0,65535,65535,65535,65535] 5373; SSE-NEXT: movdqa %xmm3, %xmm4 5374; SSE-NEXT: pandn %xmm0, %xmm4 5375; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] 5376; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,3,2,1] 5377; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] 5378; SSE-NEXT: pshufhw {{.*#+}} xmm0 = 
xmm0[0,1,2,3,7,7,7,7] 5379; SSE-NEXT: pand %xmm3, %xmm0 5380; SSE-NEXT: por %xmm4, %xmm0 5381; SSE-NEXT: packuswb %xmm0, %xmm0 5382; SSE-NEXT: movdqa %xmm5, %xmm6 5383; SSE-NEXT: pandn %xmm0, %xmm6 5384; SSE-NEXT: pand %xmm5, %xmm2 5385; SSE-NEXT: por %xmm2, %xmm6 5386; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5387; SSE-NEXT: movdqa %xmm1, %xmm0 5388; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] 5389; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] 5390; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,5] 5391; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,0,65535,65535,0] 5392; SSE-NEXT: movdqa %xmm4, %xmm2 5393; SSE-NEXT: pandn %xmm0, %xmm2 5394; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] 5395; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,0,3] 5396; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] 5397; SSE-NEXT: pand %xmm4, %xmm0 5398; SSE-NEXT: por %xmm2, %xmm0 5399; SSE-NEXT: packuswb %xmm0, %xmm0 5400; SSE-NEXT: movdqa %xmm9, %xmm2 5401; SSE-NEXT: pandn %xmm0, %xmm2 5402; SSE-NEXT: pand %xmm9, %xmm6 5403; SSE-NEXT: por %xmm6, %xmm2 5404; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5405; SSE-NEXT: movdqa %xmm13, %xmm0 5406; SSE-NEXT: pxor %xmm1, %xmm1 5407; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 5408; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm1[8],xmm13[9],xmm1[9],xmm13[10],xmm1[10],xmm13[11],xmm1[11],xmm13[12],xmm1[12],xmm13[13],xmm1[13],xmm13[14],xmm1[14],xmm13[15],xmm1[15] 5409; SSE-NEXT: movdqa %xmm13, %xmm2 5410; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[3,0] 5411; SSE-NEXT: movaps %xmm0, %xmm6 5412; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm2[0,2] 5413; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm13[0,0] 5414; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm13[2,3] 5415; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,7,5,6,7] 5416; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] 5417; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] 5418; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] 5419; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] 5420; SSE-NEXT: packuswb %xmm0, %xmm2 5421; SSE-NEXT: movdqa %xmm14, %xmm0 5422; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 5423; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] 5424; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] 5425; SSE-NEXT: movdqa %xmm3, %xmm6 5426; SSE-NEXT: pandn %xmm0, %xmm6 5427; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm1[8],xmm14[9],xmm1[9],xmm14[10],xmm1[10],xmm14[11],xmm1[11],xmm14[12],xmm1[12],xmm14[13],xmm1[13],xmm14[14],xmm1[14],xmm14[15],xmm1[15] 5428; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,3,2,1] 5429; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] 5430; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] 5431; SSE-NEXT: pand %xmm3, %xmm0 5432; SSE-NEXT: por %xmm6, %xmm0 5433; SSE-NEXT: packuswb %xmm0, %xmm0 5434; SSE-NEXT: movdqa %xmm5, %xmm6 5435; SSE-NEXT: pandn %xmm0, %xmm6 5436; SSE-NEXT: pand %xmm5, %xmm2 5437; SSE-NEXT: por %xmm2, %xmm6 5438; SSE-NEXT: movdqa %xmm11, %xmm0 5439; SSE-NEXT: 
punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 5440; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] 5441; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,5] 5442; SSE-NEXT: movdqa %xmm4, %xmm2 5443; SSE-NEXT: pandn %xmm0, %xmm2 5444; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3],xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7] 5445; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,2,0,3] 5446; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] 5447; SSE-NEXT: pand %xmm4, %xmm0 5448; SSE-NEXT: por %xmm2, %xmm0 5449; SSE-NEXT: packuswb %xmm0, %xmm0 5450; SSE-NEXT: movdqa %xmm9, %xmm2 5451; SSE-NEXT: pandn %xmm0, %xmm2 5452; SSE-NEXT: pand %xmm9, %xmm6 5453; SSE-NEXT: por %xmm6, %xmm2 5454; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5455; SSE-NEXT: movdqa %xmm10, %xmm0 5456; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 5457; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm1[8],xmm10[9],xmm1[9],xmm10[10],xmm1[10],xmm10[11],xmm1[11],xmm10[12],xmm1[12],xmm10[13],xmm1[13],xmm10[14],xmm1[14],xmm10[15],xmm1[15] 5458; SSE-NEXT: movdqa %xmm10, %xmm2 5459; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[3,0] 5460; SSE-NEXT: movaps %xmm0, %xmm6 5461; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm2[0,2] 5462; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm10[0,0] 5463; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[2,3] 5464; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,7,5,6,7] 5465; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] 5466; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] 5467; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] 5468; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] 5469; SSE-NEXT: packuswb %xmm0, %xmm2 5470; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 5471; SSE-NEXT: movdqa %xmm7, %xmm0 5472; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 5473; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] 5474; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] 5475; SSE-NEXT: movdqa %xmm3, %xmm6 5476; SSE-NEXT: pandn %xmm0, %xmm6 5477; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15] 5478; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,3,2,1] 5479; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] 5480; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] 5481; SSE-NEXT: pand %xmm3, %xmm0 5482; SSE-NEXT: por %xmm6, %xmm0 5483; SSE-NEXT: packuswb %xmm0, %xmm0 5484; SSE-NEXT: movdqa %xmm5, %xmm6 5485; SSE-NEXT: pandn %xmm0, %xmm6 5486; SSE-NEXT: pand %xmm5, %xmm2 5487; SSE-NEXT: por %xmm2, %xmm6 5488; SSE-NEXT: movdqa %xmm8, %xmm0 5489; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 5490; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] 5491; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,5] 5492; SSE-NEXT: movdqa %xmm4, %xmm2 5493; SSE-NEXT: pandn %xmm0, %xmm2 5494; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = 
xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] 5495; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,2,0,3] 5496; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] 5497; SSE-NEXT: pand %xmm4, %xmm0 5498; SSE-NEXT: por %xmm2, %xmm0 5499; SSE-NEXT: packuswb %xmm0, %xmm2 5500; SSE-NEXT: movdqa %xmm9, %xmm0 5501; SSE-NEXT: pandn %xmm2, %xmm0 5502; SSE-NEXT: pand %xmm9, %xmm6 5503; SSE-NEXT: por %xmm6, %xmm0 5504; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5505; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5506; SSE-NEXT: movdqa %xmm0, %xmm2 5507; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 5508; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 5509; SSE-NEXT: movdqa %xmm0, %xmm6 5510; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[3,0] 5511; SSE-NEXT: movaps %xmm2, %xmm7 5512; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm6[0,2] 5513; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0] 5514; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3] 5515; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,7,5,6,7] 5516; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2] 5517; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] 5518; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] 5519; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] 5520; SSE-NEXT: packuswb %xmm2, %xmm6 5521; SSE-NEXT: movdqa %xmm15, %xmm2 5522; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 5523; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] 5524; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,1,4,5,6,7] 5525; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm1[8],xmm15[9],xmm1[9],xmm15[10],xmm1[10],xmm15[11],xmm1[11],xmm15[12],xmm1[12],xmm15[13],xmm1[13],xmm15[14],xmm1[14],xmm15[15],xmm1[15] 5526; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm15[0,3,2,1] 5527; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,3,4,5,6,7] 5528; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,7,7,7] 5529; SSE-NEXT: pand %xmm3, %xmm7 5530; SSE-NEXT: pandn %xmm2, %xmm3 5531; SSE-NEXT: por %xmm7, %xmm3 5532; SSE-NEXT: pand %xmm5, %xmm6 5533; SSE-NEXT: packuswb %xmm3, %xmm3 5534; SSE-NEXT: pandn %xmm3, %xmm5 5535; SSE-NEXT: por %xmm6, %xmm5 5536; SSE-NEXT: movdqa %xmm12, %xmm2 5537; SSE-NEXT: pxor %xmm0, %xmm0 5538; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 5539; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] 5540; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] 5541; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3],xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] 5542; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,2,0,3] 5543; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] 5544; SSE-NEXT: pand %xmm4, %xmm3 5545; SSE-NEXT: pandn %xmm2, %xmm4 5546; SSE-NEXT: por %xmm3, %xmm4 5547; SSE-NEXT: pand %xmm9, %xmm5 5548; SSE-NEXT: packuswb %xmm4, %xmm2 5549; SSE-NEXT: pandn %xmm2, %xmm9 5550; SSE-NEXT: por %xmm5, %xmm9 5551; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill 5552; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5553; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 5554; SSE-NEXT: movdqa %xmm0, %xmm1 5555; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255] 5556; SSE-NEXT: pand %xmm10, %xmm1 5557; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,1,2,3] 5558; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] 5559; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] 5560; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] 5561; SSE-NEXT: packuswb %xmm2, %xmm1 5562; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] 5563; SSE-NEXT: movdqa %xmm15, %xmm2 5564; SSE-NEXT: pandn %xmm1, %xmm2 5565; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 5566; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload 5567; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,1,2,0] 5568; SSE-NEXT: pand %xmm10, %xmm1 5569; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] 5570; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] 5571; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm1[2,1,0,3,4,5,6,7] 5572; SSE-NEXT: packuswb %xmm6, %xmm6 5573; SSE-NEXT: pand %xmm15, %xmm6 5574; SSE-NEXT: por %xmm2, %xmm6 5575; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 5576; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,0,65535,65535,0] 5577; SSE-NEXT: pand %xmm11, %xmm13 5578; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload 5579; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,2,1,3] 5580; SSE-NEXT: pand %xmm10, %xmm1 5581; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] 5582; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] 5583; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] 5584; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] 5585; SSE-NEXT: packuswb %xmm1, %xmm2 5586; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,0,0] 5587; SSE-NEXT: movdqa %xmm1, %xmm3 5588; SSE-NEXT: pandn %xmm2, %xmm3 5589; SSE-NEXT: pand %xmm1, %xmm6 5590; SSE-NEXT: por %xmm6, %xmm3 5591; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5592; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 5593; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload 5594; SSE-NEXT: movdqa %xmm14, %xmm2 5595; SSE-NEXT: pand %xmm10, %xmm2 5596; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,1,2,3] 5597; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7] 5598; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] 5599; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] 5600; SSE-NEXT: packuswb %xmm6, %xmm2 5601; SSE-NEXT: movdqa %xmm15, %xmm6 5602; SSE-NEXT: pandn %xmm2, %xmm6 5603; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 5604; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 5605; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[3,1,2,0] 5606; SSE-NEXT: pand %xmm10, %xmm2 5607; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] 5608; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3] 5609; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm2[2,1,0,3,4,5,6,7] 5610; SSE-NEXT: packuswb %xmm7, %xmm7 5611; SSE-NEXT: pand %xmm15, %xmm7 5612; SSE-NEXT: por %xmm6, %xmm7 5613; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 5614; SSE-NEXT: pand %xmm11, %xmm3 5615; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 5616; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,1,3] 5617; SSE-NEXT: pand %xmm10, %xmm2 5618; SSE-NEXT: 
pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7] 5619; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] 5620; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] 5621; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] 5622; SSE-NEXT: packuswb %xmm2, %xmm6 5623; SSE-NEXT: movdqa %xmm1, %xmm2 5624; SSE-NEXT: pandn %xmm6, %xmm2 5625; SSE-NEXT: pand %xmm1, %xmm7 5626; SSE-NEXT: por %xmm7, %xmm2 5627; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5628; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 5629; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload 5630; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5631; SSE-NEXT: pand %xmm10, %xmm6 5632; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,1,2,3] 5633; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,1,2,3,4,5,6,7] 5634; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,7] 5635; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2] 5636; SSE-NEXT: packuswb %xmm7, %xmm6 5637; SSE-NEXT: movdqa %xmm15, %xmm7 5638; SSE-NEXT: pandn %xmm6, %xmm7 5639; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 5640; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 5641; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5642; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[3,1,2,0] 5643; SSE-NEXT: pand %xmm10, %xmm6 5644; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,6,7] 5645; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,0,3] 5646; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm6[2,1,0,3,4,5,6,7] 5647; SSE-NEXT: packuswb %xmm8, %xmm8 5648; SSE-NEXT: pand %xmm15, %xmm8 5649; SSE-NEXT: por %xmm7, %xmm8 5650; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 5651; SSE-NEXT: pand %xmm11, %xmm2 5652; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 5653; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5654; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,2,1,3] 5655; SSE-NEXT: pand %xmm10, %xmm6 5656; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,2,1,4,5,6,7] 5657; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,1,3] 5658; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7] 5659; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,4,7] 5660; SSE-NEXT: packuswb %xmm6, %xmm7 5661; SSE-NEXT: movdqa %xmm1, %xmm2 5662; SSE-NEXT: pandn %xmm7, %xmm2 5663; SSE-NEXT: pand %xmm1, %xmm8 5664; SSE-NEXT: por %xmm8, %xmm2 5665; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5666; SSE-NEXT: movdqa (%rsp), %xmm7 # 16-byte Reload 5667; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 5668; SSE-NEXT: movdqa %xmm7, (%rsp) # 16-byte Spill 5669; SSE-NEXT: pand %xmm10, %xmm7 5670; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[2,1,2,3] 5671; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7] 5672; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7] 5673; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,2] 5674; SSE-NEXT: packuswb %xmm8, %xmm7 5675; SSE-NEXT: movdqa %xmm15, %xmm8 5676; SSE-NEXT: pandn %xmm7, %xmm8 5677; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 5678; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload 5679; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[3,1,2,0] 5680; SSE-NEXT: pand %xmm10, %xmm7 5681; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,6,7] 5682; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,0,3] 5683; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm7[2,1,0,3,4,5,6,7] 5684; SSE-NEXT: packuswb %xmm9, %xmm9 5685; SSE-NEXT: pand %xmm15, %xmm9 5686; SSE-NEXT: por %xmm8, 
%xmm9 5687; SSE-NEXT: movdqa %xmm11, %xmm2 5688; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 5689; SSE-NEXT: pand %xmm11, %xmm7 5690; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 5691; SSE-NEXT: por %xmm7, %xmm2 5692; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,2,1,3] 5693; SSE-NEXT: pand %xmm10, %xmm7 5694; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,2,1,4,5,6,7] 5695; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,1,3] 5696; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7] 5697; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7] 5698; SSE-NEXT: packuswb %xmm7, %xmm8 5699; SSE-NEXT: movdqa %xmm1, %xmm7 5700; SSE-NEXT: pandn %xmm8, %xmm7 5701; SSE-NEXT: pand %xmm1, %xmm9 5702; SSE-NEXT: por %xmm9, %xmm7 5703; SSE-NEXT: movdqa %xmm0, %xmm8 5704; SSE-NEXT: pxor %xmm5, %xmm5 5705; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15] 5706; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] 5707; SSE-NEXT: movdqa %xmm0, %xmm9 5708; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm8[0,0] 5709; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm8[2,3] 5710; SSE-NEXT: psrlq $48, %xmm8 5711; SSE-NEXT: psrldq {{.*#+}} xmm9 = xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 5712; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] 5713; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[3,1,2,3,4,5,6,7] 5714; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] 5715; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,5,7] 5716; SSE-NEXT: packuswb %xmm9, %xmm8 5717; SSE-NEXT: movdqa %xmm15, %xmm10 5718; SSE-NEXT: pandn %xmm8, %xmm10 5719; SSE-NEXT: movdqa %xmm12, %xmm8 5720; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15] 5721; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,2,3] 5722; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] 5723; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,0,65535,65535,65535] 5724; SSE-NEXT: movdqa %xmm0, %xmm11 5725; SSE-NEXT: pandn %xmm8, %xmm11 5726; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3],xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7] 5727; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm12[0,1,2,3,7,5,6,7] 5728; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] 5729; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm8[3,1,1,2,4,5,6,7] 5730; SSE-NEXT: pand %xmm0, %xmm12 5731; SSE-NEXT: por %xmm11, %xmm12 5732; SSE-NEXT: packuswb %xmm12, %xmm12 5733; SSE-NEXT: pand %xmm15, %xmm12 5734; SSE-NEXT: por %xmm10, %xmm12 5735; SSE-NEXT: movdqa %xmm13, %xmm8 5736; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm5[8],xmm13[9],xmm5[9],xmm13[10],xmm5[10],xmm13[11],xmm5[11],xmm13[12],xmm5[12],xmm13[13],xmm5[13],xmm13[14],xmm5[14],xmm13[15],xmm5[15] 5737; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm13[0,1,2,3,7,5,6,7] 5738; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,2,0] 5739; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,7,4] 5740; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,65535,0,65535,0,0] 5741; SSE-NEXT: movdqa %xmm11, %xmm13 5742; SSE-NEXT: pandn %xmm10, %xmm13 5743; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = 
xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] 5744; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,1,1] 5745; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] 5746; SSE-NEXT: pand %xmm11, %xmm8 5747; SSE-NEXT: por %xmm8, %xmm13 5748; SSE-NEXT: packuswb %xmm13, %xmm10 5749; SSE-NEXT: movdqa %xmm1, %xmm8 5750; SSE-NEXT: pandn %xmm10, %xmm8 5751; SSE-NEXT: pand %xmm1, %xmm12 5752; SSE-NEXT: por %xmm12, %xmm8 5753; SSE-NEXT: movdqa %xmm14, %xmm9 5754; SSE-NEXT: movdqa %xmm14, %xmm10 5755; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm5[8],xmm10[9],xmm5[9],xmm10[10],xmm5[10],xmm10[11],xmm5[11],xmm10[12],xmm5[12],xmm10[13],xmm5[13],xmm10[14],xmm5[14],xmm10[15],xmm5[15] 5756; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] 5757; SSE-NEXT: movdqa %xmm9, %xmm12 5758; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm10[0,0] 5759; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm10[2,3] 5760; SSE-NEXT: psrlq $48, %xmm10 5761; SSE-NEXT: psrldq {{.*#+}} xmm12 = xmm12[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 5762; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] 5763; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm9[3,1,2,3,4,5,6,7] 5764; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,0,3] 5765; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,5,7] 5766; SSE-NEXT: packuswb %xmm12, %xmm10 5767; SSE-NEXT: movdqa %xmm15, %xmm12 5768; SSE-NEXT: pandn %xmm10, %xmm12 5769; SSE-NEXT: movdqa %xmm4, %xmm10 5770; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm5[8],xmm10[9],xmm5[9],xmm10[10],xmm5[10],xmm10[11],xmm5[11],xmm10[12],xmm5[12],xmm10[13],xmm5[13],xmm10[14],xmm5[14],xmm10[15],xmm5[15] 5771; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,2,3] 5772; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5] 5773; SSE-NEXT: movdqa %xmm0, %xmm14 5774; SSE-NEXT: pandn %xmm10, %xmm14 5775; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] 5776; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm4[0,1,2,3,7,5,6,7] 5777; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] 5778; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm10[3,1,1,2,4,5,6,7] 5779; SSE-NEXT: pand %xmm0, %xmm13 5780; SSE-NEXT: por %xmm14, %xmm13 5781; SSE-NEXT: packuswb %xmm13, %xmm13 5782; SSE-NEXT: pand %xmm15, %xmm13 5783; SSE-NEXT: por %xmm12, %xmm13 5784; SSE-NEXT: movdqa %xmm3, %xmm10 5785; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] 5786; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm3[0,1,2,3,7,5,6,7] 5787; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,2,0] 5788; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,5,7,4] 5789; SSE-NEXT: movdqa %xmm11, %xmm14 5790; SSE-NEXT: pandn %xmm12, %xmm14 5791; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3],xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] 5792; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,3,1,1] 5793; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] 5794; SSE-NEXT: pand %xmm11, %xmm10 5795; SSE-NEXT: por %xmm10, %xmm14 5796; SSE-NEXT: packuswb %xmm14, %xmm10 5797; SSE-NEXT: movdqa %xmm1, %xmm12 5798; SSE-NEXT: pandn %xmm10, 
%xmm12 5799; SSE-NEXT: pand %xmm1, %xmm13 5800; SSE-NEXT: por %xmm13, %xmm12 5801; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 5802; SSE-NEXT: movdqa %xmm9, %xmm10 5803; SSE-NEXT: pxor %xmm3, %xmm3 5804; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm3[8],xmm10[9],xmm3[9],xmm10[10],xmm3[10],xmm10[11],xmm3[11],xmm10[12],xmm3[12],xmm10[13],xmm3[13],xmm10[14],xmm3[14],xmm10[15],xmm3[15] 5805; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3],xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] 5806; SSE-NEXT: pxor %xmm4, %xmm4 5807; SSE-NEXT: movdqa %xmm9, %xmm13 5808; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm10[0,0] 5809; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm10[2,3] 5810; SSE-NEXT: psrlq $48, %xmm10 5811; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 5812; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] 5813; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm9[3,1,2,3,4,5,6,7] 5814; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,0,3] 5815; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,5,7] 5816; SSE-NEXT: packuswb %xmm13, %xmm10 5817; SSE-NEXT: movdqa %xmm15, %xmm13 5818; SSE-NEXT: pandn %xmm10, %xmm13 5819; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 5820; SSE-NEXT: movdqa %xmm3, %xmm10 5821; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15] 5822; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,2,3] 5823; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5] 5824; SSE-NEXT: movdqa %xmm0, %xmm9 5825; SSE-NEXT: pandn %xmm10, %xmm9 5826; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] 5827; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm3[0,1,2,3,7,5,6,7] 5828; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] 5829; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm10[3,1,1,2,4,5,6,7] 5830; SSE-NEXT: pand %xmm0, %xmm14 5831; SSE-NEXT: por %xmm9, %xmm14 5832; SSE-NEXT: packuswb %xmm14, %xmm14 5833; SSE-NEXT: pand %xmm15, %xmm14 5834; SSE-NEXT: por %xmm13, %xmm14 5835; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 5836; SSE-NEXT: movdqa %xmm3, %xmm9 5837; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] 5838; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm3[0,1,2,3,7,5,6,7] 5839; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,2,0] 5840; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,7,4] 5841; SSE-NEXT: movdqa %xmm11, %xmm13 5842; SSE-NEXT: pandn %xmm10, %xmm13 5843; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3],xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] 5844; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,3,1,1] 5845; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] 5846; SSE-NEXT: pand %xmm11, %xmm9 5847; SSE-NEXT: por %xmm9, %xmm13 5848; SSE-NEXT: packuswb %xmm13, %xmm9 5849; SSE-NEXT: movdqa %xmm1, %xmm13 5850; SSE-NEXT: pandn %xmm9, %xmm13 5851; SSE-NEXT: pand %xmm1, %xmm14 5852; SSE-NEXT: por %xmm14, %xmm13 5853; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload 5854; SSE-NEXT: movdqa %xmm3, %xmm9 5855; SSE-NEXT: punpckhbw 
{{.*#+}} xmm9 = xmm9[8],xmm4[8],xmm9[9],xmm4[9],xmm9[10],xmm4[10],xmm9[11],xmm4[11],xmm9[12],xmm4[12],xmm9[13],xmm4[13],xmm9[14],xmm4[14],xmm9[15],xmm4[15] 5856; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] 5857; SSE-NEXT: movdqa %xmm3, %xmm10 5858; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm9[0,0] 5859; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm9[2,3] 5860; SSE-NEXT: psrlq $48, %xmm9 5861; SSE-NEXT: psrldq {{.*#+}} xmm10 = xmm10[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 5862; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] 5863; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm3[3,1,2,3,4,5,6,7] 5864; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,0,3] 5865; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,5,7] 5866; SSE-NEXT: packuswb %xmm10, %xmm9 5867; SSE-NEXT: movdqa %xmm6, %xmm10 5868; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm5[8],xmm10[9],xmm5[9],xmm10[10],xmm5[10],xmm10[11],xmm5[11],xmm10[12],xmm5[12],xmm10[13],xmm5[13],xmm10[14],xmm5[14],xmm10[15],xmm5[15] 5869; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,2,3] 5870; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5] 5871; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] 5872; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm6[0,1,2,3,7,5,6,7] 5873; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,2,2,3] 5874; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[3,1,1,2,4,5,6,7] 5875; SSE-NEXT: pand %xmm0, %xmm14 5876; SSE-NEXT: pandn %xmm10, %xmm0 5877; SSE-NEXT: por %xmm14, %xmm0 5878; SSE-NEXT: packuswb %xmm0, %xmm0 5879; SSE-NEXT: pand %xmm15, %xmm0 5880; SSE-NEXT: pandn %xmm9, %xmm15 5881; SSE-NEXT: por %xmm15, %xmm0 5882; SSE-NEXT: movdqa %xmm2, %xmm4 5883; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] 5884; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] 5885; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,1,1] 5886; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] 5887; SSE-NEXT: pand %xmm11, %xmm4 5888; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,7,5,6,7] 5889; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] 5890; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,7,4] 5891; SSE-NEXT: pandn %xmm5, %xmm11 5892; SSE-NEXT: por %xmm4, %xmm11 5893; SSE-NEXT: pand %xmm1, %xmm0 5894; SSE-NEXT: packuswb %xmm11, %xmm4 5895; SSE-NEXT: pandn %xmm4, %xmm1 5896; SSE-NEXT: por %xmm0, %xmm1 5897; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5898; SSE-NEXT: movaps %xmm0, 16(%rsi) 5899; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5900; SSE-NEXT: movaps %xmm0, 32(%rsi) 5901; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5902; SSE-NEXT: movaps %xmm0, 48(%rsi) 5903; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5904; SSE-NEXT: movaps %xmm0, (%rsi) 5905; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5906; SSE-NEXT: movaps %xmm0, 16(%rdx) 5907; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5908; SSE-NEXT: movaps %xmm0, 32(%rdx) 5909; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 
16-byte Reload 5910; SSE-NEXT: movaps %xmm0, 48(%rdx) 5911; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5912; SSE-NEXT: movaps %xmm0, (%rdx) 5913; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5914; SSE-NEXT: movaps %xmm0, 16(%rcx) 5915; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5916; SSE-NEXT: movaps %xmm0, 32(%rcx) 5917; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5918; SSE-NEXT: movaps %xmm0, 48(%rcx) 5919; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5920; SSE-NEXT: movaps %xmm0, (%rcx) 5921; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5922; SSE-NEXT: movaps %xmm0, 16(%r8) 5923; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5924; SSE-NEXT: movaps %xmm0, 32(%r8) 5925; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5926; SSE-NEXT: movaps %xmm0, 48(%r8) 5927; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5928; SSE-NEXT: movaps %xmm0, (%r8) 5929; SSE-NEXT: movdqa %xmm7, 16(%r9) 5930; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5931; SSE-NEXT: movaps %xmm0, 32(%r9) 5932; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5933; SSE-NEXT: movaps %xmm0, 48(%r9) 5934; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5935; SSE-NEXT: movaps %xmm0, (%r9) 5936; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax 5937; SSE-NEXT: movdqa %xmm1, 16(%rax) 5938; SSE-NEXT: movdqa %xmm13, 32(%rax) 5939; SSE-NEXT: movdqa %xmm12, 48(%rax) 5940; SSE-NEXT: movdqa %xmm8, (%rax) 5941; SSE-NEXT: addq $792, %rsp # imm = 0x318 5942; SSE-NEXT: retq 5943; 5944; AVX-LABEL: load_i8_stride6_vf64: 5945; AVX: # %bb.0: 5946; AVX-NEXT: subq $616, %rsp # imm = 0x268 5947; AVX-NEXT: vmovdqa (%rdi), %xmm2 5948; AVX-NEXT: vmovdqa 16(%rdi), %xmm7 5949; AVX-NEXT: vmovdqa 32(%rdi), %xmm15 5950; AVX-NEXT: vmovdqa 48(%rdi), %xmm6 5951; AVX-NEXT: vmovdqa 224(%rdi), %xmm8 5952; AVX-NEXT: vmovdqa 240(%rdi), %xmm9 5953; AVX-NEXT: vmovdqa 208(%rdi), %xmm10 5954; AVX-NEXT: vmovdqa 192(%rdi), %xmm11 5955; AVX-NEXT: vmovq {{.*#+}} xmm3 = [128,128,128,4,10,0,0,0,0,0,0,0,0,0,0,0] 5956; AVX-NEXT: vmovq {{.*#+}} xmm12 = [2,8,14,128,128,0,0,0,0,0,0,0,0,0,0,0] 5957; AVX-NEXT: vmovd {{.*#+}} xmm13 = [2,8,14,0,0,0,0,0,0,0,0,0,0,0,0,0] 5958; AVX-NEXT: vpshufb %xmm13, %xmm6, %xmm0 5959; AVX-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,0,6,12,0,0,6,12,0,0,6,12,0,0,6,12] 5960; AVX-NEXT: vpshufb %xmm14, %xmm15, %xmm1 5961; AVX-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5962; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] 5963; AVX-NEXT: vpshufb %xmm3, %xmm7, %xmm0 5964; AVX-NEXT: vmovdqa %xmm2, %xmm4 5965; AVX-NEXT: vpshufb %xmm12, %xmm2, %xmm2 5966; AVX-NEXT: vpor %xmm0, %xmm2, %xmm2 5967; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [0,0,0,0,0,255,255,255,255,255,255,u,u,u,u,u] 5968; AVX-NEXT: vpblendvb %xmm5, %xmm1, %xmm2, %xmm1 5969; AVX-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5970; AVX-NEXT: vpshufb %xmm13, %xmm9, %xmm1 5971; AVX-NEXT: vpshufb %xmm14, %xmm8, %xmm2 5972; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 5973; AVX-NEXT: vpshufb %xmm3, %xmm10, %xmm2 5974; AVX-NEXT: vpshufb %xmm12, %xmm11, %xmm3 5975; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 5976; AVX-NEXT: vpblendvb %xmm5, %xmm1, %xmm2, %xmm1 5977; AVX-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5978; AVX-NEXT: vmovd {{.*#+}} xmm13 = [3,9,15,0,0,0,0,0,0,0,0,0,0,0,0,0] 5979; AVX-NEXT: vpshufb %xmm13, %xmm6, %xmm1 
5980; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,1,7,13,0,1,7,13,0,1,7,13,0,1,7,13] 5981; AVX-NEXT: vpshufb %xmm0, %xmm15, %xmm2 5982; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 5983; AVX-NEXT: vmovq {{.*#+}} xmm15 = [128,128,128,5,11,0,0,0,0,0,0,0,0,0,0,0] 5984; AVX-NEXT: vmovq {{.*#+}} xmm12 = [3,9,15,128,128,0,0,0,0,0,0,0,0,0,0,0] 5985; AVX-NEXT: vpshufb %xmm15, %xmm7, %xmm2 5986; AVX-NEXT: vmovdqa %xmm7, %xmm14 5987; AVX-NEXT: vpshufb %xmm12, %xmm4, %xmm3 5988; AVX-NEXT: vmovdqa %xmm4, %xmm7 5989; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 5990; AVX-NEXT: vpblendvb %xmm5, %xmm1, %xmm2, %xmm1 5991; AVX-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5992; AVX-NEXT: vpshufb %xmm13, %xmm9, %xmm1 5993; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5994; AVX-NEXT: vpshufb %xmm0, %xmm8, %xmm2 5995; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 5996; AVX-NEXT: vpshufb %xmm15, %xmm10, %xmm2 5997; AVX-NEXT: vpshufb %xmm12, %xmm11, %xmm3 5998; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 5999; AVX-NEXT: vpblendvb %xmm5, %xmm1, %xmm2, %xmm0 6000; AVX-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6001; AVX-NEXT: vmovq {{.*#+}} xmm12 = [4,10,128,128,128,0,0,0,0,0,0,0,0,0,0,0] 6002; AVX-NEXT: vpshufb %xmm12, %xmm4, %xmm0 6003; AVX-NEXT: vmovq {{.*#+}} xmm13 = [128,128,0,6,12,0,0,0,0,0,0,0,0,0,0,0] 6004; AVX-NEXT: vpshufb %xmm13, %xmm14, %xmm1 6005; AVX-NEXT: vpor %xmm0, %xmm1, %xmm1 6006; AVX-NEXT: vbroadcastss {{.*#+}} xmm4 = [4,10,0,0,4,10,0,0,4,10,0,0,4,10,0,0] 6007; AVX-NEXT: vpshufb %xmm4, %xmm6, %xmm0 6008; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,2,8,14,0,2,8,14,0,2,8,14,0,2,8,14] 6009; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 6010; AVX-NEXT: vpshufb %xmm3, %xmm5, %xmm2 6011; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] 6012; AVX-NEXT: vmovq {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] 6013; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 6014; AVX-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6015; AVX-NEXT: vpshufb %xmm12, %xmm11, %xmm1 6016; AVX-NEXT: vpshufb %xmm13, %xmm10, %xmm2 6017; AVX-NEXT: vmovdqa %xmm10, %xmm12 6018; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 6019; AVX-NEXT: vpshufb %xmm4, %xmm9, %xmm2 6020; AVX-NEXT: vmovdqa %xmm9, %xmm15 6021; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6022; AVX-NEXT: vpshufb %xmm3, %xmm8, %xmm3 6023; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] 6024; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 6025; AVX-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6026; AVX-NEXT: vmovq {{.*#+}} xmm8 = [5,11,128,128,128,0,0,0,0,0,0,0,0,0,0,0] 6027; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6028; AVX-NEXT: vpshufb %xmm8, %xmm7, %xmm1 6029; AVX-NEXT: vmovq {{.*#+}} xmm3 = [128,128,1,7,13,0,0,0,0,0,0,0,0,0,0,0] 6030; AVX-NEXT: vmovdqa %xmm14, %xmm13 6031; AVX-NEXT: vpshufb %xmm3, %xmm14, %xmm2 6032; AVX-NEXT: vmovdqa %xmm3, %xmm14 6033; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 6034; AVX-NEXT: vbroadcastss {{.*#+}} xmm9 = [5,11,0,0,5,11,0,0,5,11,0,0,5,11,0,0] 6035; AVX-NEXT: vpshufb %xmm9, %xmm6, %xmm2 6036; AVX-NEXT: vmovdqa %xmm6, %xmm10 6037; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6038; AVX-NEXT: vbroadcastss {{.*#+}} xmm4 = [0,3,9,15,0,3,9,15,0,3,9,15,0,3,9,15] 6039; AVX-NEXT: vpshufb %xmm4, %xmm5, %xmm3 6040; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] 6041; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 6042; AVX-NEXT: vmovdqu %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6043; AVX-NEXT: vpshufb %xmm8, %xmm11, %xmm1 6044; AVX-NEXT: vmovdqa %xmm11, %xmm8 6045; AVX-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6046; AVX-NEXT: vpshufb %xmm14, %xmm12, %xmm2 6047; AVX-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6048; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 6049; AVX-NEXT: vpshufb %xmm9, %xmm15, %xmm2 6050; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 6051; AVX-NEXT: vpshufb %xmm4, %xmm6, %xmm3 6052; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] 6053; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 6054; AVX-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6055; AVX-NEXT: vmovq {{.*#+}} xmm3 = [128,128,128,2,8,14,0,0,0,0,0,0,0,0,0,0] 6056; AVX-NEXT: vmovdqa 112(%rdi), %xmm0 6057; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6058; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 6059; AVX-NEXT: vmovq {{.*#+}} xmm15 = [0,6,12,128,128,128,0,0,0,0,0,0,0,0,0,0] 6060; AVX-NEXT: vmovdqa 96(%rdi), %xmm1 6061; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6062; AVX-NEXT: vpshufb %xmm15, %xmm1, %xmm1 6063; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 6064; AVX-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,128,128,128,4,10,0,0,0,128,128,128,4,10] 6065; AVX-NEXT: # xmm11 = mem[0,0] 6066; AVX-NEXT: vmovdqa 80(%rdi), %xmm1 6067; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6068; AVX-NEXT: vpshufb %xmm11, %xmm1, %xmm1 6069; AVX-NEXT: vmovdqa 64(%rdi), %xmm2 6070; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6071; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero 6072; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 6073; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm2 6074; AVX-NEXT: vmovd {{.*#+}} xmm14 = [0,0,4,10,0,0,0,0,0,0,0,0,0,0,0,0] 6075; AVX-NEXT: vpshufb %xmm14, %xmm5, %xmm4 6076; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,6,12,0,0,6,12,0,0,6,12,0,0,6,12,0] 6077; AVX-NEXT: vpshufb %xmm1, %xmm10, %xmm5 6078; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] 6079; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6080; AVX-NEXT: vpshufb %xmm3, %xmm13, %xmm5 6081; AVX-NEXT: vpshufb %xmm15, %xmm7, %xmm9 6082; AVX-NEXT: vpor %xmm5, %xmm9, %xmm5 6083; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4,5],xmm5[6,7] 6084; AVX-NEXT: vmovaps {{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] 6085; AVX-NEXT: vandnps %ymm2, %ymm13, %ymm2 6086; AVX-NEXT: vandps %ymm4, %ymm13, %ymm4 6087; AVX-NEXT: vorps %ymm2, %ymm4, %ymm9 6088; AVX-NEXT: vmovdqa 128(%rdi), %xmm2 6089; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6090; AVX-NEXT: vpshufb %xmm14, %xmm2, %xmm2 6091; AVX-NEXT: vmovdqa 144(%rdi), %xmm4 6092; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6093; AVX-NEXT: vpshufb %xmm1, %xmm4, %xmm4 6094; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] 6095; AVX-NEXT: vmovdqa 176(%rdi), %xmm2 6096; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6097; AVX-NEXT: vpshufb %xmm11, %xmm2, %xmm2 6098; AVX-NEXT: vmovdqa 160(%rdi), %xmm5 6099; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6100; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,0,2,8,14,128,128,0,0,0,2,8,14,128,128] 6101; AVX-NEXT: # xmm0 = mem[0,0] 6102; AVX-NEXT: vpshufb %xmm0, %xmm5, %xmm10 6103; AVX-NEXT: vpor %xmm2, %xmm10, %xmm10 6104; AVX-NEXT: 
vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] 6105; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm10, %xmm10 6106; AVX-NEXT: vmovdqa %ymm2, %ymm5 6107; AVX-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] 6108; AVX-NEXT: vandps %ymm7, %ymm9, %ymm9 6109; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 6110; AVX-NEXT: vandnps %ymm10, %ymm7, %ymm10 6111; AVX-NEXT: vorps %ymm10, %ymm9, %ymm4 6112; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6113; AVX-NEXT: vmovdqa 304(%rdi), %xmm2 6114; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6115; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm9 6116; AVX-NEXT: vmovdqa 288(%rdi), %xmm2 6117; AVX-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill 6118; AVX-NEXT: vpshufb %xmm15, %xmm2, %xmm10 6119; AVX-NEXT: vpor %xmm9, %xmm10, %xmm9 6120; AVX-NEXT: vmovdqa 272(%rdi), %xmm2 6121; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6122; AVX-NEXT: vpshufb %xmm11, %xmm2, %xmm10 6123; AVX-NEXT: vmovdqa 256(%rdi), %xmm2 6124; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6125; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm11 6126; AVX-NEXT: vpor %xmm10, %xmm11, %xmm10 6127; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 6128; AVX-NEXT: vpshufb %xmm14, %xmm6, %xmm10 6129; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 6130; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm11 6131; AVX-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] 6132; AVX-NEXT: vpshufb %xmm3, %xmm12, %xmm3 6133; AVX-NEXT: vpshufb %xmm15, %xmm8, %xmm6 6134; AVX-NEXT: vpor %xmm3, %xmm6, %xmm3 6135; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm10[3,4,5],xmm3[6,7] 6136; AVX-NEXT: vandnps %ymm9, %ymm13, %ymm6 6137; AVX-NEXT: vandps %ymm3, %ymm13, %ymm3 6138; AVX-NEXT: vmovaps %ymm13, %ymm11 6139; AVX-NEXT: vorps %ymm6, %ymm3, %ymm3 6140; AVX-NEXT: vmovdqa 320(%rdi), %xmm4 6141; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6142; AVX-NEXT: vpshufb %xmm14, %xmm4, %xmm0 6143; AVX-NEXT: vmovdqa 336(%rdi), %xmm10 6144; AVX-NEXT: vpshufb %xmm1, %xmm10, %xmm1 6145; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6146; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 6147; AVX-NEXT: vmovdqa 368(%rdi), %xmm1 6148; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6149; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[4,10] 6150; AVX-NEXT: vmovdqa 352(%rdi), %xmm4 6151; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6152; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero 6153; AVX-NEXT: vpor %xmm1, %xmm6, %xmm1 6154; AVX-NEXT: vpblendvb %xmm5, %xmm0, %xmm1, %xmm0 6155; AVX-NEXT: vandps %ymm7, %ymm3, %ymm1 6156; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 6157; AVX-NEXT: vandnps %ymm0, %ymm7, %ymm0 6158; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0 6159; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6160; AVX-NEXT: vmovq {{.*#+}} xmm8 = [128,128,128,3,9,15,0,0,0,0,0,0,0,0,0,0] 6161; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6162; AVX-NEXT: vpshufb %xmm8, %xmm0, %xmm0 6163; AVX-NEXT: vmovq {{.*#+}} xmm4 = [1,7,13,128,128,128,0,0,0,0,0,0,0,0,0,0] 6164; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 6165; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm1 6166; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 
6167; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 6168; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[5,11] 6169; AVX-NEXT: vmovddup {{.*#+}} xmm9 = [0,0,0,3,9,15,128,128,0,0,0,3,9,15,128,128] 6170; AVX-NEXT: # xmm9 = mem[0,0] 6171; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 6172; AVX-NEXT: vpshufb %xmm9, %xmm5, %xmm14 6173; AVX-NEXT: vpor %xmm1, %xmm14, %xmm1 6174; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm14 6175; AVX-NEXT: vmovd {{.*#+}} xmm3 = [0,0,5,11,0,0,0,0,0,0,0,0,0,0,0,0] 6176; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6177; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 6178; AVX-NEXT: vbroadcastss {{.*#+}} xmm7 = [1,7,13,0,1,7,13,0,1,7,13,0,1,7,13,0] 6179; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 6180; AVX-NEXT: vpshufb %xmm7, %xmm6, %xmm15 6181; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] 6182; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 6183; AVX-NEXT: vpshufb %xmm8, %xmm6, %xmm15 6184; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 6185; AVX-NEXT: vpshufb %xmm4, %xmm6, %xmm13 6186; AVX-NEXT: vpor %xmm15, %xmm13, %xmm13 6187; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3,4,5],xmm13[6,7] 6188; AVX-NEXT: vandnps %ymm14, %ymm11, %ymm13 6189; AVX-NEXT: vandps %ymm0, %ymm11, %ymm0 6190; AVX-NEXT: vorps %ymm0, %ymm13, %ymm0 6191; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 6192; AVX-NEXT: vpshufb %xmm3, %xmm12, %xmm13 6193; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 6194; AVX-NEXT: vpshufb %xmm7, %xmm11, %xmm14 6195; AVX-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] 6196; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 6197; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,0,128,128,128,5,11,0,0,0,128,128,128,5,11] 6198; AVX-NEXT: # xmm1 = mem[0,0] 6199; AVX-NEXT: vpshufb %xmm1, %xmm6, %xmm14 6200; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 6201; AVX-NEXT: vpshufb %xmm9, %xmm15, %xmm15 6202; AVX-NEXT: vpor %xmm14, %xmm15, %xmm14 6203; AVX-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] 6204; AVX-NEXT: vpblendvb %xmm15, %xmm13, %xmm14, %xmm13 6205; AVX-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] 6206; AVX-NEXT: vandps %ymm0, %ymm15, %ymm0 6207; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 6208; AVX-NEXT: vandnps %ymm13, %ymm15, %ymm13 6209; AVX-NEXT: vorps %ymm0, %ymm13, %ymm0 6210; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6211; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6212; AVX-NEXT: vpshufb %xmm8, %xmm0, %xmm0 6213; AVX-NEXT: vmovdqa (%rsp), %xmm13 # 16-byte Reload 6214; AVX-NEXT: vpshufb %xmm4, %xmm13, %xmm13 6215; AVX-NEXT: vpor %xmm0, %xmm13, %xmm0 6216; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 6217; AVX-NEXT: vpshufb %xmm1, %xmm13, %xmm13 6218; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 6219; AVX-NEXT: vpshufb %xmm9, %xmm14, %xmm14 6220; AVX-NEXT: vpor %xmm13, %xmm14, %xmm13 6221; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm13, %ymm0 6222; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 6223; AVX-NEXT: vpshufb %xmm3, %xmm13, %xmm13 6224; AVX-NEXT: vpshufb %xmm7, %xmm2, %xmm14 6225; AVX-NEXT: 
vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] 6226; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 6227; AVX-NEXT: vpshufb %xmm8, %xmm2, %xmm8 6228; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 6229; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm4 6230; AVX-NEXT: vpor %xmm4, %xmm8, %xmm4 6231; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm13[3,4,5],xmm4[6,7] 6232; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] 6233; AVX-NEXT: vandnps %ymm0, %ymm1, %ymm0 6234; AVX-NEXT: vandps %ymm1, %ymm4, %ymm4 6235; AVX-NEXT: vorps %ymm0, %ymm4, %ymm0 6236; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 6237; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm3 6238; AVX-NEXT: vpshufb %xmm7, %xmm10, %xmm1 6239; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] 6240; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 6241; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm14[5,11] 6242; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 6243; AVX-NEXT: vpshufb %xmm9, %xmm4, %xmm4 6244; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 6245; AVX-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] 6246; AVX-NEXT: vpblendvb %xmm8, %xmm1, %xmm3, %xmm1 6247; AVX-NEXT: vandps %ymm0, %ymm15, %ymm0 6248; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 6249; AVX-NEXT: vandnps %ymm1, %ymm15, %ymm1 6250; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0 6251; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6252; AVX-NEXT: vmovq {{.*#+}} xmm9 = [128,128,128,4,10,0,0,0,0,0,0,0,0,0,0,0] 6253; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 6254; AVX-NEXT: vpshufb %xmm9, %xmm10, %xmm0 6255; AVX-NEXT: vmovq {{.*#+}} xmm13 = [2,8,14,128,128,0,0,0,0,0,0,0,0,0,0,0] 6256; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 6257; AVX-NEXT: vpshufb %xmm13, %xmm1, %xmm1 6258; AVX-NEXT: vpor %xmm0, %xmm1, %xmm1 6259; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,0,4,10,128,128,128,0,0,0,4,10,128,128,128] 6260; AVX-NEXT: # xmm0 = mem[0,0] 6261; AVX-NEXT: vpshufb %xmm0, %xmm5, %xmm4 6262; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,0,128,128,0,6,12,0,0,0,128,128,0,6,12] 6263; AVX-NEXT: # xmm3 = mem[0,0] 6264; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 6265; AVX-NEXT: vpshufb %xmm3, %xmm5, %xmm5 6266; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4 6267; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 6268; AVX-NEXT: vmovdqa %ymm8, %ymm9 6269; AVX-NEXT: vandnps %ymm1, %ymm8, %ymm1 6270; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm4 # 32-byte Folded Reload 6271; AVX-NEXT: vorps %ymm1, %ymm4, %ymm4 6272; AVX-NEXT: vmovd {{.*#+}} xmm13 = [2,8,14,0,0,0,0,0,0,0,0,0,0,0,0,0] 6273; AVX-NEXT: vpshufb %xmm13, %xmm11, %xmm1 6274; AVX-NEXT: vbroadcastss {{.*#+}} xmm8 = [0,0,6,12,0,0,6,12,0,0,6,12,0,0,6,12] 6275; AVX-NEXT: vpshufb %xmm8, %xmm12, %xmm5 6276; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0] 6277; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 6278; AVX-NEXT: vpshufb %xmm0, %xmm7, %xmm5 6279; AVX-NEXT: vmovdqa %xmm6, %xmm15 6280; AVX-NEXT: vpshufb %xmm3, %xmm6, %xmm6 6281; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5 6282; AVX-NEXT: vpblendvb %xmm9, %xmm1, %xmm5, %xmm5 6283; AVX-NEXT: vmovaps {{.*#+}} ymm1 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] 6284; AVX-NEXT: vandps %ymm1, %ymm4, %ymm4 6285; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 6286; AVX-NEXT: vandnps %ymm5, %ymm1, %ymm5 6287; AVX-NEXT: vorps %ymm5, %ymm4, %ymm1 6288; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6289; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 6290; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm12[4,10,u,u,u,u,u,u,u,u,u,u,u] 6291; AVX-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload 6292; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[2,8,14],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] 6293; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4 6294; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 6295; AVX-NEXT: vpshufb %xmm0, %xmm11, %xmm5 6296; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 6297; AVX-NEXT: vpshufb %xmm3, %xmm6, %xmm6 6298; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5 6299; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 6300; AVX-NEXT: vandnps %ymm4, %ymm9, %ymm4 6301; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm5 # 32-byte Folded Reload 6302; AVX-NEXT: vorps %ymm4, %ymm5, %ymm4 6303; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 6304; AVX-NEXT: vpshufb %xmm13, %xmm1, %xmm5 6305; AVX-NEXT: vpshufb %xmm8, %xmm2, %xmm6 6306; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm6[0],xmm5[0] 6307; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 6308; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm0 6309; AVX-NEXT: vpshufb %xmm3, %xmm14, %xmm3 6310; AVX-NEXT: vpor %xmm0, %xmm3, %xmm0 6311; AVX-NEXT: vpblendvb %xmm9, %xmm5, %xmm0, %xmm0 6312; AVX-NEXT: vmovaps {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] 6313; AVX-NEXT: vandps %ymm5, %ymm4, %ymm3 6314; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 6315; AVX-NEXT: vandnps %ymm0, %ymm5, %ymm0 6316; AVX-NEXT: vorps %ymm0, %ymm3, %ymm0 6317; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6318; AVX-NEXT: vmovq {{.*#+}} xmm14 = [128,128,128,5,11,0,0,0,0,0,0,0,0,0,0,0] 6319; AVX-NEXT: vpshufb %xmm14, %xmm10, %xmm0 6320; AVX-NEXT: vmovq {{.*#+}} xmm13 = [3,9,15,128,128,0,0,0,0,0,0,0,0,0,0,0] 6321; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 6322; AVX-NEXT: vpshufb %xmm13, %xmm14, %xmm3 6323; AVX-NEXT: vpor %xmm0, %xmm3, %xmm0 6324; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,0,5,11,128,128,128,0,0,0,5,11,128,128,128] 6325; AVX-NEXT: # xmm3 = mem[0,0] 6326; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 6327; AVX-NEXT: vpshufb %xmm3, %xmm4, %xmm4 6328; AVX-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,0,128,128,1,7,13,0,0,0,128,128,1,7,13] 6329; AVX-NEXT: # xmm5 = mem[0,0] 6330; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 6331; AVX-NEXT: vpshufb %xmm5, %xmm10, %xmm6 6332; AVX-NEXT: vpor %xmm4, %xmm6, %xmm4 6333; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 6334; AVX-NEXT: vmovdqa %ymm9, %ymm13 6335; AVX-NEXT: vandnps %ymm0, %ymm9, %ymm0 6336; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload 6337; AVX-NEXT: vorps %ymm0, %ymm4, %ymm0 6338; AVX-NEXT: vmovd {{.*#+}} xmm8 = [3,9,15,0,0,0,0,0,0,0,0,0,0,0,0,0] 6339; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 6340; AVX-NEXT: vpshufb %xmm8, %xmm4, %xmm4 6341; AVX-NEXT: vbroadcastss {{.*#+}} xmm9 = [0,1,7,13,0,1,7,13,0,1,7,13,0,1,7,13] 6342; AVX-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 6343; AVX-NEXT: vpshufb %xmm9, %xmm6, %xmm6 6344; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0] 6345; AVX-NEXT: vpshufb %xmm3, %xmm7, %xmm6 6346; AVX-NEXT: vpshufb %xmm5, %xmm15, %xmm7 6347; AVX-NEXT: vmovdqa %xmm15, %xmm8 6348; AVX-NEXT: vpor %xmm6, %xmm7, %xmm6 6349; AVX-NEXT: vpblendvb %xmm13, %xmm4, %xmm6, %xmm4 6350; AVX-NEXT: vmovaps {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] 6351; AVX-NEXT: vandps %ymm7, %ymm0, %ymm0 6352; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 6353; AVX-NEXT: vandnps %ymm4, %ymm7, %ymm4 6354; AVX-NEXT: vorps %ymm4, %ymm0, %ymm0 6355; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6356; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm12[5,11,u,u,u,u,u,u,u,u,u,u,u] 6357; AVX-NEXT: vmovdqa (%rsp), %xmm12 # 16-byte Reload 6358; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm12[3,9,15],zero,zero,xmm12[u,u,u,u,u,u,u,u,u,u,u] 6359; AVX-NEXT: vpor %xmm0, %xmm4, %xmm0 6360; AVX-NEXT: vpshufb %xmm3, %xmm11, %xmm4 6361; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 6362; AVX-NEXT: vpshufb %xmm5, %xmm15, %xmm6 6363; AVX-NEXT: vpor %xmm4, %xmm6, %xmm4 6364; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 6365; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[3,9,15,u,u,u,u,u,u,u,u,u,u,u,u,u] 6366; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 6367; AVX-NEXT: vpshufb %xmm9, %xmm1, %xmm6 6368; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0] 6369; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm3 6370; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 6371; AVX-NEXT: vpshufb %xmm5, %xmm11, %xmm5 6372; AVX-NEXT: vpor %xmm3, %xmm5, %xmm3 6373; AVX-NEXT: vpblendvb %xmm13, %xmm4, %xmm3, %xmm3 6374; AVX-NEXT: vandnps %ymm0, %ymm13, %ymm0 6375; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload 6376; AVX-NEXT: vorps %ymm0, %ymm2, %ymm0 6377; AVX-NEXT: vandps %ymm7, %ymm0, %ymm0 6378; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm2 6379; AVX-NEXT: vandnps %ymm2, %ymm7, %ymm2 6380; AVX-NEXT: vorps %ymm2, %ymm0, %ymm0 6381; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6382; AVX-NEXT: vmovq {{.*#+}} xmm1 = [4,10,128,128,128,0,0,0,0,0,0,0,0,0,0,0] 6383; AVX-NEXT: vpshufb %xmm1, %xmm14, %xmm0 6384; AVX-NEXT: vmovq {{.*#+}} xmm14 = [128,128,0,6,12,0,0,0,0,0,0,0,0,0,0,0] 6385; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 6386; AVX-NEXT: vpshufb %xmm14, %xmm13, %xmm2 6387; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 6388; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,128,128,128,2,8,14,0,0,128,128,128,2,8,14] 6389; AVX-NEXT: # xmm3 = mem[0,0] 6390; AVX-NEXT: vpshufb %xmm3, %xmm10, %xmm2 6391; AVX-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,0,6,12,128,128,128,0,0,0,6,12,128,128,128] 6392; AVX-NEXT: # xmm4 = mem[0,0] 6393; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 6394; AVX-NEXT: vpshufb %xmm4, %xmm5, %xmm5 6395; AVX-NEXT: vpor %xmm2, %xmm5, %xmm2 6396; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm2 6397; AVX-NEXT: vmovaps {{.*#+}} ymm10 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] 6398; AVX-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm5 # 32-byte Folded Reload 6399; AVX-NEXT: vandps %ymm2, %ymm10, %ymm2 6400; AVX-NEXT: vorps %ymm5, %ymm2, %ymm2 6401; AVX-NEXT: vpshufb %xmm3, %xmm8, %xmm5 6402; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6403; AVX-NEXT: vpshufb 
%xmm4, %xmm0, %xmm6 6404; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5 6405; AVX-NEXT: vbroadcastss {{.*#+}} xmm8 = [4,10,0,0,4,10,0,0,4,10,0,0,4,10,0,0] 6406; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 6407; AVX-NEXT: vpshufb %xmm8, %xmm14, %xmm6 6408; AVX-NEXT: vbroadcastss {{.*#+}} xmm9 = [0,2,8,14,0,2,8,14,0,2,8,14,0,2,8,14] 6409; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6410; AVX-NEXT: vpshufb %xmm9, %xmm0, %xmm7 6411; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm7[1],xmm6[1] 6412; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7] 6413; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] 6414; AVX-NEXT: vandps %ymm0, %ymm2, %ymm2 6415; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 6416; AVX-NEXT: vandnps %ymm5, %ymm0, %ymm5 6417; AVX-NEXT: vorps %ymm5, %ymm2, %ymm2 6418; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6419; AVX-NEXT: vpshufb %xmm1, %xmm12, %xmm5 6420; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 6421; AVX-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm1[0,6,12,u,u,u,u,u,u,u,u,u,u,u] 6422; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5 6423; AVX-NEXT: vpshufb %xmm3, %xmm15, %xmm6 6424; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 6425; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm7 6426; AVX-NEXT: vpor %xmm6, %xmm7, %xmm6 6427; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 6428; AVX-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm6 # 32-byte Folded Reload 6429; AVX-NEXT: vandps %ymm5, %ymm10, %ymm5 6430; AVX-NEXT: vorps %ymm6, %ymm5, %ymm5 6431; AVX-NEXT: vpshufb %xmm3, %xmm11, %xmm3 6432; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 6433; AVX-NEXT: vpshufb %xmm4, %xmm11, %xmm4 6434; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 6435; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 6436; AVX-NEXT: vpshufb %xmm8, %xmm12, %xmm4 6437; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 6438; AVX-NEXT: vpshufb %xmm9, %xmm1, %xmm6 6439; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm6[1],xmm4[1] 6440; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3,4],xmm3[5,6,7] 6441; AVX-NEXT: vandps %ymm0, %ymm5, %ymm4 6442; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 6443; AVX-NEXT: vandnps %ymm3, %ymm0, %ymm3 6444; AVX-NEXT: vorps %ymm3, %ymm4, %ymm2 6445; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6446; AVX-NEXT: vmovq {{.*#+}} xmm15 = [5,11,128,128,128,0,0,0,0,0,0,0,0,0,0,0] 6447; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 6448; AVX-NEXT: vpshufb %xmm15, %xmm2, %xmm3 6449; AVX-NEXT: vmovq {{.*#+}} xmm4 = [128,128,1,7,13,0,0,0,0,0,0,0,0,0,0,0] 6450; AVX-NEXT: vpshufb %xmm4, %xmm13, %xmm5 6451; AVX-NEXT: vpor %xmm3, %xmm5, %xmm3 6452; AVX-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,128,128,128,3,9,15,0,0,128,128,128,3,9,15] 6453; AVX-NEXT: # xmm5 = mem[0,0] 6454; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 6455; AVX-NEXT: vpshufb %xmm5, %xmm2, %xmm7 6456; AVX-NEXT: vmovddup {{.*#+}} xmm6 = [0,0,1,7,13,128,128,128,0,0,1,7,13,128,128,128] 6457; AVX-NEXT: # xmm6 = mem[0,0] 6458; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 6459; AVX-NEXT: vpshufb %xmm6, %xmm2, %xmm8 6460; AVX-NEXT: vpor %xmm7, %xmm8, %xmm7 6461; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm7, %ymm3 6462; AVX-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm7 # 32-byte Folded Reload 6463; AVX-NEXT: vandps %ymm3, %ymm10, %ymm3 
6464; AVX-NEXT: vorps %ymm7, %ymm3, %ymm3 6465; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 6466; AVX-NEXT: vpshufb %xmm5, %xmm2, %xmm7 6467; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6468; AVX-NEXT: vpshufb %xmm6, %xmm0, %xmm8 6469; AVX-NEXT: vpor %xmm7, %xmm8, %xmm7 6470; AVX-NEXT: vbroadcastss {{.*#+}} xmm13 = [5,11,0,0,5,11,0,0,5,11,0,0,5,11,0,0] 6471; AVX-NEXT: vpshufb %xmm13, %xmm14, %xmm8 6472; AVX-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,3,9,15,0,3,9,15,0,3,9,15,0,3,9,15] 6473; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6474; AVX-NEXT: vpshufb %xmm14, %xmm0, %xmm9 6475; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm9[1],xmm8[1] 6476; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3,4],xmm7[5,6,7] 6477; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] 6478; AVX-NEXT: vandps %ymm2, %ymm3, %ymm3 6479; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 6480; AVX-NEXT: vandnps %ymm7, %ymm2, %ymm7 6481; AVX-NEXT: vorps %ymm7, %ymm3, %ymm3 6482; AVX-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload 6483; AVX-NEXT: vpshufb %xmm15, %xmm7, %xmm7 6484; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 6485; AVX-NEXT: vpshufb %xmm4, %xmm8, %xmm8 6486; AVX-NEXT: vpor %xmm7, %xmm8, %xmm7 6487; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 6488; AVX-NEXT: vpshufb %xmm5, %xmm4, %xmm8 6489; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 6490; AVX-NEXT: vpshufb %xmm6, %xmm4, %xmm9 6491; AVX-NEXT: vpor %xmm8, %xmm9, %xmm8 6492; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 6493; AVX-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm8 # 32-byte Folded Reload 6494; AVX-NEXT: vandps %ymm7, %ymm10, %ymm0 6495; AVX-NEXT: vorps %ymm0, %ymm8, %ymm0 6496; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 6497; AVX-NEXT: vpshufb %xmm5, %xmm4, %xmm5 6498; AVX-NEXT: vpshufb %xmm6, %xmm11, %xmm6 6499; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5 6500; AVX-NEXT: vpshufb %xmm13, %xmm12, %xmm6 6501; AVX-NEXT: vpshufb %xmm14, %xmm1, %xmm7 6502; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm7[1],xmm6[1] 6503; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7] 6504; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 6505; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 6506; AVX-NEXT: vandnps %ymm5, %ymm2, %ymm1 6507; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0 6508; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 6509; AVX-NEXT: vmovaps %ymm1, 32(%rsi) 6510; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 6511; AVX-NEXT: vmovaps %ymm1, (%rsi) 6512; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 6513; AVX-NEXT: vmovaps %ymm1, 32(%rdx) 6514; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 6515; AVX-NEXT: vmovaps %ymm1, (%rdx) 6516; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 6517; AVX-NEXT: vmovaps %ymm1, 32(%rcx) 6518; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 6519; AVX-NEXT: vmovaps %ymm1, (%rcx) 6520; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 6521; AVX-NEXT: vmovaps %ymm1, 32(%r8) 6522; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 6523; AVX-NEXT: vmovaps %ymm1, (%r8) 6524; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 6525; AVX-NEXT: vmovaps %ymm1, 32(%r9) 6526; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 6527; 
AVX-NEXT: vmovaps %ymm1, (%r9) 6528; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax 6529; AVX-NEXT: vmovaps %ymm0, 32(%rax) 6530; AVX-NEXT: vmovaps %ymm3, (%rax) 6531; AVX-NEXT: addq $616, %rsp # imm = 0x268 6532; AVX-NEXT: vzeroupper 6533; AVX-NEXT: retq 6534; 6535; AVX2-LABEL: load_i8_stride6_vf64: 6536; AVX2: # %bb.0: 6537; AVX2-NEXT: subq $328, %rsp # imm = 0x148 6538; AVX2-NEXT: vmovdqa 192(%rdi), %ymm7 6539; AVX2-NEXT: vmovdqa (%rdi), %ymm3 6540; AVX2-NEXT: vmovdqa 32(%rdi), %ymm5 6541; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6542; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0 6543; AVX2-NEXT: vmovdqa 96(%rdi), %ymm1 6544; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,65535,0,0,65535,0,0,65535,65535,0,0,65535,0,0,65535,0] 6545; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[0,1],ymm1[0,1] 6546; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6547; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] 6548; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6549; AVX2-NEXT: vpblendvb %ymm13, %ymm2, %ymm0, %ymm4 6550; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] 6551; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] 6552; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm2 6553; AVX2-NEXT: vmovdqa %ymm3, %ymm5 6554; AVX2-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill 6555; AVX2-NEXT: vmovdqa {{.*#+}} xmm10 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u] 6556; AVX2-NEXT: vpshufb %xmm10, %xmm2, %xmm9 6557; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 6558; AVX2-NEXT: vmovdqa {{.*#+}} xmm11 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u] 6559; AVX2-NEXT: vpshufb %xmm11, %xmm3, %xmm12 6560; AVX2-NEXT: vpor %xmm9, %xmm12, %xmm9 6561; AVX2-NEXT: vpmovsxdq {{.*#+}} xmm12 = [18446744073709551615,16777215] 6562; AVX2-NEXT: vpblendvb %ymm12, %ymm9, %ymm0, %ymm0 6563; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6564; AVX2-NEXT: vmovdqa 224(%rdi), %ymm8 6565; AVX2-NEXT: vpblendvb %ymm1, %ymm7, %ymm8, %ymm14 6566; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6567; AVX2-NEXT: vpshufb %xmm10, %xmm14, %xmm0 6568; AVX2-NEXT: vextracti128 $1, %ymm14, %xmm15 6569; AVX2-NEXT: vpshufb %xmm11, %xmm15, %xmm10 6570; AVX2-NEXT: vpor %xmm0, %xmm10, %xmm1 6571; AVX2-NEXT: vmovdqa 288(%rdi), %ymm11 6572; AVX2-NEXT: vmovdqa 256(%rdi), %ymm0 6573; AVX2-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm0[0,1],ymm11[0,1] 6574; AVX2-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm0[2,3],ymm11[2,3] 6575; AVX2-NEXT: vpblendvb %ymm13, %ymm9, %ymm11, %ymm13 6576; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] 6577; AVX2-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0 6578; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6579; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u] 6580; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm1 6581; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u] 6582; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm3 6583; AVX2-NEXT: vpor %xmm1, %xmm3, %xmm1 6584; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11] 6585; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm4 6586; AVX2-NEXT: vpblendvb %ymm12, %ymm1, %ymm4, %ymm1 6587; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6588; AVX2-NEXT: vpshufb %xmm0, %xmm14, %xmm0 6589; AVX2-NEXT: vpshufb 
%xmm2, %xmm15, %xmm1 6590; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 6591; AVX2-NEXT: vpshufb %ymm3, %ymm13, %ymm1 6592; AVX2-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 6593; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6594; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] 6595; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 6596; AVX2-NEXT: vpblendvb %ymm13, %ymm10, %ymm5, %ymm1 6597; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 6598; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u] 6599; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm3 6600; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u] 6601; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm5 6602; AVX2-NEXT: vpor %xmm3, %xmm5, %xmm5 6603; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm14 = [2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12] 6604; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0] 6605; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6606; AVX2-NEXT: vpblendvb %ymm0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 6607; AVX2-NEXT: vpshufb %ymm14, %ymm3, %ymm15 6608; AVX2-NEXT: vpblendvb %ymm12, %ymm5, %ymm15, %ymm5 6609; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6610; AVX2-NEXT: vpblendvb %ymm13, %ymm8, %ymm7, %ymm5 6611; AVX2-NEXT: vmovdqa %ymm8, %ymm7 6612; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm15 6613; AVX2-NEXT: vpshufb %xmm6, %xmm15, %xmm6 6614; AVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm4 6615; AVX2-NEXT: vpor %xmm6, %xmm4, %xmm4 6616; AVX2-NEXT: vpblendvb %ymm0, %ymm11, %ymm9, %ymm0 6617; AVX2-NEXT: vpshufb %ymm14, %ymm0, %ymm6 6618; AVX2-NEXT: vpblendvb %ymm12, %ymm4, %ymm6, %ymm4 6619; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6620; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u] 6621; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 6622; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u] 6623; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm1 6624; AVX2-NEXT: vpor %xmm2, %xmm1, %xmm1 6625; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13] 6626; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm3 6627; AVX2-NEXT: vpblendvb %ymm12, %ymm1, %ymm3, %ymm1 6628; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6629; AVX2-NEXT: vpshufb %xmm4, %xmm15, %xmm1 6630; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm3 6631; AVX2-NEXT: vpor %xmm1, %xmm3, %xmm1 6632; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 6633; AVX2-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm14 6634; AVX2-NEXT: vmovdqa 160(%rdi), %ymm0 6635; AVX2-NEXT: vmovdqa 128(%rdi), %ymm3 6636; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] 6637; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm3, %ymm1 6638; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6639; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] 6640; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm5 6641; AVX2-NEXT: vpblendvb %ymm13, %ymm0, %ymm3, %ymm15 6642; AVX2-NEXT: vmovdqa 352(%rdi), %ymm4 6643; AVX2-NEXT: vmovdqa 320(%rdi), %ymm6 6644; AVX2-NEXT: vpblendvb %ymm1, %ymm6, %ymm4, %ymm1 6645; AVX2-NEXT: vpblendvb %ymm13, %ymm4, %ymm6, %ymm12 6646; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm6, %ymm0 6647; AVX2-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6648; AVX2-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload 6649; AVX2-NEXT: vpblendvb %ymm2, %ymm10, %ymm0, %ymm8 6650; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6651; AVX2-NEXT: vpblendvb %ymm2, %ymm7, %ymm0, %ymm10 6652; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,65535,65535,0,0,65535,0,0,65535,0] 6653; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6654; AVX2-NEXT: vpblendvb %ymm2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload 6655; AVX2-NEXT: vpblendvb %ymm2, %ymm11, %ymm9, %ymm0 6656; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6657; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128] 6658; AVX2-NEXT: vpshufb %xmm2, %xmm5, %xmm6 6659; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm11 6660; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,128,128,128,2,8,14,128,128,0,6,12] 6661; AVX2-NEXT: vpshufb %xmm0, %xmm11, %xmm9 6662; AVX2-NEXT: vpor %xmm6, %xmm9, %xmm6 6663; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 6664; AVX2-NEXT: vpmovsxwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9 6665; AVX2-NEXT: vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload 6666; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6667; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm2 6668; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm6 6669; AVX2-NEXT: vpshufb %xmm0, %xmm6, %xmm0 6670; AVX2-NEXT: vpor %xmm2, %xmm0, %xmm0 6671; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 6672; AVX2-NEXT: vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 6673; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6674; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,1,7,13,128,128,128,5,11,128,128,128] 6675; AVX2-NEXT: vpshufb %xmm0, %xmm5, %xmm2 6676; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,128,128,128,3,9,15,128,128,1,7,13] 6677; AVX2-NEXT: vpshufb %xmm5, %xmm11, %xmm11 6678; AVX2-NEXT: vpor %xmm2, %xmm11, %xmm2 6679; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 6680; AVX2-NEXT: vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 6681; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6682; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0 6683; AVX2-NEXT: vpshufb %xmm5, %xmm6, %xmm1 6684; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 6685; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 6686; AVX2-NEXT: vpblendvb %ymm9, %ymm14, %ymm0, %ymm0 6687; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill 6688; AVX2-NEXT: vextracti128 $1, %ymm15, %xmm14 6689; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14] 6690; AVX2-NEXT: vpshufb %xmm7, %xmm14, %xmm0 6691; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128] 6692; AVX2-NEXT: vpshufb %xmm2, %xmm15, %xmm1 6693; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm1 6694; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm3 6695; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,0,6,12,128,128,128,4,10,u,u,u,u,u,u] 6696; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm6 6697; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u] 6698; AVX2-NEXT: vpshufb %xmm4, %xmm8, %xmm11 6699; AVX2-NEXT: vpor %xmm6, %xmm11, %xmm6 6700; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm11 = [4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14] 6701; AVX2-NEXT: vpshufb %ymm11, %ymm13, %ymm0 6702; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm0[5,6,7] 6703; AVX2-NEXT: vpblendd {{.*#+}} ymm0 
= ymm6[0,1,2,3],ymm0[4,5,6,7] 6704; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 6705; AVX2-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0 6706; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6707; AVX2-NEXT: vextracti128 $1, %ymm12, %xmm1 6708; AVX2-NEXT: vpshufb %xmm7, %xmm1, %xmm0 6709; AVX2-NEXT: vpshufb %xmm2, %xmm12, %xmm2 6710; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 6711; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm2 6712; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm5 6713; AVX2-NEXT: vpshufb %xmm4, %xmm10, %xmm4 6714; AVX2-NEXT: vpor %xmm5, %xmm4, %xmm4 6715; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 6716; AVX2-NEXT: vpshufb %ymm11, %ymm6, %ymm5 6717; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7] 6718; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] 6719; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 6720; AVX2-NEXT: vpblendvb %ymm9, %ymm4, %ymm0, %ymm5 6721; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,128,128,128,5,11,128,128,128,3,9,15] 6722; AVX2-NEXT: vpshufb %xmm0, %xmm14, %xmm4 6723; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,3,9,15,128,128,1,7,13,128,128,128] 6724; AVX2-NEXT: vpshufb %xmm7, %xmm15, %xmm11 6725; AVX2-NEXT: vpor %xmm4, %xmm11, %xmm4 6726; AVX2-NEXT: vmovdqa {{.*#+}} xmm11 = [128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u] 6727; AVX2-NEXT: vpshufb %xmm11, %xmm3, %xmm3 6728; AVX2-NEXT: vmovdqa {{.*#+}} xmm14 = [5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u] 6729; AVX2-NEXT: vpshufb %xmm14, %xmm8, %xmm8 6730; AVX2-NEXT: vpor %xmm3, %xmm8, %xmm3 6731; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm8 = [5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15] 6732; AVX2-NEXT: vpshufb %ymm8, %ymm13, %ymm13 6733; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm13[5,6,7] 6734; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm13[4,5,6,7] 6735; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 6736; AVX2-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm4 6737; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0 6738; AVX2-NEXT: vpshufb %xmm7, %xmm12, %xmm1 6739; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 6740; AVX2-NEXT: vpshufb %xmm11, %xmm2, %xmm1 6741; AVX2-NEXT: vpshufb %xmm14, %xmm10, %xmm2 6742; AVX2-NEXT: vpor %xmm1, %xmm2, %xmm1 6743; AVX2-NEXT: vpshufb %ymm8, %ymm6, %ymm2 6744; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] 6745; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 6746; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 6747; AVX2-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm3 6748; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 6749; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm0 6750; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10] 6751; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm2 6752; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,4,10,128,128,128,2,8,14,128,128] 6753; AVX2-NEXT: vpshufb %xmm7, %xmm9, %xmm8 6754; AVX2-NEXT: vmovdqa %ymm9, %ymm10 6755; AVX2-NEXT: vpor %xmm2, %xmm8, %xmm2 6756; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 6757; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 6758; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15] 6759; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] 6760; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 6761; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm8 6762; AVX2-NEXT: vpshufb %xmm1, %xmm8, %xmm1 6763; AVX2-NEXT: vpshufb %xmm7, %xmm9, %xmm7 6764; AVX2-NEXT: vmovdqa %ymm9, 
%ymm11 6765; AVX2-NEXT: vpor %xmm1, %xmm7, %xmm1 6766; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 6767; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 6768; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3,4,5,6,7],ymm7[8,9,10],ymm1[11,12,13,14,15] 6769; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] 6770; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,128,128,1,7,13,128,128,128,5,11] 6771; AVX2-NEXT: vpshufb %xmm7, %xmm0, %xmm0 6772; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,u,u,u,5,11,128,128,128,3,9,15,128,128] 6773; AVX2-NEXT: vpshufb %xmm9, %xmm10, %xmm10 6774; AVX2-NEXT: vpor %xmm0, %xmm10, %xmm0 6775; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 6776; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 6777; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15] 6778; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] 6779; AVX2-NEXT: vpshufb %xmm7, %xmm8, %xmm7 6780; AVX2-NEXT: vpshufb %xmm9, %xmm11, %xmm8 6781; AVX2-NEXT: vpor %xmm7, %xmm8, %xmm7 6782; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 6783; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 6784; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7],ymm8[8,9,10],ymm7[11,12,13,14,15] 6785; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] 6786; AVX2-NEXT: vmovdqa %ymm1, 32(%rsi) 6787; AVX2-NEXT: vmovdqa %ymm2, (%rsi) 6788; AVX2-NEXT: vmovdqa %ymm7, 32(%rdx) 6789; AVX2-NEXT: vmovdqa %ymm0, (%rdx) 6790; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6791; AVX2-NEXT: vmovaps %ymm0, 32(%rcx) 6792; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6793; AVX2-NEXT: vmovaps %ymm0, (%rcx) 6794; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 6795; AVX2-NEXT: vmovaps %ymm0, 32(%r8) 6796; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6797; AVX2-NEXT: vmovaps %ymm0, (%r8) 6798; AVX2-NEXT: vmovdqa %ymm5, 32(%r9) 6799; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6800; AVX2-NEXT: vmovaps %ymm0, (%r9) 6801; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax 6802; AVX2-NEXT: vmovdqa %ymm3, 32(%rax) 6803; AVX2-NEXT: vmovdqa %ymm4, (%rax) 6804; AVX2-NEXT: addq $328, %rsp # imm = 0x148 6805; AVX2-NEXT: vzeroupper 6806; AVX2-NEXT: retq 6807; 6808; AVX2-FP-LABEL: load_i8_stride6_vf64: 6809; AVX2-FP: # %bb.0: 6810; AVX2-FP-NEXT: subq $328, %rsp # imm = 0x148 6811; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm7 6812; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm3 6813; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm5 6814; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6815; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm0 6816; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm1 6817; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,65535,0,0,65535,0,0,65535,65535,0,0,65535,0,0,65535,0] 6818; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[0,1],ymm1[0,1] 6819; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6820; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] 6821; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6822; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm2, %ymm0, %ymm4 6823; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] 6824; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] 6825; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm2 6826; AVX2-FP-NEXT: vmovdqa %ymm3, 
%ymm5 6827; AVX2-FP-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill 6828; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u] 6829; AVX2-FP-NEXT: vpshufb %xmm10, %xmm2, %xmm9 6830; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 6831; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm11 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u] 6832; AVX2-FP-NEXT: vpshufb %xmm11, %xmm3, %xmm12 6833; AVX2-FP-NEXT: vpor %xmm9, %xmm12, %xmm9 6834; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} xmm12 = [18446744073709551615,16777215] 6835; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm9, %ymm0, %ymm0 6836; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6837; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm8 6838; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm7, %ymm8, %ymm14 6839; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6840; AVX2-FP-NEXT: vpshufb %xmm10, %xmm14, %xmm0 6841; AVX2-FP-NEXT: vextracti128 $1, %ymm14, %xmm15 6842; AVX2-FP-NEXT: vpshufb %xmm11, %xmm15, %xmm10 6843; AVX2-FP-NEXT: vpor %xmm0, %xmm10, %xmm1 6844; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm11 6845; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm0 6846; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm0[0,1],ymm11[0,1] 6847; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm0[2,3],ymm11[2,3] 6848; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm9, %ymm11, %ymm13 6849; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] 6850; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0 6851; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6852; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u] 6853; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm1 6854; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u] 6855; AVX2-FP-NEXT: vpshufb %xmm2, %xmm3, %xmm3 6856; AVX2-FP-NEXT: vpor %xmm1, %xmm3, %xmm1 6857; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11] 6858; AVX2-FP-NEXT: vpshufb %ymm3, %ymm4, %ymm4 6859; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm1, %ymm4, %ymm1 6860; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6861; AVX2-FP-NEXT: vpshufb %xmm0, %xmm14, %xmm0 6862; AVX2-FP-NEXT: vpshufb %xmm2, %xmm15, %xmm1 6863; AVX2-FP-NEXT: vpor %xmm0, %xmm1, %xmm0 6864; AVX2-FP-NEXT: vpshufb %ymm3, %ymm13, %ymm1 6865; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 6866; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6867; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] 6868; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 6869; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm10, %ymm5, %ymm1 6870; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2 6871; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u] 6872; AVX2-FP-NEXT: vpshufb %xmm6, %xmm2, %xmm3 6873; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u] 6874; AVX2-FP-NEXT: vpshufb %xmm4, %xmm1, %xmm5 6875; AVX2-FP-NEXT: vpor %xmm3, %xmm5, %xmm5 6876; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm14 = [2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12] 6877; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0] 6878; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6879; AVX2-FP-NEXT: vpblendvb %ymm0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded 
Reload 6880; AVX2-FP-NEXT: vpshufb %ymm14, %ymm3, %ymm15 6881; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm5, %ymm15, %ymm5 6882; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6883; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm8, %ymm7, %ymm5 6884; AVX2-FP-NEXT: vmovdqa %ymm8, %ymm7 6885; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm15 6886; AVX2-FP-NEXT: vpshufb %xmm6, %xmm15, %xmm6 6887; AVX2-FP-NEXT: vpshufb %xmm4, %xmm5, %xmm4 6888; AVX2-FP-NEXT: vpor %xmm6, %xmm4, %xmm4 6889; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm11, %ymm9, %ymm0 6890; AVX2-FP-NEXT: vpshufb %ymm14, %ymm0, %ymm6 6891; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm4, %ymm6, %ymm4 6892; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6893; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u] 6894; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm2 6895; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u] 6896; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 6897; AVX2-FP-NEXT: vpor %xmm2, %xmm1, %xmm1 6898; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13] 6899; AVX2-FP-NEXT: vpshufb %ymm2, %ymm3, %ymm3 6900; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm1, %ymm3, %ymm1 6901; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6902; AVX2-FP-NEXT: vpshufb %xmm4, %xmm15, %xmm1 6903; AVX2-FP-NEXT: vpshufb %xmm6, %xmm5, %xmm3 6904; AVX2-FP-NEXT: vpor %xmm1, %xmm3, %xmm1 6905; AVX2-FP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 6906; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm14 6907; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm0 6908; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm3 6909; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] 6910; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm3, %ymm1 6911; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6912; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] 6913; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm5 6914; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm0, %ymm3, %ymm15 6915; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm4 6916; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm6 6917; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm6, %ymm4, %ymm1 6918; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm4, %ymm6, %ymm12 6919; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm4, %ymm6, %ymm0 6920; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6921; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload 6922; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm10, %ymm0, %ymm8 6923; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6924; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm7, %ymm0, %ymm10 6925; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,65535,65535,0,0,65535,0,0,65535,0] 6926; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6927; AVX2-FP-NEXT: vpblendvb %ymm2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload 6928; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm11, %ymm9, %ymm0 6929; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6930; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128] 6931; AVX2-FP-NEXT: vpshufb %xmm2, %xmm5, %xmm6 6932; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm11 6933; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,128,128,128,2,8,14,128,128,0,6,12] 6934; AVX2-FP-NEXT: vpshufb %xmm0, %xmm11, %xmm9 6935; AVX2-FP-NEXT: vpor %xmm6, %xmm9, %xmm6 6936; AVX2-FP-NEXT: 
vinserti128 $1, %xmm6, %ymm0, %ymm6 6937; AVX2-FP-NEXT: vpmovsxwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9 6938; AVX2-FP-NEXT: vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload 6939; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6940; AVX2-FP-NEXT: vpshufb %xmm2, %xmm1, %xmm2 6941; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm6 6942; AVX2-FP-NEXT: vpshufb %xmm0, %xmm6, %xmm0 6943; AVX2-FP-NEXT: vpor %xmm2, %xmm0, %xmm0 6944; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 6945; AVX2-FP-NEXT: vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 6946; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6947; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,1,7,13,128,128,128,5,11,128,128,128] 6948; AVX2-FP-NEXT: vpshufb %xmm0, %xmm5, %xmm2 6949; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,128,128,128,3,9,15,128,128,1,7,13] 6950; AVX2-FP-NEXT: vpshufb %xmm5, %xmm11, %xmm11 6951; AVX2-FP-NEXT: vpor %xmm2, %xmm11, %xmm2 6952; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 6953; AVX2-FP-NEXT: vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 6954; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6955; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 6956; AVX2-FP-NEXT: vpshufb %xmm5, %xmm6, %xmm1 6957; AVX2-FP-NEXT: vpor %xmm0, %xmm1, %xmm0 6958; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 6959; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm14, %ymm0, %ymm0 6960; AVX2-FP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill 6961; AVX2-FP-NEXT: vextracti128 $1, %ymm15, %xmm14 6962; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14] 6963; AVX2-FP-NEXT: vpshufb %xmm7, %xmm14, %xmm0 6964; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128] 6965; AVX2-FP-NEXT: vpshufb %xmm2, %xmm15, %xmm1 6966; AVX2-FP-NEXT: vpor %xmm0, %xmm1, %xmm1 6967; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm3 6968; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,0,6,12,128,128,128,4,10,u,u,u,u,u,u] 6969; AVX2-FP-NEXT: vpshufb %xmm5, %xmm3, %xmm6 6970; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u] 6971; AVX2-FP-NEXT: vpshufb %xmm4, %xmm8, %xmm11 6972; AVX2-FP-NEXT: vpor %xmm6, %xmm11, %xmm6 6973; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm11 = [4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14] 6974; AVX2-FP-NEXT: vpshufb %ymm11, %ymm13, %ymm0 6975; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm0[5,6,7] 6976; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] 6977; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 6978; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0 6979; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6980; AVX2-FP-NEXT: vextracti128 $1, %ymm12, %xmm1 6981; AVX2-FP-NEXT: vpshufb %xmm7, %xmm1, %xmm0 6982; AVX2-FP-NEXT: vpshufb %xmm2, %xmm12, %xmm2 6983; AVX2-FP-NEXT: vpor %xmm0, %xmm2, %xmm0 6984; AVX2-FP-NEXT: vextracti128 $1, %ymm10, %xmm2 6985; AVX2-FP-NEXT: vpshufb %xmm5, %xmm2, %xmm5 6986; AVX2-FP-NEXT: vpshufb %xmm4, %xmm10, %xmm4 6987; AVX2-FP-NEXT: vpor %xmm5, %xmm4, %xmm4 6988; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 6989; AVX2-FP-NEXT: vpshufb %ymm11, %ymm6, %ymm5 6990; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7] 6991; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] 6992; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 
6993; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm4, %ymm0, %ymm5 6994; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,128,128,128,5,11,128,128,128,3,9,15] 6995; AVX2-FP-NEXT: vpshufb %xmm0, %xmm14, %xmm4 6996; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,3,9,15,128,128,1,7,13,128,128,128] 6997; AVX2-FP-NEXT: vpshufb %xmm7, %xmm15, %xmm11 6998; AVX2-FP-NEXT: vpor %xmm4, %xmm11, %xmm4 6999; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm11 = [128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u] 7000; AVX2-FP-NEXT: vpshufb %xmm11, %xmm3, %xmm3 7001; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm14 = [5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u] 7002; AVX2-FP-NEXT: vpshufb %xmm14, %xmm8, %xmm8 7003; AVX2-FP-NEXT: vpor %xmm3, %xmm8, %xmm3 7004; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm8 = [5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15] 7005; AVX2-FP-NEXT: vpshufb %ymm8, %ymm13, %ymm13 7006; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm13[5,6,7] 7007; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm13[4,5,6,7] 7008; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 7009; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm4 7010; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 7011; AVX2-FP-NEXT: vpshufb %xmm7, %xmm12, %xmm1 7012; AVX2-FP-NEXT: vpor %xmm0, %xmm1, %xmm0 7013; AVX2-FP-NEXT: vpshufb %xmm11, %xmm2, %xmm1 7014; AVX2-FP-NEXT: vpshufb %xmm14, %xmm10, %xmm2 7015; AVX2-FP-NEXT: vpor %xmm1, %xmm2, %xmm1 7016; AVX2-FP-NEXT: vpshufb %ymm8, %ymm6, %ymm2 7017; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] 7018; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 7019; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 7020; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm3 7021; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 7022; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm0 7023; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10] 7024; AVX2-FP-NEXT: vpshufb %xmm1, %xmm0, %xmm2 7025; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,4,10,128,128,128,2,8,14,128,128] 7026; AVX2-FP-NEXT: vpshufb %xmm7, %xmm9, %xmm8 7027; AVX2-FP-NEXT: vmovdqa %ymm9, %ymm10 7028; AVX2-FP-NEXT: vpor %xmm2, %xmm8, %xmm2 7029; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 7030; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 7031; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15] 7032; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] 7033; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 7034; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm8 7035; AVX2-FP-NEXT: vpshufb %xmm1, %xmm8, %xmm1 7036; AVX2-FP-NEXT: vpshufb %xmm7, %xmm9, %xmm7 7037; AVX2-FP-NEXT: vmovdqa %ymm9, %ymm11 7038; AVX2-FP-NEXT: vpor %xmm1, %xmm7, %xmm1 7039; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 7040; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 7041; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3,4,5,6,7],ymm7[8,9,10],ymm1[11,12,13,14,15] 7042; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] 7043; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,128,128,1,7,13,128,128,128,5,11] 7044; AVX2-FP-NEXT: vpshufb %xmm7, %xmm0, %xmm0 7045; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,u,u,u,5,11,128,128,128,3,9,15,128,128] 7046; AVX2-FP-NEXT: vpshufb %xmm9, %xmm10, %xmm10 7047; AVX2-FP-NEXT: vpor %xmm0, %xmm10, %xmm0 7048; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 7049; 
AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 7050; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15] 7051; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] 7052; AVX2-FP-NEXT: vpshufb %xmm7, %xmm8, %xmm7 7053; AVX2-FP-NEXT: vpshufb %xmm9, %xmm11, %xmm8 7054; AVX2-FP-NEXT: vpor %xmm7, %xmm8, %xmm7 7055; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 7056; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 7057; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7],ymm8[8,9,10],ymm7[11,12,13,14,15] 7058; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] 7059; AVX2-FP-NEXT: vmovdqa %ymm1, 32(%rsi) 7060; AVX2-FP-NEXT: vmovdqa %ymm2, (%rsi) 7061; AVX2-FP-NEXT: vmovdqa %ymm7, 32(%rdx) 7062; AVX2-FP-NEXT: vmovdqa %ymm0, (%rdx) 7063; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7064; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rcx) 7065; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7066; AVX2-FP-NEXT: vmovaps %ymm0, (%rcx) 7067; AVX2-FP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 7068; AVX2-FP-NEXT: vmovaps %ymm0, 32(%r8) 7069; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7070; AVX2-FP-NEXT: vmovaps %ymm0, (%r8) 7071; AVX2-FP-NEXT: vmovdqa %ymm5, 32(%r9) 7072; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7073; AVX2-FP-NEXT: vmovaps %ymm0, (%r9) 7074; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax 7075; AVX2-FP-NEXT: vmovdqa %ymm3, 32(%rax) 7076; AVX2-FP-NEXT: vmovdqa %ymm4, (%rax) 7077; AVX2-FP-NEXT: addq $328, %rsp # imm = 0x148 7078; AVX2-FP-NEXT: vzeroupper 7079; AVX2-FP-NEXT: retq 7080; 7081; AVX2-FCP-LABEL: load_i8_stride6_vf64: 7082; AVX2-FCP: # %bb.0: 7083; AVX2-FCP-NEXT: subq $328, %rsp # imm = 0x148 7084; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm7 7085; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm3 7086; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 7087; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7088; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 7089; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 7090; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,65535,0,0,65535,0,0,65535,65535,0,0,65535,0,0,65535,0] 7091; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[0,1],ymm1[0,1] 7092; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7093; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] 7094; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7095; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm2, %ymm0, %ymm4 7096; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] 7097; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] 7098; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm2 7099; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm5 7100; AVX2-FCP-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill 7101; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u] 7102; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm9 7103; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 7104; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u] 7105; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm12 7106; AVX2-FCP-NEXT: vpor %xmm9, %xmm12, %xmm9 7107; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} xmm12 = [18446744073709551615,16777215] 7108; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm9, %ymm0, %ymm0 7109; 
AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7110; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm8 7111; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm7, %ymm8, %ymm14 7112; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7113; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm14, %xmm0 7114; AVX2-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 7115; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm15, %xmm10 7116; AVX2-FCP-NEXT: vpor %xmm0, %xmm10, %xmm1 7117; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm11 7118; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm0 7119; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm0[0,1],ymm11[0,1] 7120; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm0[2,3],ymm11[2,3] 7121; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm9, %ymm11, %ymm13 7122; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] 7123; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0 7124; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7125; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u] 7126; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm1 7127; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u] 7128; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm3 7129; AVX2-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1 7130; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11] 7131; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm4 7132; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm1, %ymm4, %ymm1 7133; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7134; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm14, %xmm0 7135; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm15, %xmm1 7136; AVX2-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 7137; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm13, %ymm1 7138; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 7139; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7140; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] 7141; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 7142; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm10, %ymm5, %ymm1 7143; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 7144; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u] 7145; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm3 7146; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u] 7147; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm5 7148; AVX2-FCP-NEXT: vpor %xmm3, %xmm5, %xmm5 7149; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm14 = [2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12] 7150; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0] 7151; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 7152; AVX2-FCP-NEXT: vpblendvb %ymm0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 7153; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm15 7154; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm5, %ymm15, %ymm5 7155; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7156; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm8, %ymm7, %ymm5 7157; AVX2-FCP-NEXT: vmovdqa %ymm8, %ymm7 7158; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm15 7159; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm15, %xmm6 7160; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm5, %xmm4 7161; AVX2-FCP-NEXT: vpor %xmm6, %xmm4, %xmm4 7162; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm11, %ymm9, 
%ymm0 7163; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm0, %ymm6 7164; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm4, %ymm6, %ymm4 7165; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7166; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u] 7167; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2 7168; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u] 7169; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 7170; AVX2-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 7171; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13] 7172; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm3 7173; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm1, %ymm3, %ymm1 7174; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7175; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm15, %xmm1 7176; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm3 7177; AVX2-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1 7178; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 7179; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm14 7180; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm0 7181; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm3 7182; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0] 7183; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm3, %ymm1 7184; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7185; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] 7186; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm5 7187; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm0, %ymm3, %ymm15 7188; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm4 7189; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm6 7190; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm6, %ymm4, %ymm1 7191; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm4, %ymm6, %ymm12 7192; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm4, %ymm6, %ymm0 7193; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7194; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload 7195; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm10, %ymm0, %ymm8 7196; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7197; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm7, %ymm0, %ymm10 7198; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,65535,65535,0,0,65535,0,0,65535,0] 7199; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7200; AVX2-FCP-NEXT: vpblendvb %ymm2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload 7201; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm11, %ymm9, %ymm0 7202; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7203; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128] 7204; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm6 7205; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm11 7206; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,128,128,128,2,8,14,128,128,0,6,12] 7207; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm11, %xmm9 7208; AVX2-FCP-NEXT: vpor %xmm6, %xmm9, %xmm6 7209; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 7210; AVX2-FCP-NEXT: vpmovsxwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9 7211; AVX2-FCP-NEXT: vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload 7212; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7213; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm2 7214; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6 7215; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm0 7216; AVX2-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0 7217; 
AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 7218; AVX2-FCP-NEXT: vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 7219; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7220; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,1,7,13,128,128,128,5,11,128,128,128] 7221; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm2 7222; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,128,128,128,3,9,15,128,128,1,7,13] 7223; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm11, %xmm11 7224; AVX2-FCP-NEXT: vpor %xmm2, %xmm11, %xmm2 7225; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 7226; AVX2-FCP-NEXT: vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 7227; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7228; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 7229; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm6, %xmm1 7230; AVX2-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 7231; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 7232; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm14, %ymm0, %ymm0 7233; AVX2-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill 7234; AVX2-FCP-NEXT: vextracti128 $1, %ymm15, %xmm14 7235; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14] 7236; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm14, %xmm0 7237; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128] 7238; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm15, %xmm1 7239; AVX2-FCP-NEXT: vpor %xmm0, %xmm1, %xmm1 7240; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm3 7241; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,0,6,12,128,128,128,4,10,u,u,u,u,u,u] 7242; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm6 7243; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u] 7244; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm8, %xmm11 7245; AVX2-FCP-NEXT: vpor %xmm6, %xmm11, %xmm6 7246; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm11 = [4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14] 7247; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm13, %ymm0 7248; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm0[5,6,7] 7249; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] 7250; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 7251; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0 7252; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7253; AVX2-FCP-NEXT: vextracti128 $1, %ymm12, %xmm1 7254; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm0 7255; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm12, %xmm2 7256; AVX2-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0 7257; AVX2-FCP-NEXT: vextracti128 $1, %ymm10, %xmm2 7258; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm5 7259; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm10, %xmm4 7260; AVX2-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 7261; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 7262; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm6, %ymm5 7263; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7] 7264; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] 7265; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 7266; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm4, %ymm0, %ymm5 7267; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,128,128,128,5,11,128,128,128,3,9,15] 7268; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm14, %xmm4 7269; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,3,9,15,128,128,1,7,13,128,128,128] 7270; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm15, %xmm11 7271; AVX2-FCP-NEXT: vpor %xmm4, %xmm11, %xmm4 7272; AVX2-FCP-NEXT: vmovdqa {{.*#+}} 
xmm11 = [128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u] 7273; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm3 7274; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u] 7275; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm8, %xmm8 7276; AVX2-FCP-NEXT: vpor %xmm3, %xmm8, %xmm3 7277; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm8 = [5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15] 7278; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm13, %ymm13 7279; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm13[5,6,7] 7280; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm13[4,5,6,7] 7281; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 7282; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm4 7283; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 7284; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm12, %xmm1 7285; AVX2-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 7286; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm1 7287; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm10, %xmm2 7288; AVX2-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 7289; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm6, %ymm2 7290; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] 7291; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 7292; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 7293; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm3 7294; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 7295; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm0 7296; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10] 7297; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm2 7298; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,4,10,128,128,128,2,8,14,128,128] 7299; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm8 7300; AVX2-FCP-NEXT: vmovdqa %ymm9, %ymm10 7301; AVX2-FCP-NEXT: vpor %xmm2, %xmm8, %xmm2 7302; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 7303; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 7304; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15] 7305; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] 7306; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 7307; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm8 7308; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm8, %xmm1 7309; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm7 7310; AVX2-FCP-NEXT: vmovdqa %ymm9, %ymm11 7311; AVX2-FCP-NEXT: vpor %xmm1, %xmm7, %xmm1 7312; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 7313; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 7314; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3,4,5,6,7],ymm7[8,9,10],ymm1[11,12,13,14,15] 7315; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] 7316; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,128,128,1,7,13,128,128,128,5,11] 7317; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm0 7318; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,u,u,u,5,11,128,128,128,3,9,15,128,128] 7319; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm10, %xmm10 7320; AVX2-FCP-NEXT: vpor %xmm0, %xmm10, %xmm0 7321; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 7322; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 7323; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15] 7324; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] 7325; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm7 7326; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm11, %xmm8 7327; 
AVX2-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 7328; AVX2-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 7329; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 7330; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7],ymm8[8,9,10],ymm7[11,12,13,14,15] 7331; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] 7332; AVX2-FCP-NEXT: vmovdqa %ymm1, 32(%rsi) 7333; AVX2-FCP-NEXT: vmovdqa %ymm2, (%rsi) 7334; AVX2-FCP-NEXT: vmovdqa %ymm7, 32(%rdx) 7335; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rdx) 7336; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7337; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rcx) 7338; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7339; AVX2-FCP-NEXT: vmovaps %ymm0, (%rcx) 7340; AVX2-FCP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 7341; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%r8) 7342; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7343; AVX2-FCP-NEXT: vmovaps %ymm0, (%r8) 7344; AVX2-FCP-NEXT: vmovdqa %ymm5, 32(%r9) 7345; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7346; AVX2-FCP-NEXT: vmovaps %ymm0, (%r9) 7347; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 7348; AVX2-FCP-NEXT: vmovdqa %ymm3, 32(%rax) 7349; AVX2-FCP-NEXT: vmovdqa %ymm4, (%rax) 7350; AVX2-FCP-NEXT: addq $328, %rsp # imm = 0x148 7351; AVX2-FCP-NEXT: vzeroupper 7352; AVX2-FCP-NEXT: retq 7353; 7354; AVX512-LABEL: load_i8_stride6_vf64: 7355; AVX512: # %bb.0: 7356; AVX512-NEXT: subq $40, %rsp 7357; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] 7358; AVX512-NEXT: vmovdqa64 224(%rdi), %ymm25 7359; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm26 7360; AVX512-NEXT: vmovdqa %ymm12, %ymm0 7361; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm25 ^ (ymm0 & (ymm26 ^ ymm25)) 7362; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u] 7363; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm3 7364; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u] 7365; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm4 7366; AVX512-NEXT: vpshufb %xmm5, %xmm4, %xmm6 7367; AVX512-NEXT: vpor %xmm3, %xmm6, %xmm9 7368; AVX512-NEXT: vmovdqa64 (%rdi), %ymm30 7369; AVX512-NEXT: vmovdqa64 32(%rdi), %ymm31 7370; AVX512-NEXT: vmovdqa64 128(%rdi), %ymm24 7371; AVX512-NEXT: vmovdqa64 160(%rdi), %ymm18 7372; AVX512-NEXT: vmovdqa %ymm12, %ymm6 7373; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm24 ^ (ymm6 & (ymm18 ^ ymm24)) 7374; AVX512-NEXT: vextracti128 $1, %ymm6, %xmm7 7375; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10] 7376; AVX512-NEXT: vpshufb %xmm3, %xmm7, %xmm10 7377; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,u,4,10,128,128,128,2,8,14,128,128] 7378; AVX512-NEXT: vpshufb %xmm8, %xmm6, %xmm13 7379; AVX512-NEXT: vpor %xmm10, %xmm13, %xmm10 7380; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 7381; AVX512-NEXT: vinserti32x4 $2, %xmm9, %zmm10, %zmm2 7382; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7383; AVX512-NEXT: vmovdqa %ymm12, %ymm9 7384; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm31 ^ (ymm9 & (ymm30 ^ ymm31)) 7385; AVX512-NEXT: vpshufb %xmm1, %xmm9, %xmm1 7386; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm13 7387; AVX512-NEXT: vpshufb %xmm5, %xmm13, %xmm5 7388; AVX512-NEXT: vporq %xmm1, %xmm5, %xmm17 7389; AVX512-NEXT: vmovdqa64 320(%rdi), %ymm29 7390; AVX512-NEXT: vmovdqa64 352(%rdi), %ymm22 7391; AVX512-NEXT: vmovdqa %ymm12, %ymm1 7392; AVX512-NEXT: 
vpternlogq {{.*#+}} ymm1 = ymm29 ^ (ymm1 & (ymm22 ^ ymm29)) 7393; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm5 7394; AVX512-NEXT: vpshufb %xmm3, %xmm5, %xmm3 7395; AVX512-NEXT: vpshufb %xmm8, %xmm1, %xmm8 7396; AVX512-NEXT: vpor %xmm3, %xmm8, %xmm3 7397; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 7398; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u] 7399; AVX512-NEXT: vpshufb %xmm8, %xmm0, %xmm0 7400; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = [128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u] 7401; AVX512-NEXT: vpshufb %xmm10, %xmm4, %xmm4 7402; AVX512-NEXT: vpor %xmm0, %xmm4, %xmm0 7403; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7404; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,u,128,128,1,7,13,128,128,128,5,11] 7405; AVX512-NEXT: vpshufb %xmm0, %xmm7, %xmm4 7406; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,5,11,128,128,128,3,9,15,128,128] 7407; AVX512-NEXT: vpshufb %xmm7, %xmm6, %xmm6 7408; AVX512-NEXT: vporq %xmm4, %xmm6, %xmm28 7409; AVX512-NEXT: vpshufb %xmm8, %xmm9, %xmm4 7410; AVX512-NEXT: vpshufb %xmm10, %xmm13, %xmm6 7411; AVX512-NEXT: vporq %xmm4, %xmm6, %xmm21 7412; AVX512-NEXT: vpshufb %xmm0, %xmm5, %xmm0 7413; AVX512-NEXT: vpshufb %xmm7, %xmm1, %xmm1 7414; AVX512-NEXT: vporq %xmm0, %xmm1, %xmm27 7415; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u] 7416; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] 7417; AVX512-NEXT: vmovdqa %ymm9, %ymm4 7418; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm26 ^ (ymm4 & (ymm25 ^ ymm26)) 7419; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm15 7420; AVX512-NEXT: vpshufb %xmm0, %xmm15, %xmm1 7421; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u] 7422; AVX512-NEXT: vpshufb %xmm6, %xmm4, %xmm5 7423; AVX512-NEXT: vpor %xmm1, %xmm5, %xmm1 7424; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7425; AVX512-NEXT: vmovdqa %ymm12, %ymm5 7426; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm18 ^ (ymm5 & (ymm24 ^ ymm18)) 7427; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128] 7428; AVX512-NEXT: vpshufb %xmm8, %xmm5, %xmm7 7429; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm1 7430; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,u,u,128,128,128,2,8,14,128,128,0,6,12] 7431; AVX512-NEXT: vpshufb %xmm10, %xmm1, %xmm13 7432; AVX512-NEXT: vpor %xmm7, %xmm13, %xmm2 7433; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7434; AVX512-NEXT: vmovdqa %ymm9, %ymm13 7435; AVX512-NEXT: vpternlogq {{.*#+}} ymm13 = ymm30 ^ (ymm13 & (ymm31 ^ ymm30)) 7436; AVX512-NEXT: vextracti128 $1, %ymm13, %xmm14 7437; AVX512-NEXT: vpshufb %xmm0, %xmm14, %xmm0 7438; AVX512-NEXT: vpshufb %xmm6, %xmm13, %xmm6 7439; AVX512-NEXT: vporq %xmm0, %xmm6, %xmm16 7440; AVX512-NEXT: vmovdqa %ymm12, %ymm11 7441; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm22 ^ (ymm11 & (ymm29 ^ ymm22)) 7442; AVX512-NEXT: vpshufb %xmm8, %xmm11, %xmm8 7443; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm7 7444; AVX512-NEXT: vpshufb %xmm10, %xmm7, %xmm10 7445; AVX512-NEXT: vpor %xmm8, %xmm10, %xmm0 7446; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7447; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = [128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u] 7448; AVX512-NEXT: vpshufb %xmm10, %xmm15, %xmm15 7449; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u] 7450; AVX512-NEXT: vpshufb %xmm8, %xmm4, %xmm4 7451; AVX512-NEXT: vpor 
%xmm4, %xmm15, %xmm0 7452; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7453; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero 7454; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,zero,xmm1[3,9,15],zero,zero,xmm1[1,7,13] 7455; AVX512-NEXT: vpor %xmm1, %xmm15, %xmm0 7456; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7457; AVX512-NEXT: vmovdqa 256(%rdi), %ymm1 7458; AVX512-NEXT: vshufi64x2 {{.*#+}} ymm19 = ymm1[2,3],mem[2,3] 7459; AVX512-NEXT: vinserti32x4 $1, 288(%rdi), %ymm1, %ymm20 7460; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0] 7461; AVX512-NEXT: vmovdqa %ymm5, %ymm1 7462; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm19 ^ (ymm1 & (ymm20 ^ ymm19)) 7463; AVX512-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] 7464; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm15[0,1,2],ymm3[3,4,5,6,7],ymm15[8,9,10],ymm3[11,12,13,14,15] 7465; AVX512-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm3[4,5,6,7] 7466; AVX512-NEXT: vmovdqa64 64(%rdi), %ymm23 7467; AVX512-NEXT: vshufi64x2 {{.*#+}} ymm6 = ymm23[2,3],mem[2,3] 7468; AVX512-NEXT: vinserti32x4 $1, 96(%rdi), %ymm23, %ymm23 7469; AVX512-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm15 7470; AVX512-NEXT: vmovdqa %ymm5, %ymm2 7471; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm6 ^ (ymm2 & (ymm23 ^ ymm6)) 7472; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] 7473; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 7474; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm17 & ymm4) 7475; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] 7476; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload 7477; AVX512-NEXT: # zmm0 = mem ^ (zmm17 & (zmm0 ^ mem)) 7478; AVX512-NEXT: vpmovsxdq {{.*#+}} zmm3 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,16777215,0,0] 7479; AVX512-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm3 & (zmm15 ^ zmm0)) 7480; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] 7481; AVX512-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm1 7482; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] 7483; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 7484; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 7485; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm21 & ymm4) 7486; AVX512-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm2 7487; AVX512-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload 7488; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm17 & (zmm1 ^ zmm2)) 7489; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm17 7490; AVX512-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (zmm3 & (zmm17 ^ zmm1)) 7491; AVX512-NEXT: vpshufb %xmm10, %xmm14, %xmm0 7492; AVX512-NEXT: vpshufb 
%xmm8, %xmm13, %xmm1 7493; AVX512-NEXT: vporq %xmm0, %xmm1, %xmm21 7494; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm11[u,u,u,u,u,1,7,13],zero,zero,zero,xmm11[5,11],zero,zero,zero 7495; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u],zero,zero,zero,xmm7[3,9,15],zero,zero,xmm7[1,7,13] 7496; AVX512-NEXT: vporq %xmm0, %xmm1, %xmm28 7497; AVX512-NEXT: vmovdqa64 %ymm25, %ymm11 7498; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm26 ^ (ymm12 & (ymm11 ^ ymm26)) 7499; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm0 7500; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm0[0,6,12],zero,zero,zero,xmm0[4,10,u,u,u,u,u,u] 7501; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u] 7502; AVX512-NEXT: vpshufb %xmm3, %xmm11, %xmm2 7503; AVX512-NEXT: vmovdqa64 %xmm3, %xmm25 7504; AVX512-NEXT: vporq %xmm1, %xmm2, %xmm26 7505; AVX512-NEXT: vmovdqa64 %ymm18, %ymm14 7506; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = ymm24 ^ (ymm9 & (ymm14 ^ ymm24)) 7507; AVX512-NEXT: vextracti128 $1, %ymm14, %xmm10 7508; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14] 7509; AVX512-NEXT: vpshufb %xmm1, %xmm10, %xmm2 7510; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128] 7511; AVX512-NEXT: vpshufb %xmm3, %xmm14, %xmm4 7512; AVX512-NEXT: vporq %xmm2, %xmm4, %xmm27 7513; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm30 ^ (ymm12 & (ymm31 ^ ymm30)) 7514; AVX512-NEXT: vmovdqa %ymm5, %ymm4 7515; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm6 ^ ymm23)) 7516; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm29 ^ (ymm9 & (ymm22 ^ ymm29)) 7517; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm8 7518; AVX512-NEXT: vpshufb %xmm1, %xmm8, %xmm1 7519; AVX512-NEXT: vpshufb %xmm3, %xmm9, %xmm2 7520; AVX512-NEXT: vpor %xmm1, %xmm2, %xmm7 7521; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u] 7522; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm0 7523; AVX512-NEXT: vmovdqa64 %xmm1, %xmm22 7524; AVX512-NEXT: vmovdqa {{.*#+}} xmm13 = [5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u] 7525; AVX512-NEXT: vpshufb %xmm13, %xmm11, %xmm1 7526; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm3 7527; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,128,128,128,5,11,128,128,128,3,9,15] 7528; AVX512-NEXT: vpshufb %xmm2, %xmm10, %xmm1 7529; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,3,9,15,128,128,1,7,13,128,128,128] 7530; AVX512-NEXT: vpshufb %xmm0, %xmm14, %xmm10 7531; AVX512-NEXT: vpor %xmm1, %xmm10, %xmm10 7532; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,4,10,0,6,12,18,24,30,20,26,128,128,128,128,128,128,128,128,128,128,128] 7533; AVX512-NEXT: vpshufb %ymm1, %ymm4, %ymm11 7534; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm18 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] 7535; AVX512-NEXT: vpternlogq {{.*#+}} ymm16 = (ymm16 & ymm18) | ymm11 7536; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,5,11,1,7,13,19,25,31,21,27,128,128,128,128,128,128,128,128,128,128,128] 7537; AVX512-NEXT: vpshufb %ymm11, %ymm4, %ymm4 7538; AVX512-NEXT: vpternlogq {{.*#+}} ymm21 = (ymm21 & ymm18) | ymm4 7539; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload 7540; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm20 ^ (ymm5 & (ymm19 ^ ymm20)) 7541; AVX512-NEXT: vpshufb %ymm1, %ymm5, %ymm1 7542; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm4 & ymm18) 7543; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4 7544; 
AVX512-NEXT: vpshufb %ymm11, %ymm5, %ymm5 7545; AVX512-NEXT: vextracti128 $1, %ymm12, %xmm1 7546; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[4,10,u,u,u,u,u,u] 7547; AVX512-NEXT: vmovdqa64 %xmm25, %xmm14 7548; AVX512-NEXT: vpshufb %xmm14, %xmm12, %xmm14 7549; AVX512-NEXT: vpor %xmm11, %xmm14, %xmm11 7550; AVX512-NEXT: vpshufb %xmm2, %xmm8, %xmm2 7551; AVX512-NEXT: vpshufb %xmm0, %xmm9, %xmm0 7552; AVX512-NEXT: vpor %xmm2, %xmm0, %xmm0 7553; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [0,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] 7554; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm23 ^ (ymm2 & (ymm6 ^ ymm23)) 7555; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] 7556; AVX512-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3,4],xmm8[5,6,7] 7557; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] 7558; AVX512-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm9 7559; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm9 & ymm18) 7560; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 7561; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm20 ^ (ymm2 & (ymm19 ^ ymm20)) 7562; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 7563; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 7564; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm2[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 7565; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm7 & ~ymm9) 7566; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm7 7567; AVX512-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm11 7568; AVX512-NEXT: vinserti32x4 $2, %xmm26, %zmm11, %zmm11 7569; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm11 ^ (zmm9 & (zmm8 ^ zmm11)) 7570; AVX512-NEXT: vpmovsxdq {{.*#+}} zmm11 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] 7571; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm11 & (zmm7 ^ zmm8)) 7572; AVX512-NEXT: vmovdqa64 %xmm22, %xmm8 7573; AVX512-NEXT: vpshufb %xmm8, %xmm1, %xmm1 7574; AVX512-NEXT: vpshufb %xmm13, %xmm12, %xmm8 7575; AVX512-NEXT: vpor %xmm1, %xmm8, %xmm1 7576; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] 7577; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5,6,7] 7578; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] 7579; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 7580; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 7581; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm0 & ~ymm9) 7582; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 7583; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm2 7584; AVX512-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm2 7585; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm9 & (zmm1 ^ zmm2)) 7586; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm11 & (zmm0 ^ zmm1)) 7587; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload 7588; AVX512-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload 7589; AVX512-NEXT: vpmovsxwd {{.*#+}} zmm2 = 
[0,0,0,0,0,4294967040,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295] 7590; AVX512-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm2 & (zmm16 ^ zmm1)) 7591; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload 7592; AVX512-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload 7593; AVX512-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm2 & (zmm21 ^ zmm1)) 7594; AVX512-NEXT: vpmovsxdq {{.*#+}} zmm1 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,16777215,0,0] 7595; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm1 & (zmm4 ^ zmm16)) 7596; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm1 & (zmm5 ^ zmm21)) 7597; AVX512-NEXT: vmovdqa64 %zmm15, (%rsi) 7598; AVX512-NEXT: vmovdqa64 %zmm17, (%rdx) 7599; AVX512-NEXT: vmovdqa64 %zmm4, (%rcx) 7600; AVX512-NEXT: vmovdqa64 %zmm5, (%r8) 7601; AVX512-NEXT: vmovdqa64 %zmm7, (%r9) 7602; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax 7603; AVX512-NEXT: vmovdqa64 %zmm0, (%rax) 7604; AVX512-NEXT: addq $40, %rsp 7605; AVX512-NEXT: vzeroupper 7606; AVX512-NEXT: retq 7607; 7608; AVX512-FCP-LABEL: load_i8_stride6_vf64: 7609; AVX512-FCP: # %bb.0: 7610; AVX512-FCP-NEXT: subq $40, %rsp 7611; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] 7612; AVX512-FCP-NEXT: vmovdqa64 224(%rdi), %ymm25 7613; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm26 7614; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm0 7615; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm25 ^ (ymm0 & (ymm26 ^ ymm25)) 7616; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u] 7617; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm3 7618; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u] 7619; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4 7620; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm6 7621; AVX512-FCP-NEXT: vpor %xmm3, %xmm6, %xmm9 7622; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm30 7623; AVX512-FCP-NEXT: vmovdqa64 32(%rdi), %ymm31 7624; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm24 7625; AVX512-FCP-NEXT: vmovdqa64 160(%rdi), %ymm18 7626; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm6 7627; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm24 ^ (ymm6 & (ymm18 ^ ymm24)) 7628; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 7629; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10] 7630; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm10 7631; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,u,4,10,128,128,128,2,8,14,128,128] 7632; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm13 7633; AVX512-FCP-NEXT: vpor %xmm10, %xmm13, %xmm10 7634; AVX512-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 7635; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm10, %zmm2 7636; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7637; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm9 7638; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm31 ^ (ymm9 & (ymm30 ^ ymm31)) 7639; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm1 7640; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm13 7641; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm13, %xmm5 7642; AVX512-FCP-NEXT: vporq %xmm1, %xmm5, %xmm17 7643; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %ymm29 7644; AVX512-FCP-NEXT: vmovdqa64 352(%rdi), %ymm22 7645; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm1 7646; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm29 ^ (ymm1 & (ymm22 ^ ymm29)) 
7647; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm5 7648; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm3 7649; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm8 7650; AVX512-FCP-NEXT: vpor %xmm3, %xmm8, %xmm3 7651; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 7652; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u] 7653; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm0 7654; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u] 7655; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm4 7656; AVX512-FCP-NEXT: vpor %xmm0, %xmm4, %xmm0 7657; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7658; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,u,128,128,1,7,13,128,128,128,5,11] 7659; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm4 7660; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,5,11,128,128,128,3,9,15,128,128] 7661; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm6 7662; AVX512-FCP-NEXT: vporq %xmm4, %xmm6, %xmm28 7663; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm9, %xmm4 7664; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm13, %xmm6 7665; AVX512-FCP-NEXT: vporq %xmm4, %xmm6, %xmm21 7666; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm0 7667; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm1 7668; AVX512-FCP-NEXT: vporq %xmm0, %xmm1, %xmm27 7669; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u] 7670; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] 7671; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm4 7672; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm26 ^ (ymm4 & (ymm25 ^ ymm26)) 7673; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm15 7674; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm1 7675; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u] 7676; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm5 7677; AVX512-FCP-NEXT: vpor %xmm1, %xmm5, %xmm1 7678; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7679; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm5 7680; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm18 ^ (ymm5 & (ymm24 ^ ymm18)) 7681; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128] 7682; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm7 7683; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm1 7684; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,u,u,128,128,128,2,8,14,128,128,0,6,12] 7685; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm13 7686; AVX512-FCP-NEXT: vpor %xmm7, %xmm13, %xmm2 7687; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7688; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm13 7689; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm30 ^ (ymm13 & (ymm31 ^ ymm30)) 7690; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14 7691; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm14, %xmm0 7692; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm13, %xmm6 7693; AVX512-FCP-NEXT: vporq %xmm0, %xmm6, %xmm16 7694; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm11 7695; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm22 ^ (ymm11 & (ymm29 ^ ymm22)) 7696; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm11, %xmm8 7697; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm7 7698; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm10 7699; AVX512-FCP-NEXT: vpor %xmm8, %xmm10, %xmm0 7700; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7701; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u] 7702; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm15, 
%xmm15 7703; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u] 7704; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm4, %xmm4 7705; AVX512-FCP-NEXT: vpor %xmm4, %xmm15, %xmm0 7706; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7707; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero 7708; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,zero,xmm1[3,9,15],zero,zero,xmm1[1,7,13] 7709; AVX512-FCP-NEXT: vpor %xmm1, %xmm15, %xmm0 7710; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7711; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm1 7712; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} ymm19 = ymm1[2,3],mem[2,3] 7713; AVX512-FCP-NEXT: vinserti32x4 $1, 288(%rdi), %ymm1, %ymm20 7714; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0] 7715; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm1 7716; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm19 ^ (ymm1 & (ymm20 ^ ymm19)) 7717; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] 7718; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm15[0,1,2],ymm3[3,4,5,6,7],ymm15[8,9,10],ymm3[11,12,13,14,15] 7719; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm3[4,5,6,7] 7720; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm23 7721; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} ymm6 = ymm23[2,3],mem[2,3] 7722; AVX512-FCP-NEXT: vinserti32x4 $1, 96(%rdi), %ymm23, %ymm23 7723; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm15 7724; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm2 7725; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm6 ^ (ymm2 & (ymm23 ^ ymm6)) 7726; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] 7727; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 7728; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm17 & ymm4) 7729; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] 7730; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload 7731; AVX512-FCP-NEXT: # zmm0 = mem ^ (zmm17 & (zmm0 ^ mem)) 7732; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm3 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,16777215,0,0] 7733; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm3 & (zmm15 ^ zmm0)) 7734; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] 7735; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm1 7736; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] 7737; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 7738; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 7739; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm21 & ymm4) 7740; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm2 7741; AVX512-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, 
%zmm2 # 16-byte Folded Reload 7742; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm17 & (zmm1 ^ zmm2)) 7743; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm17 7744; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (zmm3 & (zmm17 ^ zmm1)) 7745; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm14, %xmm0 7746; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm13, %xmm1 7747; AVX512-FCP-NEXT: vporq %xmm0, %xmm1, %xmm21 7748; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm11[u,u,u,u,u,1,7,13],zero,zero,zero,xmm11[5,11],zero,zero,zero 7749; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u],zero,zero,zero,xmm7[3,9,15],zero,zero,xmm7[1,7,13] 7750; AVX512-FCP-NEXT: vporq %xmm0, %xmm1, %xmm28 7751; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm11 7752; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm26 ^ (ymm12 & (ymm11 ^ ymm26)) 7753; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm0 7754; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm0[0,6,12],zero,zero,zero,xmm0[4,10,u,u,u,u,u,u] 7755; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u] 7756; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm11, %xmm2 7757; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm25 7758; AVX512-FCP-NEXT: vporq %xmm1, %xmm2, %xmm26 7759; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm14 7760; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm24 ^ (ymm9 & (ymm14 ^ ymm24)) 7761; AVX512-FCP-NEXT: vextracti128 $1, %ymm14, %xmm10 7762; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14] 7763; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm2 7764; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128] 7765; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm14, %xmm4 7766; AVX512-FCP-NEXT: vporq %xmm2, %xmm4, %xmm27 7767; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm30 ^ (ymm12 & (ymm31 ^ ymm30)) 7768; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm4 7769; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm6 ^ ymm23)) 7770; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm29 ^ (ymm9 & (ymm22 ^ ymm29)) 7771; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm8 7772; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm8, %xmm1 7773; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm2 7774; AVX512-FCP-NEXT: vpor %xmm1, %xmm2, %xmm7 7775; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u] 7776; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 7777; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm22 7778; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u] 7779; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm11, %xmm1 7780; AVX512-FCP-NEXT: vpor %xmm0, %xmm1, %xmm3 7781; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,128,128,128,5,11,128,128,128,3,9,15] 7782; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm10, %xmm1 7783; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,3,9,15,128,128,1,7,13,128,128,128] 7784; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm14, %xmm10 7785; AVX512-FCP-NEXT: vpor %xmm1, %xmm10, %xmm10 7786; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,4,10,0,6,12,18,24,30,20,26,128,128,128,128,128,128,128,128,128,128,128] 7787; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm11 7788; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] 7789; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm16 = (ymm16 & ymm18) | ymm11 7790; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = 
[128,128,128,128,128,128,128,128,128,128,128,5,11,1,7,13,19,25,31,21,27,128,128,128,128,128,128,128,128,128,128,128] 7791; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4 7792; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm21 = (ymm21 & ymm18) | ymm4 7793; AVX512-FCP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload 7794; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm20 ^ (ymm5 & (ymm19 ^ ymm20)) 7795; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm5, %ymm1 7796; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm4 & ymm18) 7797; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4 7798; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm5 7799; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm1 7800; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[4,10,u,u,u,u,u,u] 7801; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm14 7802; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm14 7803; AVX512-FCP-NEXT: vpor %xmm11, %xmm14, %xmm11 7804; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm2 7805; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm9, %xmm0 7806; AVX512-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0 7807; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] 7808; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm23 ^ (ymm2 & (ymm6 ^ ymm23)) 7809; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] 7810; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3,4],xmm8[5,6,7] 7811; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] 7812; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm9 7813; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm9 & ymm18) 7814; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 7815; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm20 ^ (ymm2 & (ymm19 ^ ymm20)) 7816; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 7817; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 7818; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm2[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 7819; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm7 & ~ymm9) 7820; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm7 7821; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm11 7822; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm26, %zmm11, %zmm11 7823; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm11 ^ (zmm9 & (zmm8 ^ zmm11)) 7824; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm11 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] 7825; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm11 & (zmm7 ^ zmm8)) 7826; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm8 7827; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm1 7828; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm12, %xmm8 7829; AVX512-FCP-NEXT: vpor %xmm1, %xmm8, %xmm1 7830; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] 7831; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5,6,7] 7832; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] 7833; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 7834; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = 
ymm2[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 7835; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm0 & ~ymm9) 7836; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 7837; AVX512-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm2 7838; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm2 7839; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm9 & (zmm1 ^ zmm2)) 7840; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm11 & (zmm0 ^ zmm1)) 7841; AVX512-FCP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload 7842; AVX512-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload 7843; AVX512-FCP-NEXT: vpmovsxwd {{.*#+}} zmm2 = [0,0,0,0,0,4294967040,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295] 7844; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm2 & (zmm16 ^ zmm1)) 7845; AVX512-FCP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload 7846; AVX512-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload 7847; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm2 & (zmm21 ^ zmm1)) 7848; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm1 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,16777215,0,0] 7849; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm1 & (zmm4 ^ zmm16)) 7850; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm1 & (zmm5 ^ zmm21)) 7851; AVX512-FCP-NEXT: vmovdqa64 %zmm15, (%rsi) 7852; AVX512-FCP-NEXT: vmovdqa64 %zmm17, (%rdx) 7853; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rcx) 7854; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%r8) 7855; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%r9) 7856; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 7857; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax) 7858; AVX512-FCP-NEXT: addq $40, %rsp 7859; AVX512-FCP-NEXT: vzeroupper 7860; AVX512-FCP-NEXT: retq 7861; 7862; AVX512DQ-LABEL: load_i8_stride6_vf64: 7863; AVX512DQ: # %bb.0: 7864; AVX512DQ-NEXT: subq $40, %rsp 7865; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] 7866; AVX512DQ-NEXT: vmovdqa64 224(%rdi), %ymm25 7867; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm26 7868; AVX512DQ-NEXT: vmovdqa %ymm12, %ymm0 7869; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm25 ^ (ymm0 & (ymm26 ^ ymm25)) 7870; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u] 7871; AVX512DQ-NEXT: vpshufb %xmm1, %xmm0, %xmm3 7872; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u] 7873; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm4 7874; AVX512DQ-NEXT: vpshufb %xmm5, %xmm4, %xmm6 7875; AVX512DQ-NEXT: vpor %xmm3, %xmm6, %xmm9 7876; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm30 7877; AVX512DQ-NEXT: vmovdqa64 32(%rdi), %ymm31 7878; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %ymm24 7879; AVX512DQ-NEXT: vmovdqa64 160(%rdi), %ymm18 7880; AVX512DQ-NEXT: vmovdqa %ymm12, %ymm6 7881; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm24 ^ (ymm6 & (ymm18 ^ ymm24)) 7882; AVX512DQ-NEXT: vextracti128 $1, %ymm6, %xmm7 7883; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10] 7884; AVX512DQ-NEXT: vpshufb %xmm3, %xmm7, %xmm10 7885; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,u,4,10,128,128,128,2,8,14,128,128] 7886; AVX512DQ-NEXT: vpshufb %xmm8, %xmm6, %xmm13 7887; AVX512DQ-NEXT: 
vpor %xmm10, %xmm13, %xmm10 7888; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 7889; AVX512DQ-NEXT: vinserti32x4 $2, %xmm9, %zmm10, %zmm2 7890; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7891; AVX512DQ-NEXT: vmovdqa %ymm12, %ymm9 7892; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm31 ^ (ymm9 & (ymm30 ^ ymm31)) 7893; AVX512DQ-NEXT: vpshufb %xmm1, %xmm9, %xmm1 7894; AVX512DQ-NEXT: vextracti128 $1, %ymm9, %xmm13 7895; AVX512DQ-NEXT: vpshufb %xmm5, %xmm13, %xmm5 7896; AVX512DQ-NEXT: vporq %xmm1, %xmm5, %xmm17 7897; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %ymm29 7898; AVX512DQ-NEXT: vmovdqa64 352(%rdi), %ymm22 7899; AVX512DQ-NEXT: vmovdqa %ymm12, %ymm1 7900; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm29 ^ (ymm1 & (ymm22 ^ ymm29)) 7901; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm5 7902; AVX512DQ-NEXT: vpshufb %xmm3, %xmm5, %xmm3 7903; AVX512DQ-NEXT: vpshufb %xmm8, %xmm1, %xmm8 7904; AVX512DQ-NEXT: vpor %xmm3, %xmm8, %xmm3 7905; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 7906; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm8 = [1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u] 7907; AVX512DQ-NEXT: vpshufb %xmm8, %xmm0, %xmm0 7908; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm10 = [128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u] 7909; AVX512DQ-NEXT: vpshufb %xmm10, %xmm4, %xmm4 7910; AVX512DQ-NEXT: vpor %xmm0, %xmm4, %xmm0 7911; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7912; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,u,128,128,1,7,13,128,128,128,5,11] 7913; AVX512DQ-NEXT: vpshufb %xmm0, %xmm7, %xmm4 7914; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,5,11,128,128,128,3,9,15,128,128] 7915; AVX512DQ-NEXT: vpshufb %xmm7, %xmm6, %xmm6 7916; AVX512DQ-NEXT: vporq %xmm4, %xmm6, %xmm28 7917; AVX512DQ-NEXT: vpshufb %xmm8, %xmm9, %xmm4 7918; AVX512DQ-NEXT: vpshufb %xmm10, %xmm13, %xmm6 7919; AVX512DQ-NEXT: vporq %xmm4, %xmm6, %xmm21 7920; AVX512DQ-NEXT: vpshufb %xmm0, %xmm5, %xmm0 7921; AVX512DQ-NEXT: vpshufb %xmm7, %xmm1, %xmm1 7922; AVX512DQ-NEXT: vporq %xmm0, %xmm1, %xmm27 7923; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u] 7924; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] 7925; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm4 7926; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm26 ^ (ymm4 & (ymm25 ^ ymm26)) 7927; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm15 7928; AVX512DQ-NEXT: vpshufb %xmm0, %xmm15, %xmm1 7929; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u] 7930; AVX512DQ-NEXT: vpshufb %xmm6, %xmm4, %xmm5 7931; AVX512DQ-NEXT: vpor %xmm1, %xmm5, %xmm1 7932; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7933; AVX512DQ-NEXT: vmovdqa %ymm12, %ymm5 7934; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm18 ^ (ymm5 & (ymm24 ^ ymm18)) 7935; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128] 7936; AVX512DQ-NEXT: vpshufb %xmm8, %xmm5, %xmm7 7937; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm1 7938; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,u,u,128,128,128,2,8,14,128,128,0,6,12] 7939; AVX512DQ-NEXT: vpshufb %xmm10, %xmm1, %xmm13 7940; AVX512DQ-NEXT: vpor %xmm7, %xmm13, %xmm2 7941; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7942; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm13 7943; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm13 = ymm30 ^ (ymm13 & (ymm31 ^ ymm30)) 7944; AVX512DQ-NEXT: vextracti128 $1, %ymm13, %xmm14 7945; AVX512DQ-NEXT: vpshufb %xmm0, 
%xmm14, %xmm0 7946; AVX512DQ-NEXT: vpshufb %xmm6, %xmm13, %xmm6 7947; AVX512DQ-NEXT: vporq %xmm0, %xmm6, %xmm16 7948; AVX512DQ-NEXT: vmovdqa %ymm12, %ymm11 7949; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm22 ^ (ymm11 & (ymm29 ^ ymm22)) 7950; AVX512DQ-NEXT: vpshufb %xmm8, %xmm11, %xmm8 7951; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm7 7952; AVX512DQ-NEXT: vpshufb %xmm10, %xmm7, %xmm10 7953; AVX512DQ-NEXT: vpor %xmm8, %xmm10, %xmm0 7954; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7955; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm10 = [128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u] 7956; AVX512DQ-NEXT: vpshufb %xmm10, %xmm15, %xmm15 7957; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm8 = [3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u] 7958; AVX512DQ-NEXT: vpshufb %xmm8, %xmm4, %xmm4 7959; AVX512DQ-NEXT: vpor %xmm4, %xmm15, %xmm0 7960; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7961; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero 7962; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,zero,xmm1[3,9,15],zero,zero,xmm1[1,7,13] 7963; AVX512DQ-NEXT: vpor %xmm1, %xmm15, %xmm0 7964; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7965; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm1 7966; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} ymm19 = ymm1[2,3],mem[2,3] 7967; AVX512DQ-NEXT: vinserti32x4 $1, 288(%rdi), %ymm1, %ymm20 7968; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0] 7969; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm1 7970; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm19 ^ (ymm1 & (ymm20 ^ ymm19)) 7971; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] 7972; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm15[0,1,2],ymm3[3,4,5,6,7],ymm15[8,9,10],ymm3[11,12,13,14,15] 7973; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm3[4,5,6,7] 7974; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm23 7975; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} ymm6 = ymm23[2,3],mem[2,3] 7976; AVX512DQ-NEXT: vinserti32x4 $1, 96(%rdi), %ymm23, %ymm23 7977; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm15 7978; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm2 7979; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm6 ^ (ymm2 & (ymm23 ^ ymm6)) 7980; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] 7981; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 7982; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm17 & ymm4) 7983; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] 7984; AVX512DQ-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload 7985; AVX512DQ-NEXT: # zmm0 = mem ^ (zmm17 & (zmm0 ^ mem)) 7986; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} zmm3 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,16777215,0,0] 7987; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm3 & (zmm15 ^ zmm0)) 7988; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] 7989; AVX512DQ-NEXT: vinserti32x4 $1, %xmm27, 
%ymm0, %ymm1 7990; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] 7991; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 7992; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 7993; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm21 & ymm4) 7994; AVX512DQ-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm2 7995; AVX512DQ-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload 7996; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm17 & (zmm1 ^ zmm2)) 7997; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm17 7998; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (zmm3 & (zmm17 ^ zmm1)) 7999; AVX512DQ-NEXT: vpshufb %xmm10, %xmm14, %xmm0 8000; AVX512DQ-NEXT: vpshufb %xmm8, %xmm13, %xmm1 8001; AVX512DQ-NEXT: vporq %xmm0, %xmm1, %xmm21 8002; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm11[u,u,u,u,u,1,7,13],zero,zero,zero,xmm11[5,11],zero,zero,zero 8003; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u],zero,zero,zero,xmm7[3,9,15],zero,zero,xmm7[1,7,13] 8004; AVX512DQ-NEXT: vporq %xmm0, %xmm1, %xmm28 8005; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm11 8006; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm26 ^ (ymm12 & (ymm11 ^ ymm26)) 8007; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm0 8008; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm0[0,6,12],zero,zero,zero,xmm0[4,10,u,u,u,u,u,u] 8009; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u] 8010; AVX512DQ-NEXT: vpshufb %xmm3, %xmm11, %xmm2 8011; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm25 8012; AVX512DQ-NEXT: vporq %xmm1, %xmm2, %xmm26 8013; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm14 8014; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = ymm24 ^ (ymm9 & (ymm14 ^ ymm24)) 8015; AVX512DQ-NEXT: vextracti128 $1, %ymm14, %xmm10 8016; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14] 8017; AVX512DQ-NEXT: vpshufb %xmm1, %xmm10, %xmm2 8018; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128] 8019; AVX512DQ-NEXT: vpshufb %xmm3, %xmm14, %xmm4 8020; AVX512DQ-NEXT: vporq %xmm2, %xmm4, %xmm27 8021; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm30 ^ (ymm12 & (ymm31 ^ ymm30)) 8022; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm4 8023; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm6 ^ ymm23)) 8024; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm29 ^ (ymm9 & (ymm22 ^ ymm29)) 8025; AVX512DQ-NEXT: vextracti128 $1, %ymm9, %xmm8 8026; AVX512DQ-NEXT: vpshufb %xmm1, %xmm8, %xmm1 8027; AVX512DQ-NEXT: vpshufb %xmm3, %xmm9, %xmm2 8028; AVX512DQ-NEXT: vpor %xmm1, %xmm2, %xmm7 8029; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u] 8030; AVX512DQ-NEXT: vpshufb %xmm1, %xmm0, %xmm0 8031; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm22 8032; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm13 = [5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u] 8033; AVX512DQ-NEXT: vpshufb %xmm13, %xmm11, %xmm1 8034; AVX512DQ-NEXT: vpor %xmm0, %xmm1, %xmm3 8035; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,128,128,128,5,11,128,128,128,3,9,15] 8036; AVX512DQ-NEXT: vpshufb %xmm2, %xmm10, %xmm1 8037; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,3,9,15,128,128,1,7,13,128,128,128] 8038; AVX512DQ-NEXT: vpshufb %xmm0, %xmm14, %xmm10 8039; AVX512DQ-NEXT: vpor %xmm1, %xmm10, %xmm10 8040; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = 
[128,128,128,128,128,128,128,128,128,128,128,4,10,0,6,12,18,24,30,20,26,128,128,128,128,128,128,128,128,128,128,128] 8041; AVX512DQ-NEXT: vpshufb %ymm1, %ymm4, %ymm11 8042; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm18 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] 8043; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm16 = (ymm16 & ymm18) | ymm11 8044; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,5,11,1,7,13,19,25,31,21,27,128,128,128,128,128,128,128,128,128,128,128] 8045; AVX512DQ-NEXT: vpshufb %ymm11, %ymm4, %ymm4 8046; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm21 = (ymm21 & ymm18) | ymm4 8047; AVX512DQ-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload 8048; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm20 ^ (ymm5 & (ymm19 ^ ymm20)) 8049; AVX512DQ-NEXT: vpshufb %ymm1, %ymm5, %ymm1 8050; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm4 & ymm18) 8051; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4 8052; AVX512DQ-NEXT: vpshufb %ymm11, %ymm5, %ymm5 8053; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm1 8054; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[4,10,u,u,u,u,u,u] 8055; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm14 8056; AVX512DQ-NEXT: vpshufb %xmm14, %xmm12, %xmm14 8057; AVX512DQ-NEXT: vpor %xmm11, %xmm14, %xmm11 8058; AVX512DQ-NEXT: vpshufb %xmm2, %xmm8, %xmm2 8059; AVX512DQ-NEXT: vpshufb %xmm0, %xmm9, %xmm0 8060; AVX512DQ-NEXT: vpor %xmm2, %xmm0, %xmm0 8061; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [0,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] 8062; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm23 ^ (ymm2 & (ymm6 ^ ymm23)) 8063; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] 8064; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3,4],xmm8[5,6,7] 8065; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] 8066; AVX512DQ-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm9 8067; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm9 & ymm18) 8068; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 8069; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm20 ^ (ymm2 & (ymm19 ^ ymm20)) 8070; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 8071; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 8072; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm2[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 8073; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm7 & ~ymm9) 8074; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm7 8075; AVX512DQ-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm11 8076; AVX512DQ-NEXT: vinserti32x4 $2, %xmm26, %zmm11, %zmm11 8077; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm11 ^ (zmm9 & (zmm8 ^ zmm11)) 8078; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} zmm11 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] 8079; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm11 & (zmm7 ^ zmm8)) 8080; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm8 8081; AVX512DQ-NEXT: vpshufb %xmm8, %xmm1, %xmm1 8082; AVX512DQ-NEXT: vpshufb %xmm13, %xmm12, %xmm8 8083; AVX512DQ-NEXT: vpor %xmm1, %xmm8, %xmm1 8084; AVX512DQ-NEXT: vpshufb 
{{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] 8085; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5,6,7] 8086; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] 8087; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 8088; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 8089; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm0 & ~ymm9) 8090; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 8091; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm2 8092; AVX512DQ-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm2 8093; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm9 & (zmm1 ^ zmm2)) 8094; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm11 & (zmm0 ^ zmm1)) 8095; AVX512DQ-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload 8096; AVX512DQ-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload 8097; AVX512DQ-NEXT: vpmovsxwd {{.*#+}} zmm2 = [0,0,0,0,0,4294967040,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295] 8098; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm2 & (zmm16 ^ zmm1)) 8099; AVX512DQ-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload 8100; AVX512DQ-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload 8101; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm2 & (zmm21 ^ zmm1)) 8102; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} zmm1 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,16777215,0,0] 8103; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm1 & (zmm4 ^ zmm16)) 8104; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm1 & (zmm5 ^ zmm21)) 8105; AVX512DQ-NEXT: vmovdqa64 %zmm15, (%rsi) 8106; AVX512DQ-NEXT: vmovdqa64 %zmm17, (%rdx) 8107; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%rcx) 8108; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%r8) 8109; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%r9) 8110; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax 8111; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rax) 8112; AVX512DQ-NEXT: addq $40, %rsp 8113; AVX512DQ-NEXT: vzeroupper 8114; AVX512DQ-NEXT: retq 8115; 8116; AVX512DQ-FCP-LABEL: load_i8_stride6_vf64: 8117; AVX512DQ-FCP: # %bb.0: 8118; AVX512DQ-FCP-NEXT: subq $40, %rsp 8119; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] 8120; AVX512DQ-FCP-NEXT: vmovdqa64 224(%rdi), %ymm25 8121; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm26 8122; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm0 8123; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm25 ^ (ymm0 & (ymm26 ^ ymm25)) 8124; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u] 8125; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm3 8126; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u] 8127; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4 8128; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm6 8129; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm6, %xmm9 8130; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm30 8131; AVX512DQ-FCP-NEXT: vmovdqa64 32(%rdi), %ymm31 8132; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm24 8133; AVX512DQ-FCP-NEXT: vmovdqa64 160(%rdi), %ymm18 8134; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm6 8135; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm24 ^ (ymm6 & 
(ymm18 ^ ymm24)) 8136; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 8137; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10] 8138; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm10 8139; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,u,4,10,128,128,128,2,8,14,128,128] 8140; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm13 8141; AVX512DQ-FCP-NEXT: vpor %xmm10, %xmm13, %xmm10 8142; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 8143; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm10, %zmm2 8144; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8145; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm9 8146; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm31 ^ (ymm9 & (ymm30 ^ ymm31)) 8147; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm1 8148; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm13 8149; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm13, %xmm5 8150; AVX512DQ-FCP-NEXT: vporq %xmm1, %xmm5, %xmm17 8151; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %ymm29 8152; AVX512DQ-FCP-NEXT: vmovdqa64 352(%rdi), %ymm22 8153; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm1 8154; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm29 ^ (ymm1 & (ymm22 ^ ymm29)) 8155; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm5 8156; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm3 8157; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm8 8158; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm8, %xmm3 8159; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 8160; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u] 8161; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm0 8162; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u] 8163; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm4 8164; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm4, %xmm0 8165; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8166; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,u,128,128,1,7,13,128,128,128,5,11] 8167; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm4 8168; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,5,11,128,128,128,3,9,15,128,128] 8169; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm6 8170; AVX512DQ-FCP-NEXT: vporq %xmm4, %xmm6, %xmm28 8171; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm9, %xmm4 8172; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm13, %xmm6 8173; AVX512DQ-FCP-NEXT: vporq %xmm4, %xmm6, %xmm21 8174; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm0 8175; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm1 8176; AVX512DQ-FCP-NEXT: vporq %xmm0, %xmm1, %xmm27 8177; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u] 8178; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] 8179; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm4 8180; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm26 ^ (ymm4 & (ymm25 ^ ymm26)) 8181; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm15 8182; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm1 8183; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u] 8184; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm5 8185; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm5, %xmm1 8186; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8187; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm5 8188; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm18 ^ (ymm5 & (ymm24 ^ ymm18)) 8189; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = 
[u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128] 8190; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm7 8191; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm1 8192; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,u,u,128,128,128,2,8,14,128,128,0,6,12] 8193; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm13 8194; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm13, %xmm2 8195; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8196; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm13 8197; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm30 ^ (ymm13 & (ymm31 ^ ymm30)) 8198; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14 8199; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm14, %xmm0 8200; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm13, %xmm6 8201; AVX512DQ-FCP-NEXT: vporq %xmm0, %xmm6, %xmm16 8202; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm11 8203; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm22 ^ (ymm11 & (ymm29 ^ ymm22)) 8204; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm11, %xmm8 8205; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm7 8206; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm10 8207; AVX512DQ-FCP-NEXT: vpor %xmm8, %xmm10, %xmm0 8208; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8209; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u] 8210; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm15, %xmm15 8211; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u] 8212; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm4, %xmm4 8213; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm15, %xmm0 8214; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8215; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero 8216; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,zero,xmm1[3,9,15],zero,zero,xmm1[1,7,13] 8217; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm15, %xmm0 8218; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8219; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm1 8220; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} ymm19 = ymm1[2,3],mem[2,3] 8221; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 288(%rdi), %ymm1, %ymm20 8222; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0] 8223; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm1 8224; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm19 ^ (ymm1 & (ymm20 ^ ymm19)) 8225; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] 8226; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm15[0,1,2],ymm3[3,4,5,6,7],ymm15[8,9,10],ymm3[11,12,13,14,15] 8227; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm3[4,5,6,7] 8228; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm23 8229; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} ymm6 = ymm23[2,3],mem[2,3] 8230; AVX512DQ-FCP-NEXT: vinserti32x4 $1, 96(%rdi), %ymm23, %ymm23 8231; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm15 8232; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm2 8233; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm6 ^ (ymm2 & (ymm23 ^ ymm6)) 8234; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255] 8235; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 8236; AVX512DQ-FCP-NEXT: vpternlogq 
{{.*#+}} ymm0 = ymm0 | (ymm17 & ymm4) 8237; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] 8238; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload 8239; AVX512DQ-FCP-NEXT: # zmm0 = mem ^ (zmm17 & (zmm0 ^ mem)) 8240; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm3 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,16777215,0,0] 8241; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm3 & (zmm15 ^ zmm0)) 8242; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] 8243; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm1 8244; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] 8245; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 8246; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 8247; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm21 & ymm4) 8248; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm2 8249; AVX512DQ-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload 8250; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm17 & (zmm1 ^ zmm2)) 8251; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm17 8252; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (zmm3 & (zmm17 ^ zmm1)) 8253; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm14, %xmm0 8254; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm13, %xmm1 8255; AVX512DQ-FCP-NEXT: vporq %xmm0, %xmm1, %xmm21 8256; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm11[u,u,u,u,u,1,7,13],zero,zero,zero,xmm11[5,11],zero,zero,zero 8257; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u],zero,zero,zero,xmm7[3,9,15],zero,zero,xmm7[1,7,13] 8258; AVX512DQ-FCP-NEXT: vporq %xmm0, %xmm1, %xmm28 8259; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm11 8260; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm26 ^ (ymm12 & (ymm11 ^ ymm26)) 8261; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm0 8262; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm0[0,6,12],zero,zero,zero,xmm0[4,10,u,u,u,u,u,u] 8263; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u] 8264; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm11, %xmm2 8265; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm25 8266; AVX512DQ-FCP-NEXT: vporq %xmm1, %xmm2, %xmm26 8267; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm14 8268; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm24 ^ (ymm9 & (ymm14 ^ ymm24)) 8269; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm10 8270; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14] 8271; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm2 8272; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128] 8273; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm14, %xmm4 8274; AVX512DQ-FCP-NEXT: vporq %xmm2, %xmm4, %xmm27 8275; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm30 ^ (ymm12 & (ymm31 ^ ymm30)) 8276; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm4 8277; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm6 ^ ymm23)) 8278; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm29 ^ (ymm9 & (ymm22 ^ ymm29)) 
8279; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm8 8280; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm8, %xmm1 8281; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm2 8282; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm2, %xmm7 8283; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u] 8284; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 8285; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm22 8286; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u] 8287; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm11, %xmm1 8288; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm1, %xmm3 8289; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,128,128,128,5,11,128,128,128,3,9,15] 8290; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm10, %xmm1 8291; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,3,9,15,128,128,1,7,13,128,128,128] 8292; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm14, %xmm10 8293; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm10, %xmm10 8294; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,4,10,0,6,12,18,24,30,20,26,128,128,128,128,128,128,128,128,128,128,128] 8295; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm11 8296; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] 8297; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm16 = (ymm16 & ymm18) | ymm11 8298; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,5,11,1,7,13,19,25,31,21,27,128,128,128,128,128,128,128,128,128,128,128] 8299; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4 8300; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm21 = (ymm21 & ymm18) | ymm4 8301; AVX512DQ-FCP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload 8302; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm20 ^ (ymm5 & (ymm19 ^ ymm20)) 8303; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm5, %ymm1 8304; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm4 & ymm18) 8305; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4 8306; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm5 8307; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm1 8308; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[4,10,u,u,u,u,u,u] 8309; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm14 8310; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm14 8311; AVX512DQ-FCP-NEXT: vpor %xmm11, %xmm14, %xmm11 8312; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm2 8313; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm9, %xmm0 8314; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0 8315; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] 8316; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm23 ^ (ymm2 & (ymm6 ^ ymm23)) 8317; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u] 8318; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3,4],xmm8[5,6,7] 8319; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] 8320; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm9 8321; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm9 & ymm18) 8322; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 8323; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm20 ^ (ymm2 & (ymm19 ^ ymm20)) 8324; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 8325; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 8326; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm2[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 8327; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm7 & ~ymm9) 8328; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm7 8329; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm11 8330; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm26, %zmm11, %zmm11 8331; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm11 ^ (zmm9 & (zmm8 ^ zmm11)) 8332; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm11 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0] 8333; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm11 & (zmm7 ^ zmm8)) 8334; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm8 8335; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm1 8336; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm12, %xmm8 8337; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm8, %xmm1 8338; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u] 8339; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5,6,7] 8340; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] 8341; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 8342; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 8343; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm0 & ~ymm9) 8344; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 8345; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm2 8346; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm2 8347; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm9 & (zmm1 ^ zmm2)) 8348; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm11 & (zmm0 ^ zmm1)) 8349; AVX512DQ-FCP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload 8350; AVX512DQ-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload 8351; AVX512DQ-FCP-NEXT: vpmovsxwd {{.*#+}} zmm2 = [0,0,0,0,0,4294967040,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295] 8352; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm2 & (zmm16 ^ zmm1)) 8353; AVX512DQ-FCP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload 8354; AVX512DQ-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload 8355; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm2 & (zmm21 ^ zmm1)) 8356; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm1 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,16777215,0,0] 8357; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm1 & (zmm4 ^ zmm16)) 8358; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm1 & (zmm5 ^ zmm21)) 8359; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, (%rsi) 8360; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, (%rdx) 8361; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rcx) 8362; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%r8) 8363; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%r9) 8364; AVX512DQ-FCP-NEXT: 
movq {{[0-9]+}}(%rsp), %rax 8365; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rax) 8366; AVX512DQ-FCP-NEXT: addq $40, %rsp 8367; AVX512DQ-FCP-NEXT: vzeroupper 8368; AVX512DQ-FCP-NEXT: retq 8369; 8370; AVX512BW-LABEL: load_i8_stride6_vf64: 8371; AVX512BW: # %bb.0: 8372; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 8373; AVX512BW-NEXT: vmovdqa 224(%rdi), %ymm0 8374; AVX512BW-NEXT: vmovdqa64 192(%rdi), %ymm23 8375; AVX512BW-NEXT: movw $18724, %r10w # imm = 0x4924 8376; AVX512BW-NEXT: kmovd %r10d, %k1 8377; AVX512BW-NEXT: vpblendmw %ymm0, %ymm23, %ymm9 {%k1} 8378; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u] 8379; AVX512BW-NEXT: vpshufb %xmm2, %xmm9, %xmm1 8380; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u] 8381; AVX512BW-NEXT: vextracti128 $1, %ymm9, %xmm12 8382; AVX512BW-NEXT: vpshufb %xmm4, %xmm12, %xmm3 8383; AVX512BW-NEXT: vpor %xmm1, %xmm3, %xmm5 8384; AVX512BW-NEXT: vmovdqa (%rdi), %ymm10 8385; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm3 8386; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm6 8387; AVX512BW-NEXT: vmovdqa64 128(%rdi), %ymm26 8388; AVX512BW-NEXT: vmovdqa 160(%rdi), %ymm1 8389; AVX512BW-NEXT: vpblendmw %ymm26, %ymm1, %ymm15 {%k1} 8390; AVX512BW-NEXT: vextracti32x4 $1, %ymm15, %xmm16 8391; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm17 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10] 8392; AVX512BW-NEXT: vpshufb %xmm17, %xmm16, %xmm11 8393; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm18 = [u,u,u,u,u,u,4,10,128,128,128,2,8,14,128,128] 8394; AVX512BW-NEXT: vpshufb %xmm18, %xmm15, %xmm13 8395; AVX512BW-NEXT: vpor %xmm11, %xmm13, %xmm11 8396; AVX512BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 8397; AVX512BW-NEXT: vinserti32x4 $2, %xmm5, %zmm11, %zmm11 8398; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],mem[2,3] 8399; AVX512BW-NEXT: vinserti128 $1, 96(%rdi), %ymm6, %ymm13 8400; AVX512BW-NEXT: movw $-28124, %r10w # imm = 0x9224 8401; AVX512BW-NEXT: kmovd %r10d, %k4 8402; AVX512BW-NEXT: vpblendmw %ymm5, %ymm13, %ymm19 {%k4} 8403; AVX512BW-NEXT: vpblendmw %ymm3, %ymm10, %ymm20 {%k1} 8404; AVX512BW-NEXT: vpshufb %xmm2, %xmm20, %xmm2 8405; AVX512BW-NEXT: vextracti32x4 $1, %ymm20, %xmm21 8406; AVX512BW-NEXT: vpshufb %xmm4, %xmm21, %xmm4 8407; AVX512BW-NEXT: vpor %xmm2, %xmm4, %xmm2 8408; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10] 8409; AVX512BW-NEXT: movl $4192256, %r10d # imm = 0x3FF800 8410; AVX512BW-NEXT: kmovd %r10d, %k2 8411; AVX512BW-NEXT: vpshufb %ymm6, %ymm19, %ymm2 {%k2} 8412; AVX512BW-NEXT: vmovdqu16 %zmm11, %zmm2 {%k2} 8413; AVX512BW-NEXT: vmovdqa 256(%rdi), %ymm11 8414; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm11[2,3],mem[2,3] 8415; AVX512BW-NEXT: vinserti128 $1, 288(%rdi), %ymm11, %ymm14 8416; AVX512BW-NEXT: vpblendmw %ymm4, %ymm14, %ymm22 {%k4} 8417; AVX512BW-NEXT: vpshufb %ymm6, %ymm22, %ymm7 8418; AVX512BW-NEXT: vmovdqa 320(%rdi), %ymm11 8419; AVX512BW-NEXT: vmovdqa 352(%rdi), %ymm6 8420; AVX512BW-NEXT: vpblendmw %ymm11, %ymm6, %ymm24 {%k1} 8421; AVX512BW-NEXT: vextracti32x4 $1, %ymm24, %xmm25 8422; AVX512BW-NEXT: vpshufb %xmm17, %xmm25, %xmm17 8423; AVX512BW-NEXT: vpshufb %xmm18, %xmm24, %xmm18 8424; AVX512BW-NEXT: vporq %xmm17, %xmm18, %xmm17 8425; AVX512BW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm8 8426; AVX512BW-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0,1,2],ymm8[3,4,5,6,7],ymm7[8,9,10],ymm8[11,12,13,14,15] 8427; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] 8428; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 
8429; AVX512BW-NEXT: movabsq $-8796093022208, %rdi # imm = 0xFFFFF80000000000 8430; AVX512BW-NEXT: kmovq %rdi, %k3 8431; AVX512BW-NEXT: vmovdqu8 %zmm7, %zmm2 {%k3} 8432; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = [1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u] 8433; AVX512BW-NEXT: vpshufb %xmm7, %xmm9, %xmm8 8434; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u] 8435; AVX512BW-NEXT: vpshufb %xmm9, %xmm12, %xmm12 8436; AVX512BW-NEXT: vpor %xmm8, %xmm12, %xmm8 8437; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm12 = [u,u,u,u,u,u,128,128,1,7,13,128,128,128,5,11] 8438; AVX512BW-NEXT: vpshufb %xmm12, %xmm16, %xmm16 8439; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm17 = [u,u,u,u,u,u,5,11,128,128,128,3,9,15,128,128] 8440; AVX512BW-NEXT: vpshufb %xmm17, %xmm15, %xmm15 8441; AVX512BW-NEXT: vporq %xmm16, %xmm15, %xmm15 8442; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 8443; AVX512BW-NEXT: vinserti32x4 $2, %xmm8, %zmm15, %zmm8 8444; AVX512BW-NEXT: vpshufb %xmm7, %xmm20, %xmm7 8445; AVX512BW-NEXT: vpshufb %xmm9, %xmm21, %xmm9 8446; AVX512BW-NEXT: vpor %xmm7, %xmm9, %xmm9 8447; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11] 8448; AVX512BW-NEXT: vpshufb %ymm7, %ymm19, %ymm9 {%k2} 8449; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm9 {%k2} 8450; AVX512BW-NEXT: vpshufb %ymm7, %ymm22, %ymm7 8451; AVX512BW-NEXT: vpshufb %xmm12, %xmm25, %xmm8 8452; AVX512BW-NEXT: vpshufb %xmm17, %xmm24, %xmm12 8453; AVX512BW-NEXT: vpor %xmm8, %xmm12, %xmm8 8454; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 8455; AVX512BW-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0,1,2],ymm8[3,4,5,6,7],ymm7[8,9,10],ymm8[11,12,13,14,15] 8456; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] 8457; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 8458; AVX512BW-NEXT: vmovdqu8 %zmm7, %zmm9 {%k3} 8459; AVX512BW-NEXT: vpblendmw %ymm13, %ymm5, %ymm15 {%k4} 8460; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u] 8461; AVX512BW-NEXT: movw $9362, %di # imm = 0x2492 8462; AVX512BW-NEXT: kmovd %edi, %k2 8463; AVX512BW-NEXT: vpblendmw %ymm10, %ymm3, %ymm8 {%k2} 8464; AVX512BW-NEXT: vextracti32x4 $1, %ymm8, %xmm16 8465; AVX512BW-NEXT: vpshufb %xmm7, %xmm16, %xmm12 8466; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm17 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u] 8467; AVX512BW-NEXT: vpshufb %xmm17, %xmm8, %xmm18 8468; AVX512BW-NEXT: vporq %xmm12, %xmm18, %xmm18 8469; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm19 = [2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12] 8470; AVX512BW-NEXT: movl $2095104, %edi # imm = 0x1FF800 8471; AVX512BW-NEXT: kmovd %edi, %k5 8472; AVX512BW-NEXT: vpshufb %ymm19, %ymm15, %ymm18 {%k5} 8473; AVX512BW-NEXT: vpblendmw %ymm23, %ymm0, %ymm20 {%k2} 8474; AVX512BW-NEXT: vextracti32x4 $1, %ymm20, %xmm21 8475; AVX512BW-NEXT: vpshufb %xmm7, %xmm21, %xmm7 8476; AVX512BW-NEXT: vpshufb %xmm17, %xmm20, %xmm12 8477; AVX512BW-NEXT: vpor %xmm7, %xmm12, %xmm7 8478; AVX512BW-NEXT: vpblendmw %ymm1, %ymm26, %ymm17 {%k1} 8479; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm22 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128] 8480; AVX512BW-NEXT: vpshufb %xmm22, %xmm17, %xmm12 8481; AVX512BW-NEXT: vextracti32x4 $1, %ymm17, %xmm24 8482; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm25 = [u,u,u,u,u,128,128,128,2,8,14,128,128,0,6,12] 8483; AVX512BW-NEXT: vpshufb %xmm25, %xmm24, %xmm27 8484; AVX512BW-NEXT: vporq %xmm12, %xmm27, %xmm12 8485; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 8486; 
AVX512BW-NEXT: vinserti32x4 $2, %xmm7, %zmm12, %zmm12 8487; AVX512BW-NEXT: movl $2097151, %edi # imm = 0x1FFFFF 8488; AVX512BW-NEXT: kmovq %rdi, %k6 8489; AVX512BW-NEXT: vmovdqu8 %zmm18, %zmm12 {%k6} 8490; AVX512BW-NEXT: vpblendmw %ymm14, %ymm4, %ymm7 {%k4} 8491; AVX512BW-NEXT: vpblendmw %ymm6, %ymm11, %ymm18 {%k1} 8492; AVX512BW-NEXT: vpshufb %xmm22, %xmm18, %xmm22 8493; AVX512BW-NEXT: vextracti32x4 $1, %ymm18, %xmm27 8494; AVX512BW-NEXT: vpshufb %xmm25, %xmm27, %xmm25 8495; AVX512BW-NEXT: vporq %xmm22, %xmm25, %xmm22 8496; AVX512BW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm22 8497; AVX512BW-NEXT: vpshufb %ymm19, %ymm7, %ymm22 {%k5} 8498; AVX512BW-NEXT: vinserti64x4 $1, %ymm22, %zmm0, %zmm19 8499; AVX512BW-NEXT: vmovdqu8 %zmm19, %zmm12 {%k3} 8500; AVX512BW-NEXT: movw $9289, %di # imm = 0x2449 8501; AVX512BW-NEXT: kmovd %edi, %k4 8502; AVX512BW-NEXT: vmovdqu16 %ymm14, %ymm4 {%k4} 8503; AVX512BW-NEXT: vmovdqu16 %ymm13, %ymm5 {%k4} 8504; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm13 = [128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u] 8505; AVX512BW-NEXT: vpshufb %xmm13, %xmm16, %xmm14 8506; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm16 = [3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u] 8507; AVX512BW-NEXT: vpshufb %xmm16, %xmm8, %xmm8 8508; AVX512BW-NEXT: vpor %xmm14, %xmm8, %xmm8 8509; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm14 = [3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13] 8510; AVX512BW-NEXT: vpshufb %ymm14, %ymm15, %ymm8 {%k5} 8511; AVX512BW-NEXT: vpshufb %xmm13, %xmm21, %xmm13 8512; AVX512BW-NEXT: vpshufb %xmm16, %xmm20, %xmm15 8513; AVX512BW-NEXT: vpor %xmm13, %xmm15, %xmm13 8514; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm15 = [u,u,u,u,u,1,7,13,128,128,128,5,11,128,128,128] 8515; AVX512BW-NEXT: vpshufb %xmm15, %xmm17, %xmm16 8516; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm17 = [u,u,u,u,u,128,128,128,3,9,15,128,128,1,7,13] 8517; AVX512BW-NEXT: vpshufb %xmm17, %xmm24, %xmm19 8518; AVX512BW-NEXT: vporq %xmm16, %xmm19, %xmm16 8519; AVX512BW-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 8520; AVX512BW-NEXT: vinserti32x4 $2, %xmm13, %zmm16, %zmm13 8521; AVX512BW-NEXT: vmovdqu8 %zmm8, %zmm13 {%k6} 8522; AVX512BW-NEXT: vpshufb %xmm15, %xmm18, %xmm8 8523; AVX512BW-NEXT: vpshufb %xmm17, %xmm27, %xmm15 8524; AVX512BW-NEXT: vpor %xmm8, %xmm15, %xmm8 8525; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 8526; AVX512BW-NEXT: vpshufb %ymm14, %ymm7, %ymm8 {%k5} 8527; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm7 8528; AVX512BW-NEXT: vmovdqu8 %zmm7, %zmm13 {%k3} 8529; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14] 8530; AVX512BW-NEXT: vpshufb %ymm7, %ymm5, %ymm8 8531; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,0,6,12,128,128,128,4,10,u,u,u,u,u,u] 8532; AVX512BW-NEXT: vmovdqu16 %ymm10, %ymm3 {%k1} 8533; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm15 8534; AVX512BW-NEXT: vpshufb %xmm14, %xmm15, %xmm10 8535; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm16 = [4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u] 8536; AVX512BW-NEXT: vpshufb %xmm16, %xmm3, %xmm17 8537; AVX512BW-NEXT: vporq %xmm10, %xmm17, %xmm10 8538; AVX512BW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm8[5,6,7] 8539; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm8[4,5,6,7] 8540; AVX512BW-NEXT: vmovdqu16 %ymm23, %ymm0 {%k1} 8541; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm8 8542; AVX512BW-NEXT: vpshufb %xmm14, %xmm8, %xmm14 8543; AVX512BW-NEXT: vpshufb %xmm16, %xmm0, %xmm16 8544; AVX512BW-NEXT: vporq %xmm14, %xmm16, %xmm14 8545; AVX512BW-NEXT: 
vmovdqu16 %ymm26, %ymm1 {%k2} 8546; AVX512BW-NEXT: vextracti32x4 $1, %ymm1, %xmm16 8547; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm17 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14] 8548; AVX512BW-NEXT: vpshufb %xmm17, %xmm16, %xmm18 8549; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm19 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128] 8550; AVX512BW-NEXT: vpshufb %xmm19, %xmm1, %xmm20 8551; AVX512BW-NEXT: vporq %xmm18, %xmm20, %xmm18 8552; AVX512BW-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 8553; AVX512BW-NEXT: vinserti32x4 $2, %xmm14, %zmm18, %zmm14 8554; AVX512BW-NEXT: movabsq $4398044413952, %rdi # imm = 0x3FFFFE00000 8555; AVX512BW-NEXT: kmovq %rdi, %k1 8556; AVX512BW-NEXT: vmovdqu8 %zmm14, %zmm10 {%k1} 8557; AVX512BW-NEXT: vpshufb %ymm7, %ymm4, %ymm7 8558; AVX512BW-NEXT: vmovdqu16 %ymm11, %ymm6 {%k2} 8559; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm11 8560; AVX512BW-NEXT: vpshufb %xmm17, %xmm11, %xmm14 8561; AVX512BW-NEXT: vpshufb %xmm19, %xmm6, %xmm17 8562; AVX512BW-NEXT: vporq %xmm14, %xmm17, %xmm14 8563; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 8564; AVX512BW-NEXT: movl $-2097152, %edi # imm = 0xFFE00000 8565; AVX512BW-NEXT: kmovd %edi, %k2 8566; AVX512BW-NEXT: vmovdqu8 %ymm14, %ymm7 {%k2} 8567; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 8568; AVX512BW-NEXT: vmovdqu16 %zmm7, %zmm10 {%k2} 8569; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15] 8570; AVX512BW-NEXT: vpshufb %ymm7, %ymm5, %ymm5 8571; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u] 8572; AVX512BW-NEXT: vpshufb %xmm14, %xmm15, %xmm15 8573; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm17 = [5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u] 8574; AVX512BW-NEXT: vpshufb %xmm17, %xmm3, %xmm3 8575; AVX512BW-NEXT: vpor %xmm3, %xmm15, %xmm3 8576; AVX512BW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7] 8577; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] 8578; AVX512BW-NEXT: vpshufb %xmm14, %xmm8, %xmm5 8579; AVX512BW-NEXT: vpshufb %xmm17, %xmm0, %xmm0 8580; AVX512BW-NEXT: vpor %xmm5, %xmm0, %xmm0 8581; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,128,128,128,5,11,128,128,128,3,9,15] 8582; AVX512BW-NEXT: vpshufb %xmm5, %xmm16, %xmm8 8583; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm14 = [u,u,u,u,u,3,9,15,128,128,1,7,13,128,128,128] 8584; AVX512BW-NEXT: vpshufb %xmm14, %xmm1, %xmm1 8585; AVX512BW-NEXT: vpor %xmm1, %xmm8, %xmm1 8586; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 8587; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 8588; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm3 {%k1} 8589; AVX512BW-NEXT: vpshufb %ymm7, %ymm4, %ymm0 8590; AVX512BW-NEXT: vpshufb %xmm5, %xmm11, %xmm1 8591; AVX512BW-NEXT: vpshufb %xmm14, %xmm6, %xmm4 8592; AVX512BW-NEXT: vpor %xmm1, %xmm4, %xmm1 8593; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 8594; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} 8595; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 8596; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm3 {%k2} 8597; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rsi) 8598; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rdx) 8599; AVX512BW-NEXT: vmovdqa64 %zmm12, (%rcx) 8600; AVX512BW-NEXT: vmovdqa64 %zmm13, (%r8) 8601; AVX512BW-NEXT: vmovdqa64 %zmm10, (%r9) 8602; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax) 8603; AVX512BW-NEXT: vzeroupper 8604; AVX512BW-NEXT: retq 8605; 8606; AVX512BW-FCP-LABEL: load_i8_stride6_vf64: 8607; AVX512BW-FCP: # %bb.0: 8608; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 8609; AVX512BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm0 
8610; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm23 8611; AVX512BW-FCP-NEXT: movw $18724, %r10w # imm = 0x4924 8612; AVX512BW-FCP-NEXT: kmovd %r10d, %k1 8613; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm23, %ymm9 {%k1} 8614; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u] 8615; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm9, %xmm1 8616; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u] 8617; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm9, %xmm12 8618; AVX512BW-FCP-NEXT: vpshufb %xmm4, %xmm12, %xmm3 8619; AVX512BW-FCP-NEXT: vpor %xmm1, %xmm3, %xmm5 8620; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm10 8621; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 8622; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm6 8623; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %ymm26 8624; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 8625; AVX512BW-FCP-NEXT: vpblendmw %ymm26, %ymm1, %ymm15 {%k1} 8626; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm16 8627; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm17 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10] 8628; AVX512BW-FCP-NEXT: vpshufb %xmm17, %xmm16, %xmm11 8629; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm18 = [u,u,u,u,u,u,4,10,128,128,128,2,8,14,128,128] 8630; AVX512BW-FCP-NEXT: vpshufb %xmm18, %xmm15, %xmm13 8631; AVX512BW-FCP-NEXT: vpor %xmm11, %xmm13, %xmm11 8632; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 8633; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm5, %zmm11, %zmm11 8634; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],mem[2,3] 8635; AVX512BW-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm6, %ymm13 8636; AVX512BW-FCP-NEXT: movw $-28124, %r10w # imm = 0x9224 8637; AVX512BW-FCP-NEXT: kmovd %r10d, %k4 8638; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm13, %ymm19 {%k4} 8639; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm20 {%k1} 8640; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm20, %xmm2 8641; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm21 8642; AVX512BW-FCP-NEXT: vpshufb %xmm4, %xmm21, %xmm4 8643; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm4, %xmm2 8644; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10] 8645; AVX512BW-FCP-NEXT: movl $4192256, %r10d # imm = 0x3FF800 8646; AVX512BW-FCP-NEXT: kmovd %r10d, %k2 8647; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm19, %ymm2 {%k2} 8648; AVX512BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm2 {%k2} 8649; AVX512BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm11 8650; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm11[2,3],mem[2,3] 8651; AVX512BW-FCP-NEXT: vinserti128 $1, 288(%rdi), %ymm11, %ymm14 8652; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm14, %ymm22 {%k4} 8653; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm22, %ymm7 8654; AVX512BW-FCP-NEXT: vmovdqa 320(%rdi), %ymm11 8655; AVX512BW-FCP-NEXT: vmovdqa 352(%rdi), %ymm6 8656; AVX512BW-FCP-NEXT: vpblendmw %ymm11, %ymm6, %ymm24 {%k1} 8657; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm24, %xmm25 8658; AVX512BW-FCP-NEXT: vpshufb %xmm17, %xmm25, %xmm17 8659; AVX512BW-FCP-NEXT: vpshufb %xmm18, %xmm24, %xmm18 8660; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm18, %xmm17 8661; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm8 8662; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0,1,2],ymm8[3,4,5,6,7],ymm7[8,9,10],ymm8[11,12,13,14,15] 8663; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] 8664; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 8665; AVX512BW-FCP-NEXT: movabsq $-8796093022208, %rdi # imm = 0xFFFFF80000000000 8666; AVX512BW-FCP-NEXT: kmovq %rdi, %k3 8667; 
AVX512BW-FCP-NEXT: vmovdqu8 %zmm7, %zmm2 {%k3} 8668; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u] 8669; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm8 8670; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u] 8671; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm12, %xmm12 8672; AVX512BW-FCP-NEXT: vpor %xmm8, %xmm12, %xmm8 8673; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [u,u,u,u,u,u,128,128,1,7,13,128,128,128,5,11] 8674; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm16, %xmm16 8675; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm17 = [u,u,u,u,u,u,5,11,128,128,128,3,9,15,128,128] 8676; AVX512BW-FCP-NEXT: vpshufb %xmm17, %xmm15, %xmm15 8677; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 8678; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 8679; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm15, %zmm8 8680; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm20, %xmm7 8681; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm21, %xmm9 8682; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm9, %xmm9 8683; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11] 8684; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm19, %ymm9 {%k2} 8685; AVX512BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm9 {%k2} 8686; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm22, %ymm7 8687; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm25, %xmm8 8688; AVX512BW-FCP-NEXT: vpshufb %xmm17, %xmm24, %xmm12 8689; AVX512BW-FCP-NEXT: vpor %xmm8, %xmm12, %xmm8 8690; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 8691; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0,1,2],ymm8[3,4,5,6,7],ymm7[8,9,10],ymm8[11,12,13,14,15] 8692; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] 8693; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 8694; AVX512BW-FCP-NEXT: vmovdqu8 %zmm7, %zmm9 {%k3} 8695; AVX512BW-FCP-NEXT: vpblendmw %ymm13, %ymm5, %ymm15 {%k4} 8696; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u] 8697; AVX512BW-FCP-NEXT: movw $9362, %di # imm = 0x2492 8698; AVX512BW-FCP-NEXT: kmovd %edi, %k2 8699; AVX512BW-FCP-NEXT: vpblendmw %ymm10, %ymm3, %ymm8 {%k2} 8700; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm8, %xmm16 8701; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm16, %xmm12 8702; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm17 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u] 8703; AVX512BW-FCP-NEXT: vpshufb %xmm17, %xmm8, %xmm18 8704; AVX512BW-FCP-NEXT: vporq %xmm12, %xmm18, %xmm18 8705; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm19 = [2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12] 8706; AVX512BW-FCP-NEXT: movl $2095104, %edi # imm = 0x1FF800 8707; AVX512BW-FCP-NEXT: kmovd %edi, %k5 8708; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm15, %ymm18 {%k5} 8709; AVX512BW-FCP-NEXT: vpblendmw %ymm23, %ymm0, %ymm20 {%k2} 8710; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm21 8711; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm21, %xmm7 8712; AVX512BW-FCP-NEXT: vpshufb %xmm17, %xmm20, %xmm12 8713; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm12, %xmm7 8714; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm26, %ymm17 {%k1} 8715; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm22 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128] 8716; AVX512BW-FCP-NEXT: vpshufb %xmm22, %xmm17, %xmm12 8717; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm24 8718; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm25 = [u,u,u,u,u,128,128,128,2,8,14,128,128,0,6,12] 8719; AVX512BW-FCP-NEXT: vpshufb %xmm25, %xmm24, %xmm27 8720; AVX512BW-FCP-NEXT: vporq 
%xmm12, %xmm27, %xmm12 8721; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 8722; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm12, %zmm12 8723; AVX512BW-FCP-NEXT: movl $2097151, %edi # imm = 0x1FFFFF 8724; AVX512BW-FCP-NEXT: kmovq %rdi, %k6 8725; AVX512BW-FCP-NEXT: vmovdqu8 %zmm18, %zmm12 {%k6} 8726; AVX512BW-FCP-NEXT: vpblendmw %ymm14, %ymm4, %ymm7 {%k4} 8727; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm11, %ymm18 {%k1} 8728; AVX512BW-FCP-NEXT: vpshufb %xmm22, %xmm18, %xmm22 8729; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm27 8730; AVX512BW-FCP-NEXT: vpshufb %xmm25, %xmm27, %xmm25 8731; AVX512BW-FCP-NEXT: vporq %xmm22, %xmm25, %xmm22 8732; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm22 8733; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm7, %ymm22 {%k5} 8734; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm22, %zmm0, %zmm19 8735; AVX512BW-FCP-NEXT: vmovdqu8 %zmm19, %zmm12 {%k3} 8736; AVX512BW-FCP-NEXT: movw $9289, %di # imm = 0x2449 8737; AVX512BW-FCP-NEXT: kmovd %edi, %k4 8738; AVX512BW-FCP-NEXT: vmovdqu16 %ymm14, %ymm4 {%k4} 8739; AVX512BW-FCP-NEXT: vmovdqu16 %ymm13, %ymm5 {%k4} 8740; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u] 8741; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm16, %xmm14 8742; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm16 = [3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u] 8743; AVX512BW-FCP-NEXT: vpshufb %xmm16, %xmm8, %xmm8 8744; AVX512BW-FCP-NEXT: vpor %xmm14, %xmm8, %xmm8 8745; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm14 = [3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13] 8746; AVX512BW-FCP-NEXT: vpshufb %ymm14, %ymm15, %ymm8 {%k5} 8747; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm21, %xmm13 8748; AVX512BW-FCP-NEXT: vpshufb %xmm16, %xmm20, %xmm15 8749; AVX512BW-FCP-NEXT: vpor %xmm13, %xmm15, %xmm13 8750; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [u,u,u,u,u,1,7,13,128,128,128,5,11,128,128,128] 8751; AVX512BW-FCP-NEXT: vpshufb %xmm15, %xmm17, %xmm16 8752; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm17 = [u,u,u,u,u,128,128,128,3,9,15,128,128,1,7,13] 8753; AVX512BW-FCP-NEXT: vpshufb %xmm17, %xmm24, %xmm19 8754; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm19, %xmm16 8755; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 8756; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm13, %zmm16, %zmm13 8757; AVX512BW-FCP-NEXT: vmovdqu8 %zmm8, %zmm13 {%k6} 8758; AVX512BW-FCP-NEXT: vpshufb %xmm15, %xmm18, %xmm8 8759; AVX512BW-FCP-NEXT: vpshufb %xmm17, %xmm27, %xmm15 8760; AVX512BW-FCP-NEXT: vpor %xmm8, %xmm15, %xmm8 8761; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 8762; AVX512BW-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm8 {%k5} 8763; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm7 8764; AVX512BW-FCP-NEXT: vmovdqu8 %zmm7, %zmm13 {%k3} 8765; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14] 8766; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm8 8767; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,0,6,12,128,128,128,4,10,u,u,u,u,u,u] 8768; AVX512BW-FCP-NEXT: vmovdqu16 %ymm10, %ymm3 {%k1} 8769; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm15 8770; AVX512BW-FCP-NEXT: vpshufb %xmm14, %xmm15, %xmm10 8771; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm16 = [4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u] 8772; AVX512BW-FCP-NEXT: vpshufb %xmm16, %xmm3, %xmm17 8773; AVX512BW-FCP-NEXT: vporq %xmm10, %xmm17, %xmm10 8774; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm8[5,6,7] 8775; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = 
ymm10[0,1,2,3],ymm8[4,5,6,7] 8776; AVX512BW-FCP-NEXT: vmovdqu16 %ymm23, %ymm0 {%k1} 8777; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm8 8778; AVX512BW-FCP-NEXT: vpshufb %xmm14, %xmm8, %xmm14 8779; AVX512BW-FCP-NEXT: vpshufb %xmm16, %xmm0, %xmm16 8780; AVX512BW-FCP-NEXT: vporq %xmm14, %xmm16, %xmm14 8781; AVX512BW-FCP-NEXT: vmovdqu16 %ymm26, %ymm1 {%k2} 8782; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm1, %xmm16 8783; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm17 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14] 8784; AVX512BW-FCP-NEXT: vpshufb %xmm17, %xmm16, %xmm18 8785; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm19 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128] 8786; AVX512BW-FCP-NEXT: vpshufb %xmm19, %xmm1, %xmm20 8787; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm20, %xmm18 8788; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 8789; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm14, %zmm18, %zmm14 8790; AVX512BW-FCP-NEXT: movabsq $4398044413952, %rdi # imm = 0x3FFFFE00000 8791; AVX512BW-FCP-NEXT: kmovq %rdi, %k1 8792; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm10 {%k1} 8793; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm7 8794; AVX512BW-FCP-NEXT: vmovdqu16 %ymm11, %ymm6 {%k2} 8795; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm11 8796; AVX512BW-FCP-NEXT: vpshufb %xmm17, %xmm11, %xmm14 8797; AVX512BW-FCP-NEXT: vpshufb %xmm19, %xmm6, %xmm17 8798; AVX512BW-FCP-NEXT: vporq %xmm14, %xmm17, %xmm14 8799; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 8800; AVX512BW-FCP-NEXT: movl $-2097152, %edi # imm = 0xFFE00000 8801; AVX512BW-FCP-NEXT: kmovd %edi, %k2 8802; AVX512BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm7 {%k2} 8803; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 8804; AVX512BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm10 {%k2} 8805; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15] 8806; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm5 8807; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u] 8808; AVX512BW-FCP-NEXT: vpshufb %xmm14, %xmm15, %xmm15 8809; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm17 = [5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u] 8810; AVX512BW-FCP-NEXT: vpshufb %xmm17, %xmm3, %xmm3 8811; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm15, %xmm3 8812; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7] 8813; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] 8814; AVX512BW-FCP-NEXT: vpshufb %xmm14, %xmm8, %xmm5 8815; AVX512BW-FCP-NEXT: vpshufb %xmm17, %xmm0, %xmm0 8816; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm0, %xmm0 8817; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,128,128,128,5,11,128,128,128,3,9,15] 8818; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm16, %xmm8 8819; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [u,u,u,u,u,3,9,15,128,128,1,7,13,128,128,128] 8820; AVX512BW-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm1 8821; AVX512BW-FCP-NEXT: vpor %xmm1, %xmm8, %xmm1 8822; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 8823; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 8824; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm3 {%k1} 8825; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm0 8826; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm11, %xmm1 8827; AVX512BW-FCP-NEXT: vpshufb %xmm14, %xmm6, %xmm4 8828; AVX512BW-FCP-NEXT: vpor %xmm1, %xmm4, %xmm1 8829; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 8830; AVX512BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} 8831; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 8832; AVX512BW-FCP-NEXT: vmovdqu16 
%zmm0, %zmm3 {%k2} 8833; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rsi) 8834; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, (%rdx) 8835; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, (%rcx) 8836; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, (%r8) 8837; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%r9) 8838; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) 8839; AVX512BW-FCP-NEXT: vzeroupper 8840; AVX512BW-FCP-NEXT: retq 8841; 8842; AVX512DQ-BW-LABEL: load_i8_stride6_vf64: 8843; AVX512DQ-BW: # %bb.0: 8844; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax 8845; AVX512DQ-BW-NEXT: vmovdqa 224(%rdi), %ymm0 8846; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %ymm23 8847; AVX512DQ-BW-NEXT: movw $18724, %r10w # imm = 0x4924 8848; AVX512DQ-BW-NEXT: kmovd %r10d, %k1 8849; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm23, %ymm9 {%k1} 8850; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u] 8851; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm9, %xmm1 8852; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u] 8853; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm9, %xmm12 8854; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm12, %xmm3 8855; AVX512DQ-BW-NEXT: vpor %xmm1, %xmm3, %xmm5 8856; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm10 8857; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm3 8858; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %ymm6 8859; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %ymm26 8860; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %ymm1 8861; AVX512DQ-BW-NEXT: vpblendmw %ymm26, %ymm1, %ymm15 {%k1} 8862; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm15, %xmm16 8863; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm17 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10] 8864; AVX512DQ-BW-NEXT: vpshufb %xmm17, %xmm16, %xmm11 8865; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm18 = [u,u,u,u,u,u,4,10,128,128,128,2,8,14,128,128] 8866; AVX512DQ-BW-NEXT: vpshufb %xmm18, %xmm15, %xmm13 8867; AVX512DQ-BW-NEXT: vpor %xmm11, %xmm13, %xmm11 8868; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 8869; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm5, %zmm11, %zmm11 8870; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],mem[2,3] 8871; AVX512DQ-BW-NEXT: vinserti128 $1, 96(%rdi), %ymm6, %ymm13 8872; AVX512DQ-BW-NEXT: movw $-28124, %r10w # imm = 0x9224 8873; AVX512DQ-BW-NEXT: kmovd %r10d, %k4 8874; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm13, %ymm19 {%k4} 8875; AVX512DQ-BW-NEXT: vpblendmw %ymm3, %ymm10, %ymm20 {%k1} 8876; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm20, %xmm2 8877; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm20, %xmm21 8878; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm21, %xmm4 8879; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm4, %xmm2 8880; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10] 8881; AVX512DQ-BW-NEXT: movl $4192256, %r10d # imm = 0x3FF800 8882; AVX512DQ-BW-NEXT: kmovd %r10d, %k2 8883; AVX512DQ-BW-NEXT: vpshufb %ymm6, %ymm19, %ymm2 {%k2} 8884; AVX512DQ-BW-NEXT: vmovdqu16 %zmm11, %zmm2 {%k2} 8885; AVX512DQ-BW-NEXT: vmovdqa 256(%rdi), %ymm11 8886; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm11[2,3],mem[2,3] 8887; AVX512DQ-BW-NEXT: vinserti128 $1, 288(%rdi), %ymm11, %ymm14 8888; AVX512DQ-BW-NEXT: vpblendmw %ymm4, %ymm14, %ymm22 {%k4} 8889; AVX512DQ-BW-NEXT: vpshufb %ymm6, %ymm22, %ymm7 8890; AVX512DQ-BW-NEXT: vmovdqa 320(%rdi), %ymm11 8891; AVX512DQ-BW-NEXT: vmovdqa 352(%rdi), %ymm6 8892; AVX512DQ-BW-NEXT: vpblendmw %ymm11, %ymm6, %ymm24 {%k1} 8893; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm24, %xmm25 8894; AVX512DQ-BW-NEXT: vpshufb %xmm17, %xmm25, %xmm17 8895; AVX512DQ-BW-NEXT: vpshufb %xmm18, 
%xmm24, %xmm18 8896; AVX512DQ-BW-NEXT: vporq %xmm17, %xmm18, %xmm17 8897; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm8 8898; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0,1,2],ymm8[3,4,5,6,7],ymm7[8,9,10],ymm8[11,12,13,14,15] 8899; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] 8900; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 8901; AVX512DQ-BW-NEXT: movabsq $-8796093022208, %rdi # imm = 0xFFFFF80000000000 8902; AVX512DQ-BW-NEXT: kmovq %rdi, %k3 8903; AVX512DQ-BW-NEXT: vmovdqu8 %zmm7, %zmm2 {%k3} 8904; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm7 = [1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u] 8905; AVX512DQ-BW-NEXT: vpshufb %xmm7, %xmm9, %xmm8 8906; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u] 8907; AVX512DQ-BW-NEXT: vpshufb %xmm9, %xmm12, %xmm12 8908; AVX512DQ-BW-NEXT: vpor %xmm8, %xmm12, %xmm8 8909; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm12 = [u,u,u,u,u,u,128,128,1,7,13,128,128,128,5,11] 8910; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm16, %xmm16 8911; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm17 = [u,u,u,u,u,u,5,11,128,128,128,3,9,15,128,128] 8912; AVX512DQ-BW-NEXT: vpshufb %xmm17, %xmm15, %xmm15 8913; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm15, %xmm15 8914; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 8915; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm8, %zmm15, %zmm8 8916; AVX512DQ-BW-NEXT: vpshufb %xmm7, %xmm20, %xmm7 8917; AVX512DQ-BW-NEXT: vpshufb %xmm9, %xmm21, %xmm9 8918; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm9, %xmm9 8919; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11] 8920; AVX512DQ-BW-NEXT: vpshufb %ymm7, %ymm19, %ymm9 {%k2} 8921; AVX512DQ-BW-NEXT: vmovdqu16 %zmm8, %zmm9 {%k2} 8922; AVX512DQ-BW-NEXT: vpshufb %ymm7, %ymm22, %ymm7 8923; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm25, %xmm8 8924; AVX512DQ-BW-NEXT: vpshufb %xmm17, %xmm24, %xmm12 8925; AVX512DQ-BW-NEXT: vpor %xmm8, %xmm12, %xmm8 8926; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 8927; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0,1,2],ymm8[3,4,5,6,7],ymm7[8,9,10],ymm8[11,12,13,14,15] 8928; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] 8929; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 8930; AVX512DQ-BW-NEXT: vmovdqu8 %zmm7, %zmm9 {%k3} 8931; AVX512DQ-BW-NEXT: vpblendmw %ymm13, %ymm5, %ymm15 {%k4} 8932; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm7 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u] 8933; AVX512DQ-BW-NEXT: movw $9362, %di # imm = 0x2492 8934; AVX512DQ-BW-NEXT: kmovd %edi, %k2 8935; AVX512DQ-BW-NEXT: vpblendmw %ymm10, %ymm3, %ymm8 {%k2} 8936; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm8, %xmm16 8937; AVX512DQ-BW-NEXT: vpshufb %xmm7, %xmm16, %xmm12 8938; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm17 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u] 8939; AVX512DQ-BW-NEXT: vpshufb %xmm17, %xmm8, %xmm18 8940; AVX512DQ-BW-NEXT: vporq %xmm12, %xmm18, %xmm18 8941; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm19 = [2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12] 8942; AVX512DQ-BW-NEXT: movl $2095104, %edi # imm = 0x1FF800 8943; AVX512DQ-BW-NEXT: kmovd %edi, %k5 8944; AVX512DQ-BW-NEXT: vpshufb %ymm19, %ymm15, %ymm18 {%k5} 8945; AVX512DQ-BW-NEXT: vpblendmw %ymm23, %ymm0, %ymm20 {%k2} 8946; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm20, %xmm21 8947; AVX512DQ-BW-NEXT: vpshufb %xmm7, %xmm21, %xmm7 8948; AVX512DQ-BW-NEXT: vpshufb %xmm17, %xmm20, %xmm12 8949; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm12, %xmm7 8950; 
AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm26, %ymm17 {%k1} 8951; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm22 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128] 8952; AVX512DQ-BW-NEXT: vpshufb %xmm22, %xmm17, %xmm12 8953; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm17, %xmm24 8954; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm25 = [u,u,u,u,u,128,128,128,2,8,14,128,128,0,6,12] 8955; AVX512DQ-BW-NEXT: vpshufb %xmm25, %xmm24, %xmm27 8956; AVX512DQ-BW-NEXT: vporq %xmm12, %xmm27, %xmm12 8957; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 8958; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm7, %zmm12, %zmm12 8959; AVX512DQ-BW-NEXT: movl $2097151, %edi # imm = 0x1FFFFF 8960; AVX512DQ-BW-NEXT: kmovq %rdi, %k6 8961; AVX512DQ-BW-NEXT: vmovdqu8 %zmm18, %zmm12 {%k6} 8962; AVX512DQ-BW-NEXT: vpblendmw %ymm14, %ymm4, %ymm7 {%k4} 8963; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm11, %ymm18 {%k1} 8964; AVX512DQ-BW-NEXT: vpshufb %xmm22, %xmm18, %xmm22 8965; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm18, %xmm27 8966; AVX512DQ-BW-NEXT: vpshufb %xmm25, %xmm27, %xmm25 8967; AVX512DQ-BW-NEXT: vporq %xmm22, %xmm25, %xmm22 8968; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm22 8969; AVX512DQ-BW-NEXT: vpshufb %ymm19, %ymm7, %ymm22 {%k5} 8970; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm22, %zmm0, %zmm19 8971; AVX512DQ-BW-NEXT: vmovdqu8 %zmm19, %zmm12 {%k3} 8972; AVX512DQ-BW-NEXT: movw $9289, %di # imm = 0x2449 8973; AVX512DQ-BW-NEXT: kmovd %edi, %k4 8974; AVX512DQ-BW-NEXT: vmovdqu16 %ymm14, %ymm4 {%k4} 8975; AVX512DQ-BW-NEXT: vmovdqu16 %ymm13, %ymm5 {%k4} 8976; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm13 = [128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u] 8977; AVX512DQ-BW-NEXT: vpshufb %xmm13, %xmm16, %xmm14 8978; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm16 = [3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u] 8979; AVX512DQ-BW-NEXT: vpshufb %xmm16, %xmm8, %xmm8 8980; AVX512DQ-BW-NEXT: vpor %xmm14, %xmm8, %xmm8 8981; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm14 = [3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13] 8982; AVX512DQ-BW-NEXT: vpshufb %ymm14, %ymm15, %ymm8 {%k5} 8983; AVX512DQ-BW-NEXT: vpshufb %xmm13, %xmm21, %xmm13 8984; AVX512DQ-BW-NEXT: vpshufb %xmm16, %xmm20, %xmm15 8985; AVX512DQ-BW-NEXT: vpor %xmm13, %xmm15, %xmm13 8986; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm15 = [u,u,u,u,u,1,7,13,128,128,128,5,11,128,128,128] 8987; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm17, %xmm16 8988; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm17 = [u,u,u,u,u,128,128,128,3,9,15,128,128,1,7,13] 8989; AVX512DQ-BW-NEXT: vpshufb %xmm17, %xmm24, %xmm19 8990; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm19, %xmm16 8991; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 8992; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm13, %zmm16, %zmm13 8993; AVX512DQ-BW-NEXT: vmovdqu8 %zmm8, %zmm13 {%k6} 8994; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm18, %xmm8 8995; AVX512DQ-BW-NEXT: vpshufb %xmm17, %xmm27, %xmm15 8996; AVX512DQ-BW-NEXT: vpor %xmm8, %xmm15, %xmm8 8997; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 8998; AVX512DQ-BW-NEXT: vpshufb %ymm14, %ymm7, %ymm8 {%k5} 8999; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm7 9000; AVX512DQ-BW-NEXT: vmovdqu8 %zmm7, %zmm13 {%k3} 9001; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14] 9002; AVX512DQ-BW-NEXT: vpshufb %ymm7, %ymm5, %ymm8 9003; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,0,6,12,128,128,128,4,10,u,u,u,u,u,u] 9004; AVX512DQ-BW-NEXT: vmovdqu16 %ymm10, %ymm3 {%k1} 9005; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, 
%xmm15 9006; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm15, %xmm10 9007; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm16 = [4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u] 9008; AVX512DQ-BW-NEXT: vpshufb %xmm16, %xmm3, %xmm17 9009; AVX512DQ-BW-NEXT: vporq %xmm10, %xmm17, %xmm10 9010; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm8[5,6,7] 9011; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm8[4,5,6,7] 9012; AVX512DQ-BW-NEXT: vmovdqu16 %ymm23, %ymm0 {%k1} 9013; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm8 9014; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm8, %xmm14 9015; AVX512DQ-BW-NEXT: vpshufb %xmm16, %xmm0, %xmm16 9016; AVX512DQ-BW-NEXT: vporq %xmm14, %xmm16, %xmm14 9017; AVX512DQ-BW-NEXT: vmovdqu16 %ymm26, %ymm1 {%k2} 9018; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm1, %xmm16 9019; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm17 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14] 9020; AVX512DQ-BW-NEXT: vpshufb %xmm17, %xmm16, %xmm18 9021; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm19 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128] 9022; AVX512DQ-BW-NEXT: vpshufb %xmm19, %xmm1, %xmm20 9023; AVX512DQ-BW-NEXT: vporq %xmm18, %xmm20, %xmm18 9024; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 9025; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm14, %zmm18, %zmm14 9026; AVX512DQ-BW-NEXT: movabsq $4398044413952, %rdi # imm = 0x3FFFFE00000 9027; AVX512DQ-BW-NEXT: kmovq %rdi, %k1 9028; AVX512DQ-BW-NEXT: vmovdqu8 %zmm14, %zmm10 {%k1} 9029; AVX512DQ-BW-NEXT: vpshufb %ymm7, %ymm4, %ymm7 9030; AVX512DQ-BW-NEXT: vmovdqu16 %ymm11, %ymm6 {%k2} 9031; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm6, %xmm11 9032; AVX512DQ-BW-NEXT: vpshufb %xmm17, %xmm11, %xmm14 9033; AVX512DQ-BW-NEXT: vpshufb %xmm19, %xmm6, %xmm17 9034; AVX512DQ-BW-NEXT: vporq %xmm14, %xmm17, %xmm14 9035; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 9036; AVX512DQ-BW-NEXT: movl $-2097152, %edi # imm = 0xFFE00000 9037; AVX512DQ-BW-NEXT: kmovd %edi, %k2 9038; AVX512DQ-BW-NEXT: vmovdqu8 %ymm14, %ymm7 {%k2} 9039; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 9040; AVX512DQ-BW-NEXT: vmovdqu16 %zmm7, %zmm10 {%k2} 9041; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15] 9042; AVX512DQ-BW-NEXT: vpshufb %ymm7, %ymm5, %ymm5 9043; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u] 9044; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm15, %xmm15 9045; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm17 = [5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u] 9046; AVX512DQ-BW-NEXT: vpshufb %xmm17, %xmm3, %xmm3 9047; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm15, %xmm3 9048; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7] 9049; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] 9050; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm8, %xmm5 9051; AVX512DQ-BW-NEXT: vpshufb %xmm17, %xmm0, %xmm0 9052; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm0, %xmm0 9053; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,128,128,128,5,11,128,128,128,3,9,15] 9054; AVX512DQ-BW-NEXT: vpshufb %xmm5, %xmm16, %xmm8 9055; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm14 = [u,u,u,u,u,3,9,15,128,128,1,7,13,128,128,128] 9056; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm1, %xmm1 9057; AVX512DQ-BW-NEXT: vpor %xmm1, %xmm8, %xmm1 9058; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 9059; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 9060; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm3 {%k1} 9061; AVX512DQ-BW-NEXT: vpshufb %ymm7, %ymm4, %ymm0 9062; AVX512DQ-BW-NEXT: 
vpshufb %xmm5, %xmm11, %xmm1 9063; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm6, %xmm4 9064; AVX512DQ-BW-NEXT: vpor %xmm1, %xmm4, %xmm1 9065; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 9066; AVX512DQ-BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} 9067; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 9068; AVX512DQ-BW-NEXT: vmovdqu16 %zmm0, %zmm3 {%k2} 9069; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rsi) 9070; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, (%rdx) 9071; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, (%rcx) 9072; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, (%r8) 9073; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%r9) 9074; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rax) 9075; AVX512DQ-BW-NEXT: vzeroupper 9076; AVX512DQ-BW-NEXT: retq 9077; 9078; AVX512DQ-BW-FCP-LABEL: load_i8_stride6_vf64: 9079; AVX512DQ-BW-FCP: # %bb.0: 9080; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax 9081; AVX512DQ-BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm0 9082; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm23 9083; AVX512DQ-BW-FCP-NEXT: movw $18724, %r10w # imm = 0x4924 9084; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k1 9085; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm23, %ymm9 {%k1} 9086; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u] 9087; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm9, %xmm1 9088; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u] 9089; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm9, %xmm12 9090; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm4, %xmm12, %xmm3 9091; AVX512DQ-BW-FCP-NEXT: vpor %xmm1, %xmm3, %xmm5 9092; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm10 9093; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 9094; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm6 9095; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %ymm26 9096; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 9097; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm26, %ymm1, %ymm15 {%k1} 9098; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm16 9099; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm17 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10] 9100; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm17, %xmm16, %xmm11 9101; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm18 = [u,u,u,u,u,u,4,10,128,128,128,2,8,14,128,128] 9102; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm18, %xmm15, %xmm13 9103; AVX512DQ-BW-FCP-NEXT: vpor %xmm11, %xmm13, %xmm11 9104; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 9105; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm5, %zmm11, %zmm11 9106; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],mem[2,3] 9107; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm6, %ymm13 9108; AVX512DQ-BW-FCP-NEXT: movw $-28124, %r10w # imm = 0x9224 9109; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k4 9110; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm13, %ymm19 {%k4} 9111; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm20 {%k1} 9112; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm20, %xmm2 9113; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm21 9114; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm4, %xmm21, %xmm4 9115; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm4, %xmm2 9116; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10] 9117; AVX512DQ-BW-FCP-NEXT: movl $4192256, %r10d # imm = 0x3FF800 9118; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k2 9119; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm19, %ymm2 {%k2} 9120; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm2 {%k2} 9121; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm11 9122; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = 
ymm11[2,3],mem[2,3] 9123; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 288(%rdi), %ymm11, %ymm14 9124; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm14, %ymm22 {%k4} 9125; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm22, %ymm7 9126; AVX512DQ-BW-FCP-NEXT: vmovdqa 320(%rdi), %ymm11 9127; AVX512DQ-BW-FCP-NEXT: vmovdqa 352(%rdi), %ymm6 9128; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm11, %ymm6, %ymm24 {%k1} 9129; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm24, %xmm25 9130; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm17, %xmm25, %xmm17 9131; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm18, %xmm24, %xmm18 9132; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm18, %xmm17 9133; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm8 9134; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0,1,2],ymm8[3,4,5,6,7],ymm7[8,9,10],ymm8[11,12,13,14,15] 9135; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] 9136; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 9137; AVX512DQ-BW-FCP-NEXT: movabsq $-8796093022208, %rdi # imm = 0xFFFFF80000000000 9138; AVX512DQ-BW-FCP-NEXT: kmovq %rdi, %k3 9139; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm7, %zmm2 {%k3} 9140; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u] 9141; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm8 9142; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u] 9143; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm12, %xmm12 9144; AVX512DQ-BW-FCP-NEXT: vpor %xmm8, %xmm12, %xmm8 9145; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [u,u,u,u,u,u,128,128,1,7,13,128,128,128,5,11] 9146; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm16, %xmm16 9147; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm17 = [u,u,u,u,u,u,5,11,128,128,128,3,9,15,128,128] 9148; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm17, %xmm15, %xmm15 9149; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 9150; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 9151; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm15, %zmm8 9152; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm20, %xmm7 9153; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm21, %xmm9 9154; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm9, %xmm9 9155; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11] 9156; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm19, %ymm9 {%k2} 9157; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm9 {%k2} 9158; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm22, %ymm7 9159; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm25, %xmm8 9160; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm17, %xmm24, %xmm12 9161; AVX512DQ-BW-FCP-NEXT: vpor %xmm8, %xmm12, %xmm8 9162; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 9163; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0,1,2],ymm8[3,4,5,6,7],ymm7[8,9,10],ymm8[11,12,13,14,15] 9164; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] 9165; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 9166; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm7, %zmm9 {%k3} 9167; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm13, %ymm5, %ymm15 {%k4} 9168; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u] 9169; AVX512DQ-BW-FCP-NEXT: movw $9362, %di # imm = 0x2492 9170; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2 9171; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm10, %ymm3, %ymm8 {%k2} 9172; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm8, %xmm16 9173; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm16, %xmm12 9174; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm17 = 
[2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u] 9175; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm17, %xmm8, %xmm18 9176; AVX512DQ-BW-FCP-NEXT: vporq %xmm12, %xmm18, %xmm18 9177; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm19 = [2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12] 9178; AVX512DQ-BW-FCP-NEXT: movl $2095104, %edi # imm = 0x1FF800 9179; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k5 9180; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm15, %ymm18 {%k5} 9181; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm23, %ymm0, %ymm20 {%k2} 9182; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm21 9183; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm21, %xmm7 9184; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm17, %xmm20, %xmm12 9185; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm12, %xmm7 9186; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm26, %ymm17 {%k1} 9187; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm22 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128] 9188; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm22, %xmm17, %xmm12 9189; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm24 9190; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm25 = [u,u,u,u,u,128,128,128,2,8,14,128,128,0,6,12] 9191; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm25, %xmm24, %xmm27 9192; AVX512DQ-BW-FCP-NEXT: vporq %xmm12, %xmm27, %xmm12 9193; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 9194; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm12, %zmm12 9195; AVX512DQ-BW-FCP-NEXT: movl $2097151, %edi # imm = 0x1FFFFF 9196; AVX512DQ-BW-FCP-NEXT: kmovq %rdi, %k6 9197; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm18, %zmm12 {%k6} 9198; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm14, %ymm4, %ymm7 {%k4} 9199; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm11, %ymm18 {%k1} 9200; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm22, %xmm18, %xmm22 9201; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm27 9202; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm25, %xmm27, %xmm25 9203; AVX512DQ-BW-FCP-NEXT: vporq %xmm22, %xmm25, %xmm22 9204; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm22 9205; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm7, %ymm22 {%k5} 9206; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm22, %zmm0, %zmm19 9207; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm19, %zmm12 {%k3} 9208; AVX512DQ-BW-FCP-NEXT: movw $9289, %di # imm = 0x2449 9209; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k4 9210; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm14, %ymm4 {%k4} 9211; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm13, %ymm5 {%k4} 9212; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u] 9213; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm16, %xmm14 9214; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm16 = [3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u] 9215; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm16, %xmm8, %xmm8 9216; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm8, %xmm8 9217; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm14 = [3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13] 9218; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm14, %ymm15, %ymm8 {%k5} 9219; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm21, %xmm13 9220; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm16, %xmm20, %xmm15 9221; AVX512DQ-BW-FCP-NEXT: vpor %xmm13, %xmm15, %xmm13 9222; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [u,u,u,u,u,1,7,13,128,128,128,5,11,128,128,128] 9223; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm15, %xmm17, %xmm16 9224; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm17 = [u,u,u,u,u,128,128,128,3,9,15,128,128,1,7,13] 9225; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm17, %xmm24, %xmm19 9226; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm19, %xmm16 9227; 
AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 9228; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm13, %zmm16, %zmm13 9229; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm8, %zmm13 {%k6} 9230; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm15, %xmm18, %xmm8 9231; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm17, %xmm27, %xmm15 9232; AVX512DQ-BW-FCP-NEXT: vpor %xmm8, %xmm15, %xmm8 9233; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 9234; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm8 {%k5} 9235; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm7 9236; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm7, %zmm13 {%k3} 9237; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14] 9238; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm8 9239; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,0,6,12,128,128,128,4,10,u,u,u,u,u,u] 9240; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm10, %ymm3 {%k1} 9241; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm15 9242; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm14, %xmm15, %xmm10 9243; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm16 = [4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u] 9244; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm16, %xmm3, %xmm17 9245; AVX512DQ-BW-FCP-NEXT: vporq %xmm10, %xmm17, %xmm10 9246; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm8[5,6,7] 9247; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm8[4,5,6,7] 9248; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm23, %ymm0 {%k1} 9249; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm8 9250; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm14, %xmm8, %xmm14 9251; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm16, %xmm0, %xmm16 9252; AVX512DQ-BW-FCP-NEXT: vporq %xmm14, %xmm16, %xmm14 9253; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm26, %ymm1 {%k2} 9254; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm1, %xmm16 9255; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm17 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14] 9256; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm17, %xmm16, %xmm18 9257; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm19 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128] 9258; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm19, %xmm1, %xmm20 9259; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm20, %xmm18 9260; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 9261; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm14, %zmm18, %zmm14 9262; AVX512DQ-BW-FCP-NEXT: movabsq $4398044413952, %rdi # imm = 0x3FFFFE00000 9263; AVX512DQ-BW-FCP-NEXT: kmovq %rdi, %k1 9264; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm10 {%k1} 9265; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm7 9266; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm11, %ymm6 {%k2} 9267; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm11 9268; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm17, %xmm11, %xmm14 9269; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm19, %xmm6, %xmm17 9270; AVX512DQ-BW-FCP-NEXT: vporq %xmm14, %xmm17, %xmm14 9271; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 9272; AVX512DQ-BW-FCP-NEXT: movl $-2097152, %edi # imm = 0xFFE00000 9273; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2 9274; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm7 {%k2} 9275; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 9276; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm10 {%k2} 9277; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15] 9278; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm5 9279; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u] 
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm14, %xmm15, %xmm15
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm17 = [5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm17, %xmm3, %xmm3
; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm15, %xmm3
; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm14, %xmm8, %xmm5
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm17, %xmm0, %xmm0
; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,128,128,128,5,11,128,128,128,3,9,15]
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm16, %xmm8
; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [u,u,u,u,u,3,9,15,128,128,1,7,13,128,128,128]
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm1
; AVX512DQ-BW-FCP-NEXT: vpor %xmm1, %xmm8, %xmm1
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm3 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm0
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm11, %xmm1
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm14, %xmm6, %xmm4
; AVX512DQ-BW-FCP-NEXT: vpor %xmm1, %xmm4, %xmm1
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2}
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm3 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, (%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, (%rcx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, (%r8)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%r9)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
  %wide.vec = load <384 x i8>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <384 x i8> %wide.vec, <384 x i8> poison, <64 x i32> <i32 0, i32 6, i32 12, i32 18, i32 24, i32 30, i32 36, i32 42, i32 48, i32 54, i32 60, i32 66, i32 72, i32 78, i32 84, i32 90, i32 96, i32 102, i32 108, i32 114, i32 120, i32 126, i32 132, i32 138, i32 144, i32 150, i32 156, i32 162, i32 168, i32 174, i32 180, i32 186, i32 192, i32 198, i32 204, i32 210, i32 216, i32 222, i32 228, i32 234, i32 240, i32 246, i32 252, i32 258, i32 264, i32 270, i32 276, i32 282, i32 288, i32 294, i32 300, i32 306, i32 312, i32 318, i32 324, i32 330, i32 336, i32 342, i32 348, i32 354, i32 360, i32 366, i32 372, i32 378>
  %strided.vec1 = shufflevector <384 x i8> %wide.vec, <384 x i8> poison, <64 x i32> <i32 1, i32 7, i32 13, i32 19, i32 25, i32 31, i32 37, i32 43, i32 49, i32 55, i32 61, i32 67, i32 73, i32 79, i32 85, i32 91, i32 97, i32 103, i32 109, i32 115, i32 121, i32 127, i32 133, i32 139, i32 145, i32 151, i32 157, i32 163, i32 169, i32 175, i32 181, i32 187, i32 193, i32 199, i32 205, i32 211, i32 217, i32 223, i32 229, i32 235, i32 241, i32 247, i32 253, i32 259, i32 265, i32 271, i32 277, i32 283, i32 289, i32 295, i32 301, i32 307, i32 313, i32 319, i32 325, i32 331, i32 337, i32 343, i32 349, i32 355, i32 361, i32 367, i32 373, i32 379>
  %strided.vec2 = shufflevector <384 x i8> %wide.vec, <384 x i8> poison, <64 x i32> <i32 2, i32 8, i32 14, i32 20, i32 26, i32 32, i32 38, i32 44, i32 50, i32 56, i32 62, i32 68, i32 74, i32 80, i32 86, i32 92, i32 98, i32 104, i32 110, i32 116, i32 122, i32 128, i32 134, i32 140, i32 146, i32 152, i32 158, i32 164, i32 170, i32 176, i32 182, i32 188, i32 194, i32 200, i32 206, i32 212, i32 218, i32 224, i32 230, i32 236, i32 242, i32 248, i32 254, i32 260, i32 266, i32 272, i32 278, i32 284, i32 290, i32 296, i32 302, i32 308, i32 314, i32 320, i32 326, i32 332, i32 338, i32 344, i32 350, i32 356, i32 362, i32 368, i32 374, i32 380>
  %strided.vec3 = shufflevector <384 x i8> %wide.vec, <384 x i8> poison, <64 x i32> <i32 3, i32 9, i32 15, i32 21, i32 27, i32 33, i32 39, i32 45, i32 51, i32 57, i32 63, i32 69, i32 75, i32 81, i32 87, i32 93, i32 99, i32 105, i32 111, i32 117, i32 123, i32 129, i32 135, i32 141, i32 147, i32 153, i32 159, i32 165, i32 171, i32 177, i32 183, i32 189, i32 195, i32 201, i32 207, i32 213, i32 219, i32 225, i32 231, i32 237, i32 243, i32 249, i32 255, i32 261, i32 267, i32 273, i32 279, i32 285, i32 291, i32 297, i32 303, i32 309, i32 315, i32 321, i32 327, i32 333, i32 339, i32 345, i32 351, i32 357, i32 363, i32 369, i32 375, i32 381>
  %strided.vec4 = shufflevector <384 x i8> %wide.vec, <384 x i8> poison, <64 x i32> <i32 4, i32 10, i32 16, i32 22, i32 28, i32 34, i32 40, i32 46, i32 52, i32 58, i32 64, i32 70, i32 76, i32 82, i32 88, i32 94, i32 100, i32 106, i32 112, i32 118, i32 124, i32 130, i32 136, i32 142, i32 148, i32 154, i32 160, i32 166, i32 172, i32 178, i32 184, i32 190, i32 196, i32 202, i32 208, i32 214, i32 220, i32 226, i32 232, i32 238, i32 244, i32 250, i32 256, i32 262, i32 268, i32 274, i32 280, i32 286, i32 292, i32 298, i32 304, i32 310, i32 316, i32 322, i32 328, i32 334, i32 340, i32 346, i32 352, i32 358, i32 364, i32 370, i32 376, i32 382>
  %strided.vec5 = shufflevector <384 x i8> %wide.vec, <384 x i8> poison, <64 x i32> <i32 5, i32 11, i32 17, i32 23, i32 29, i32 35, i32 41, i32 47, i32 53, i32 59, i32 65, i32 71, i32 77, i32 83, i32 89, i32 95, i32 101, i32 107, i32 113, i32 119, i32 125, i32 131, i32 137, i32 143, i32 149, i32 155, i32 161, i32 167, i32 173, i32 179, i32 185, i32 191, i32 197, i32 203, i32 209, i32 215, i32 221, i32 227, i32 233, i32 239, i32 245, i32 251, i32 257, i32 263, i32 269, i32 275, i32 281, i32 287, i32 293, i32 299, i32 305, i32 311, i32 317, i32 323, i32 329, i32 335, i32 341, i32 347, i32 353, i32 359, i32 365, i32 371, i32 377, i32 383>
  store <64 x i8> %strided.vec0, ptr %out.vec0, align 64
  store <64 x i8> %strided.vec1, ptr %out.vec1, align 64
  store <64 x i8> %strided.vec2, ptr %out.vec2, align 64
  store <64 x i8> %strided.vec3, ptr %out.vec3, align 64
  store <64 x i8> %strided.vec4, ptr %out.vec4, align 64
  store <64 x i8> %strided.vec5, ptr %out.vec5, align 64
  ret void
}
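
; A minimal sketch of the indexing rule the six shuffle masks above encode,
; assuming only the stride-6 layout already used by this test (the names J
; and k below are illustrative, not identifiers from the IR): element k of
; %strided.vecJ selects byte J + 6*k of the 384-byte interleaved block, so
; %strided.vec0 gathers bytes 0, 6, 12, ..., 378 and %strided.vec5 gathers
; bytes 5, 11, 17, ..., 383. For example, element 2 of %strided.vec4 is
; byte 4 + 6*2 = 16.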