1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
3; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx  | FileCheck %s --check-prefixes=AVX
4; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
5; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
6; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
7; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
8; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
9; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
10; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
11; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
12; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
13; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
14; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP
15
16; These patterns are produced by LoopVectorizer for interleaved loads.
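; As a rough sketch (not part of the autogenerated checks), the scalar source
; loop behind these patterns looks like the following; the names n, in and
; out0..out5 and the C-style syntax are illustrative assumptions only:
;
;   for (int i = 0; i < n; ++i) {
;     out0[i] = in[6*i + 0];
;     out1[i] = in[6*i + 1];
;     out2[i] = in[6*i + 2];
;     out3[i] = in[6*i + 3];
;     out4[i] = in[6*i + 4];
;     out5[i] = in[6*i + 5];
;   }
;
; With interleaved-access vectorization, the six strided i8 loads become one
; wide load of 6*VF bytes followed by shufflevectors that de-interleave lanes
; 0..5 (indices k, k+6, k+12, ...), which is the IR shape at the end of each
; test function below.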
17
18define void @load_i8_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind {
19; SSE-LABEL: load_i8_stride6_vf2:
20; SSE:       # %bb.0:
21; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
22; SSE-NEXT:    movdqa (%rdi), %xmm1
23; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
24; SSE-NEXT:    pand %xmm1, %xmm3
25; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm3[0,3,2,3,4,5,6,7]
26; SSE-NEXT:    packuswb %xmm2, %xmm2
27; SSE-NEXT:    pxor %xmm4, %xmm4
28; SSE-NEXT:    movdqa %xmm1, %xmm0
29; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
30; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[0,3,2,3]
31; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7]
32; SSE-NEXT:    packuswb %xmm5, %xmm5
33; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[0,2,2,3]
34; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[1,2,2,3,4,5,6,7]
35; SSE-NEXT:    packuswb %xmm6, %xmm6
36; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
37; SSE-NEXT:    movdqa %xmm0, %xmm4
38; SSE-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
39; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[3,1,2,3]
40; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
41; SSE-NEXT:    packuswb %xmm4, %xmm4
42; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
43; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
44; SSE-NEXT:    packuswb %xmm3, %xmm3
45; SSE-NEXT:    psrlq $48, %xmm1
46; SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
47; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
48; SSE-NEXT:    packuswb %xmm0, %xmm0
49; SSE-NEXT:    movd %xmm2, %edi
50; SSE-NEXT:    movw %di, (%rsi)
51; SSE-NEXT:    movd %xmm5, %esi
52; SSE-NEXT:    movw %si, (%rdx)
53; SSE-NEXT:    movd %xmm6, %edx
54; SSE-NEXT:    movw %dx, (%rcx)
55; SSE-NEXT:    movd %xmm4, %ecx
56; SSE-NEXT:    movw %cx, (%r8)
57; SSE-NEXT:    movd %xmm3, %ecx
58; SSE-NEXT:    movw %cx, (%r9)
59; SSE-NEXT:    movd %xmm0, %ecx
60; SSE-NEXT:    movw %cx, (%rax)
61; SSE-NEXT:    retq
62;
63; AVX-LABEL: load_i8_stride6_vf2:
64; AVX:       # %bb.0:
65; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
66; AVX-NEXT:    vmovdqa (%rdi), %xmm0
67; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
68; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
69; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[2,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
70; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[3,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
71; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
72; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
73; AVX-NEXT:    vpextrw $0, %xmm1, (%rsi)
74; AVX-NEXT:    vpextrw $0, %xmm2, (%rdx)
75; AVX-NEXT:    vpextrw $0, %xmm3, (%rcx)
76; AVX-NEXT:    vpextrw $0, %xmm4, (%r8)
77; AVX-NEXT:    vpextrw $0, %xmm5, (%r9)
78; AVX-NEXT:    vpextrw $0, %xmm0, (%rax)
79; AVX-NEXT:    retq
80;
81; AVX2-LABEL: load_i8_stride6_vf2:
82; AVX2:       # %bb.0:
83; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
84; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
85; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
86; AVX2-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
87; AVX2-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[2,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
88; AVX2-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[3,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
89; AVX2-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
90; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
91; AVX2-NEXT:    vpextrw $0, %xmm1, (%rsi)
92; AVX2-NEXT:    vpextrw $0, %xmm2, (%rdx)
93; AVX2-NEXT:    vpextrw $0, %xmm3, (%rcx)
94; AVX2-NEXT:    vpextrw $0, %xmm4, (%r8)
95; AVX2-NEXT:    vpextrw $0, %xmm5, (%r9)
96; AVX2-NEXT:    vpextrw $0, %xmm0, (%rax)
97; AVX2-NEXT:    retq
98;
99; AVX2-FP-LABEL: load_i8_stride6_vf2:
100; AVX2-FP:       # %bb.0:
101; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
102; AVX2-FP-NEXT:    vmovdqa (%rdi), %xmm0
103; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
104; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
105; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[2,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
106; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[3,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
107; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
108; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
109; AVX2-FP-NEXT:    vpextrw $0, %xmm1, (%rsi)
110; AVX2-FP-NEXT:    vpextrw $0, %xmm2, (%rdx)
111; AVX2-FP-NEXT:    vpextrw $0, %xmm3, (%rcx)
112; AVX2-FP-NEXT:    vpextrw $0, %xmm4, (%r8)
113; AVX2-FP-NEXT:    vpextrw $0, %xmm5, (%r9)
114; AVX2-FP-NEXT:    vpextrw $0, %xmm0, (%rax)
115; AVX2-FP-NEXT:    retq
116;
117; AVX2-FCP-LABEL: load_i8_stride6_vf2:
118; AVX2-FCP:       # %bb.0:
119; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
120; AVX2-FCP-NEXT:    vmovdqa (%rdi), %xmm0
121; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
122; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
123; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[2,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
124; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[3,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
125; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
126; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
127; AVX2-FCP-NEXT:    vpextrw $0, %xmm1, (%rsi)
128; AVX2-FCP-NEXT:    vpextrw $0, %xmm2, (%rdx)
129; AVX2-FCP-NEXT:    vpextrw $0, %xmm3, (%rcx)
130; AVX2-FCP-NEXT:    vpextrw $0, %xmm4, (%r8)
131; AVX2-FCP-NEXT:    vpextrw $0, %xmm5, (%r9)
132; AVX2-FCP-NEXT:    vpextrw $0, %xmm0, (%rax)
133; AVX2-FCP-NEXT:    retq
134;
135; AVX512-LABEL: load_i8_stride6_vf2:
136; AVX512:       # %bb.0:
137; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
138; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
139; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
140; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
141; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[2,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
142; AVX512-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[3,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
143; AVX512-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
144; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
145; AVX512-NEXT:    vpextrw $0, %xmm1, (%rsi)
146; AVX512-NEXT:    vpextrw $0, %xmm2, (%rdx)
147; AVX512-NEXT:    vpextrw $0, %xmm3, (%rcx)
148; AVX512-NEXT:    vpextrw $0, %xmm4, (%r8)
149; AVX512-NEXT:    vpextrw $0, %xmm5, (%r9)
150; AVX512-NEXT:    vpextrw $0, %xmm0, (%rax)
151; AVX512-NEXT:    retq
152;
153; AVX512-FCP-LABEL: load_i8_stride6_vf2:
154; AVX512-FCP:       # %bb.0:
155; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
156; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm0
157; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
158; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
159; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[2,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
160; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[3,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
161; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
162; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
163; AVX512-FCP-NEXT:    vpextrw $0, %xmm1, (%rsi)
164; AVX512-FCP-NEXT:    vpextrw $0, %xmm2, (%rdx)
165; AVX512-FCP-NEXT:    vpextrw $0, %xmm3, (%rcx)
166; AVX512-FCP-NEXT:    vpextrw $0, %xmm4, (%r8)
167; AVX512-FCP-NEXT:    vpextrw $0, %xmm5, (%r9)
168; AVX512-FCP-NEXT:    vpextrw $0, %xmm0, (%rax)
169; AVX512-FCP-NEXT:    retq
170;
171; AVX512DQ-LABEL: load_i8_stride6_vf2:
172; AVX512DQ:       # %bb.0:
173; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
174; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
175; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
176; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
177; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[2,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
178; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[3,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
179; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
180; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
181; AVX512DQ-NEXT:    vpextrw $0, %xmm1, (%rsi)
182; AVX512DQ-NEXT:    vpextrw $0, %xmm2, (%rdx)
183; AVX512DQ-NEXT:    vpextrw $0, %xmm3, (%rcx)
184; AVX512DQ-NEXT:    vpextrw $0, %xmm4, (%r8)
185; AVX512DQ-NEXT:    vpextrw $0, %xmm5, (%r9)
186; AVX512DQ-NEXT:    vpextrw $0, %xmm0, (%rax)
187; AVX512DQ-NEXT:    retq
188;
189; AVX512DQ-FCP-LABEL: load_i8_stride6_vf2:
190; AVX512DQ-FCP:       # %bb.0:
191; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
192; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm0
193; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
194; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
195; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[2,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
196; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[3,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
197; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
198; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
199; AVX512DQ-FCP-NEXT:    vpextrw $0, %xmm1, (%rsi)
200; AVX512DQ-FCP-NEXT:    vpextrw $0, %xmm2, (%rdx)
201; AVX512DQ-FCP-NEXT:    vpextrw $0, %xmm3, (%rcx)
202; AVX512DQ-FCP-NEXT:    vpextrw $0, %xmm4, (%r8)
203; AVX512DQ-FCP-NEXT:    vpextrw $0, %xmm5, (%r9)
204; AVX512DQ-FCP-NEXT:    vpextrw $0, %xmm0, (%rax)
205; AVX512DQ-FCP-NEXT:    retq
206;
207; AVX512BW-LABEL: load_i8_stride6_vf2:
208; AVX512BW:       # %bb.0:
209; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
210; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
211; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
212; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
213; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[2,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
214; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[3,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
215; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
216; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
217; AVX512BW-NEXT:    vpextrw $0, %xmm1, (%rsi)
218; AVX512BW-NEXT:    vpextrw $0, %xmm2, (%rdx)
219; AVX512BW-NEXT:    vpextrw $0, %xmm3, (%rcx)
220; AVX512BW-NEXT:    vpextrw $0, %xmm4, (%r8)
221; AVX512BW-NEXT:    vpextrw $0, %xmm5, (%r9)
222; AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rax)
223; AVX512BW-NEXT:    retq
224;
225; AVX512BW-FCP-LABEL: load_i8_stride6_vf2:
226; AVX512BW-FCP:       # %bb.0:
227; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
228; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
229; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
230; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
231; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[2,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
232; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[3,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
233; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
234; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
235; AVX512BW-FCP-NEXT:    vpextrw $0, %xmm1, (%rsi)
236; AVX512BW-FCP-NEXT:    vpextrw $0, %xmm2, (%rdx)
237; AVX512BW-FCP-NEXT:    vpextrw $0, %xmm3, (%rcx)
238; AVX512BW-FCP-NEXT:    vpextrw $0, %xmm4, (%r8)
239; AVX512BW-FCP-NEXT:    vpextrw $0, %xmm5, (%r9)
240; AVX512BW-FCP-NEXT:    vpextrw $0, %xmm0, (%rax)
241; AVX512BW-FCP-NEXT:    retq
242;
243; AVX512DQ-BW-LABEL: load_i8_stride6_vf2:
244; AVX512DQ-BW:       # %bb.0:
245; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
246; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %xmm0
247; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
248; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
249; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[2,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
250; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[3,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
251; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
252; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
253; AVX512DQ-BW-NEXT:    vpextrw $0, %xmm1, (%rsi)
254; AVX512DQ-BW-NEXT:    vpextrw $0, %xmm2, (%rdx)
255; AVX512DQ-BW-NEXT:    vpextrw $0, %xmm3, (%rcx)
256; AVX512DQ-BW-NEXT:    vpextrw $0, %xmm4, (%r8)
257; AVX512DQ-BW-NEXT:    vpextrw $0, %xmm5, (%r9)
258; AVX512DQ-BW-NEXT:    vpextrw $0, %xmm0, (%rax)
259; AVX512DQ-BW-NEXT:    retq
260;
261; AVX512DQ-BW-FCP-LABEL: load_i8_stride6_vf2:
262; AVX512DQ-BW-FCP:       # %bb.0:
263; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
264; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
265; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
266; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
267; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[2,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
268; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[3,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
269; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
270; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
271; AVX512DQ-BW-FCP-NEXT:    vpextrw $0, %xmm1, (%rsi)
272; AVX512DQ-BW-FCP-NEXT:    vpextrw $0, %xmm2, (%rdx)
273; AVX512DQ-BW-FCP-NEXT:    vpextrw $0, %xmm3, (%rcx)
274; AVX512DQ-BW-FCP-NEXT:    vpextrw $0, %xmm4, (%r8)
275; AVX512DQ-BW-FCP-NEXT:    vpextrw $0, %xmm5, (%r9)
276; AVX512DQ-BW-FCP-NEXT:    vpextrw $0, %xmm0, (%rax)
277; AVX512DQ-BW-FCP-NEXT:    retq
278  %wide.vec = load <12 x i8>, ptr %in.vec, align 64
279  %strided.vec0 = shufflevector <12 x i8> %wide.vec, <12 x i8> poison, <2 x i32> <i32 0, i32 6>
280  %strided.vec1 = shufflevector <12 x i8> %wide.vec, <12 x i8> poison, <2 x i32> <i32 1, i32 7>
281  %strided.vec2 = shufflevector <12 x i8> %wide.vec, <12 x i8> poison, <2 x i32> <i32 2, i32 8>
282  %strided.vec3 = shufflevector <12 x i8> %wide.vec, <12 x i8> poison, <2 x i32> <i32 3, i32 9>
283  %strided.vec4 = shufflevector <12 x i8> %wide.vec, <12 x i8> poison, <2 x i32> <i32 4, i32 10>
284  %strided.vec5 = shufflevector <12 x i8> %wide.vec, <12 x i8> poison, <2 x i32> <i32 5, i32 11>
285  store <2 x i8> %strided.vec0, ptr %out.vec0, align 64
286  store <2 x i8> %strided.vec1, ptr %out.vec1, align 64
287  store <2 x i8> %strided.vec2, ptr %out.vec2, align 64
288  store <2 x i8> %strided.vec3, ptr %out.vec3, align 64
289  store <2 x i8> %strided.vec4, ptr %out.vec4, align 64
290  store <2 x i8> %strided.vec5, ptr %out.vec5, align 64
291  ret void
292}
293
294define void @load_i8_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind {
295; SSE-LABEL: load_i8_stride6_vf4:
296; SSE:       # %bb.0:
297; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
298; SSE-NEXT:    movdqa (%rdi), %xmm5
299; SSE-NEXT:    movdqa 16(%rdi), %xmm1
300; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535]
301; SSE-NEXT:    movdqa %xmm5, %xmm2
302; SSE-NEXT:    pand %xmm0, %xmm2
303; SSE-NEXT:    pandn %xmm1, %xmm0
304; SSE-NEXT:    por %xmm2, %xmm0
305; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
306; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [16711935,16711935,16711935,16711935]
307; SSE-NEXT:    pand %xmm2, %xmm0
308; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
309; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
310; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
311; SSE-NEXT:    packuswb %xmm0, %xmm0
312; SSE-NEXT:    pxor %xmm3, %xmm3
313; SSE-NEXT:    movdqa %xmm5, %xmm7
314; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
315; SSE-NEXT:    pandn %xmm1, %xmm4
316; SSE-NEXT:    movdqa %xmm1, %xmm6
317; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[1,0],xmm5[0,0]
318; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[2,0],xmm5[2,3]
319; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [65535,65535,0,65535,65535,0,65535,65535]
320; SSE-NEXT:    pand %xmm8, %xmm1
321; SSE-NEXT:    pandn %xmm5, %xmm8
322; SSE-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
323; SSE-NEXT:    movdqa %xmm5, %xmm9
324; SSE-NEXT:    psrld $16, %xmm9
325; SSE-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
326; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
327; SSE-NEXT:    pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,7,6,7]
328; SSE-NEXT:    punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm9[2],xmm7[3],xmm9[3]
329; SSE-NEXT:    packuswb %xmm7, %xmm7
330; SSE-NEXT:    por %xmm7, %xmm4
331; SSE-NEXT:    pshuflw {{.*#+}} xmm7 = xmm6[2,1,2,3,4,5,6,7]
332; SSE-NEXT:    pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7]
333; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7
334; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
335; SSE-NEXT:    pshuflw {{.*#+}} xmm7 = xmm7[1,2,3,0,4,5,6,7]
336; SSE-NEXT:    packuswb %xmm7, %xmm7
337; SSE-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
338; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[1,2],xmm5[0,3]
339; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[2,0,1,3]
340; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7]
341; SSE-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,7]
342; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
343; SSE-NEXT:    packuswb %xmm6, %xmm6
344; SSE-NEXT:    por %xmm1, %xmm8
345; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm8[3,1,2,0]
346; SSE-NEXT:    pand %xmm2, %xmm1
347; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
348; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
349; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7]
350; SSE-NEXT:    packuswb %xmm1, %xmm1
351; SSE-NEXT:    punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3],xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7]
352; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[1,0],xmm8[0,0]
353; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,0],xmm8[2,3]
354; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm5[3,1,2,3,4,5,6,7]
355; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7]
356; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
357; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7]
358; SSE-NEXT:    packuswb %xmm2, %xmm2
359; SSE-NEXT:    movd %xmm0, (%rsi)
360; SSE-NEXT:    movd %xmm4, (%rdx)
361; SSE-NEXT:    movd %xmm7, (%rcx)
362; SSE-NEXT:    movd %xmm6, (%r8)
363; SSE-NEXT:    movd %xmm1, (%r9)
364; SSE-NEXT:    movd %xmm2, (%rax)
365; SSE-NEXT:    retq
366;
367; AVX-LABEL: load_i8_stride6_vf4:
368; AVX:       # %bb.0:
369; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
370; AVX-NEXT:    vmovdqa (%rdi), %xmm0
371; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
372; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[2,u,u,u,u,u,u,u,u,u,u,u,u]
373; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[0,6,12],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
374; AVX-NEXT:    vpor %xmm2, %xmm3, %xmm2
375; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[3,u,u,u,u,u,u,u,u,u,u,u,u]
376; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[1,7,13],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
377; AVX-NEXT:    vpor %xmm3, %xmm4, %xmm3
378; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[4,u,u,u,u,u,u,u,u,u,u,u,u]
379; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[2,8,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
380; AVX-NEXT:    vpor %xmm4, %xmm5, %xmm4
381; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
382; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm0[3,9,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
383; AVX-NEXT:    vpor %xmm5, %xmm6, %xmm5
384; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
385; AVX-NEXT:    vpshufb {{.*#+}} xmm7 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
386; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
387; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
388; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
389; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
390; AVX-NEXT:    vmovd %xmm2, (%rsi)
391; AVX-NEXT:    vmovd %xmm3, (%rdx)
392; AVX-NEXT:    vmovd %xmm4, (%rcx)
393; AVX-NEXT:    vmovd %xmm5, (%r8)
394; AVX-NEXT:    vmovd %xmm6, (%r9)
395; AVX-NEXT:    vmovd %xmm0, (%rax)
396; AVX-NEXT:    retq
397;
398; AVX2-LABEL: load_i8_stride6_vf4:
399; AVX2:       # %bb.0:
400; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
401; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
402; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
403; AVX2-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[2,u,u,u,u,u,u,u,u,u,u,u,u]
404; AVX2-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[0,6,12],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
405; AVX2-NEXT:    vpor %xmm2, %xmm3, %xmm2
406; AVX2-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[3,u,u,u,u,u,u,u,u,u,u,u,u]
407; AVX2-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[1,7,13],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
408; AVX2-NEXT:    vpor %xmm3, %xmm4, %xmm3
409; AVX2-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[4,u,u,u,u,u,u,u,u,u,u,u,u]
410; AVX2-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[2,8,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
411; AVX2-NEXT:    vpor %xmm4, %xmm5, %xmm4
412; AVX2-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
413; AVX2-NEXT:    vpshufb {{.*#+}} xmm6 = xmm0[3,9,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
414; AVX2-NEXT:    vpor %xmm5, %xmm6, %xmm5
415; AVX2-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
416; AVX2-NEXT:    vpshufb {{.*#+}} xmm7 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
417; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
418; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
419; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
420; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
421; AVX2-NEXT:    vmovd %xmm2, (%rsi)
422; AVX2-NEXT:    vmovd %xmm3, (%rdx)
423; AVX2-NEXT:    vmovd %xmm4, (%rcx)
424; AVX2-NEXT:    vmovd %xmm5, (%r8)
425; AVX2-NEXT:    vmovd %xmm6, (%r9)
426; AVX2-NEXT:    vmovd %xmm0, (%rax)
427; AVX2-NEXT:    retq
428;
429; AVX2-FP-LABEL: load_i8_stride6_vf4:
430; AVX2-FP:       # %bb.0:
431; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
432; AVX2-FP-NEXT:    vmovdqa (%rdi), %xmm0
433; AVX2-FP-NEXT:    vmovdqa 16(%rdi), %xmm1
434; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[2,u,u,u,u,u,u,u,u,u,u,u,u]
435; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[0,6,12],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
436; AVX2-FP-NEXT:    vpor %xmm2, %xmm3, %xmm2
437; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[3,u,u,u,u,u,u,u,u,u,u,u,u]
438; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[1,7,13],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
439; AVX2-FP-NEXT:    vpor %xmm3, %xmm4, %xmm3
440; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[4,u,u,u,u,u,u,u,u,u,u,u,u]
441; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[2,8,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
442; AVX2-FP-NEXT:    vpor %xmm4, %xmm5, %xmm4
443; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
444; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm0[3,9,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
445; AVX2-FP-NEXT:    vpor %xmm5, %xmm6, %xmm5
446; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
447; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
448; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
449; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
450; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
451; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
452; AVX2-FP-NEXT:    vmovd %xmm2, (%rsi)
453; AVX2-FP-NEXT:    vmovd %xmm3, (%rdx)
454; AVX2-FP-NEXT:    vmovd %xmm4, (%rcx)
455; AVX2-FP-NEXT:    vmovd %xmm5, (%r8)
456; AVX2-FP-NEXT:    vmovd %xmm6, (%r9)
457; AVX2-FP-NEXT:    vmovd %xmm0, (%rax)
458; AVX2-FP-NEXT:    retq
459;
460; AVX2-FCP-LABEL: load_i8_stride6_vf4:
461; AVX2-FCP:       # %bb.0:
462; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
463; AVX2-FCP-NEXT:    vmovdqa (%rdi), %xmm0
464; AVX2-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
465; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[2,u,u,u,u,u,u,u,u,u,u,u,u]
466; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[0,6,12],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
467; AVX2-FCP-NEXT:    vpor %xmm2, %xmm3, %xmm2
468; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[3,u,u,u,u,u,u,u,u,u,u,u,u]
469; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[1,7,13],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
470; AVX2-FCP-NEXT:    vpor %xmm3, %xmm4, %xmm3
471; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[4,u,u,u,u,u,u,u,u,u,u,u,u]
472; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[2,8,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
473; AVX2-FCP-NEXT:    vpor %xmm4, %xmm5, %xmm4
474; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
475; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm0[3,9,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
476; AVX2-FCP-NEXT:    vpor %xmm5, %xmm6, %xmm5
477; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
478; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
479; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
480; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
481; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
482; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
483; AVX2-FCP-NEXT:    vmovd %xmm2, (%rsi)
484; AVX2-FCP-NEXT:    vmovd %xmm3, (%rdx)
485; AVX2-FCP-NEXT:    vmovd %xmm4, (%rcx)
486; AVX2-FCP-NEXT:    vmovd %xmm5, (%r8)
487; AVX2-FCP-NEXT:    vmovd %xmm6, (%r9)
488; AVX2-FCP-NEXT:    vmovd %xmm0, (%rax)
489; AVX2-FCP-NEXT:    retq
490;
491; AVX512-LABEL: load_i8_stride6_vf4:
492; AVX512:       # %bb.0:
493; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
494; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
495; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
496; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[2,u,u,u,u,u,u,u,u,u,u,u,u]
497; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[0,6,12],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
498; AVX512-NEXT:    vpor %xmm2, %xmm3, %xmm2
499; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[3,u,u,u,u,u,u,u,u,u,u,u,u]
500; AVX512-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[1,7,13],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
501; AVX512-NEXT:    vpor %xmm3, %xmm4, %xmm3
502; AVX512-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[4,u,u,u,u,u,u,u,u,u,u,u,u]
503; AVX512-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[2,8,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
504; AVX512-NEXT:    vpor %xmm4, %xmm5, %xmm4
505; AVX512-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
506; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = xmm0[3,9,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
507; AVX512-NEXT:    vpor %xmm5, %xmm6, %xmm5
508; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
509; AVX512-NEXT:    vpshufb {{.*#+}} xmm7 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
510; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
511; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
512; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
513; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
514; AVX512-NEXT:    vmovd %xmm2, (%rsi)
515; AVX512-NEXT:    vmovd %xmm3, (%rdx)
516; AVX512-NEXT:    vmovd %xmm4, (%rcx)
517; AVX512-NEXT:    vmovd %xmm5, (%r8)
518; AVX512-NEXT:    vmovd %xmm6, (%r9)
519; AVX512-NEXT:    vmovd %xmm0, (%rax)
520; AVX512-NEXT:    retq
521;
522; AVX512-FCP-LABEL: load_i8_stride6_vf4:
523; AVX512-FCP:       # %bb.0:
524; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
525; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm0
526; AVX512-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
527; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[2,u,u,u,u,u,u,u,u,u,u,u,u]
528; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[0,6,12],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
529; AVX512-FCP-NEXT:    vpor %xmm2, %xmm3, %xmm2
530; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[3,u,u,u,u,u,u,u,u,u,u,u,u]
531; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[1,7,13],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
532; AVX512-FCP-NEXT:    vpor %xmm3, %xmm4, %xmm3
533; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[4,u,u,u,u,u,u,u,u,u,u,u,u]
534; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[2,8,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
535; AVX512-FCP-NEXT:    vpor %xmm4, %xmm5, %xmm4
536; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
537; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm0[3,9,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
538; AVX512-FCP-NEXT:    vpor %xmm5, %xmm6, %xmm5
539; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
540; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
541; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
542; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
543; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
544; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
545; AVX512-FCP-NEXT:    vmovd %xmm2, (%rsi)
546; AVX512-FCP-NEXT:    vmovd %xmm3, (%rdx)
547; AVX512-FCP-NEXT:    vmovd %xmm4, (%rcx)
548; AVX512-FCP-NEXT:    vmovd %xmm5, (%r8)
549; AVX512-FCP-NEXT:    vmovd %xmm6, (%r9)
550; AVX512-FCP-NEXT:    vmovd %xmm0, (%rax)
551; AVX512-FCP-NEXT:    retq
552;
553; AVX512DQ-LABEL: load_i8_stride6_vf4:
554; AVX512DQ:       # %bb.0:
555; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
556; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
557; AVX512DQ-NEXT:    vmovdqa 16(%rdi), %xmm1
558; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[2,u,u,u,u,u,u,u,u,u,u,u,u]
559; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[0,6,12],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
560; AVX512DQ-NEXT:    vpor %xmm2, %xmm3, %xmm2
561; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[3,u,u,u,u,u,u,u,u,u,u,u,u]
562; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[1,7,13],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
563; AVX512DQ-NEXT:    vpor %xmm3, %xmm4, %xmm3
564; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[4,u,u,u,u,u,u,u,u,u,u,u,u]
565; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[2,8,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
566; AVX512DQ-NEXT:    vpor %xmm4, %xmm5, %xmm4
567; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
568; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm6 = xmm0[3,9,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
569; AVX512DQ-NEXT:    vpor %xmm5, %xmm6, %xmm5
570; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
571; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
572; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
573; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
574; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
575; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
576; AVX512DQ-NEXT:    vmovd %xmm2, (%rsi)
577; AVX512DQ-NEXT:    vmovd %xmm3, (%rdx)
578; AVX512DQ-NEXT:    vmovd %xmm4, (%rcx)
579; AVX512DQ-NEXT:    vmovd %xmm5, (%r8)
580; AVX512DQ-NEXT:    vmovd %xmm6, (%r9)
581; AVX512DQ-NEXT:    vmovd %xmm0, (%rax)
582; AVX512DQ-NEXT:    retq
583;
584; AVX512DQ-FCP-LABEL: load_i8_stride6_vf4:
585; AVX512DQ-FCP:       # %bb.0:
586; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
587; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm0
588; AVX512DQ-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
589; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[2,u,u,u,u,u,u,u,u,u,u,u,u]
590; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[0,6,12],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
591; AVX512DQ-FCP-NEXT:    vpor %xmm2, %xmm3, %xmm2
592; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[3,u,u,u,u,u,u,u,u,u,u,u,u]
593; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[1,7,13],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
594; AVX512DQ-FCP-NEXT:    vpor %xmm3, %xmm4, %xmm3
595; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[4,u,u,u,u,u,u,u,u,u,u,u,u]
596; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[2,8,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
597; AVX512DQ-FCP-NEXT:    vpor %xmm4, %xmm5, %xmm4
598; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
599; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm0[3,9,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
600; AVX512DQ-FCP-NEXT:    vpor %xmm5, %xmm6, %xmm5
601; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
602; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
603; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
604; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
605; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
606; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
607; AVX512DQ-FCP-NEXT:    vmovd %xmm2, (%rsi)
608; AVX512DQ-FCP-NEXT:    vmovd %xmm3, (%rdx)
609; AVX512DQ-FCP-NEXT:    vmovd %xmm4, (%rcx)
610; AVX512DQ-FCP-NEXT:    vmovd %xmm5, (%r8)
611; AVX512DQ-FCP-NEXT:    vmovd %xmm6, (%r9)
612; AVX512DQ-FCP-NEXT:    vmovd %xmm0, (%rax)
613; AVX512DQ-FCP-NEXT:    retq
614;
615; AVX512BW-LABEL: load_i8_stride6_vf4:
616; AVX512BW:       # %bb.0:
617; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
618; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
619; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
620; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[2,u,u,u,u,u,u,u,u,u,u,u,u]
621; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[0,6,12],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
622; AVX512BW-NEXT:    vpor %xmm2, %xmm3, %xmm2
623; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[3,u,u,u,u,u,u,u,u,u,u,u,u]
624; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[1,7,13],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
625; AVX512BW-NEXT:    vpor %xmm3, %xmm4, %xmm3
626; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[4,u,u,u,u,u,u,u,u,u,u,u,u]
627; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[2,8,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
628; AVX512BW-NEXT:    vpor %xmm4, %xmm5, %xmm4
629; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
630; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm6 = xmm0[3,9,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
631; AVX512BW-NEXT:    vpor %xmm5, %xmm6, %xmm5
632; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
633; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm7 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
634; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
635; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
636; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
637; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
638; AVX512BW-NEXT:    vmovd %xmm2, (%rsi)
639; AVX512BW-NEXT:    vmovd %xmm3, (%rdx)
640; AVX512BW-NEXT:    vmovd %xmm4, (%rcx)
641; AVX512BW-NEXT:    vmovd %xmm5, (%r8)
642; AVX512BW-NEXT:    vmovd %xmm6, (%r9)
643; AVX512BW-NEXT:    vmovd %xmm0, (%rax)
644; AVX512BW-NEXT:    retq
645;
646; AVX512BW-FCP-LABEL: load_i8_stride6_vf4:
647; AVX512BW-FCP:       # %bb.0:
648; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
649; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
650; AVX512BW-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
651; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[2,u,u,u,u,u,u,u,u,u,u,u,u]
652; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[0,6,12],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
653; AVX512BW-FCP-NEXT:    vpor %xmm2, %xmm3, %xmm2
654; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[3,u,u,u,u,u,u,u,u,u,u,u,u]
655; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[1,7,13],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
656; AVX512BW-FCP-NEXT:    vpor %xmm3, %xmm4, %xmm3
657; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[4,u,u,u,u,u,u,u,u,u,u,u,u]
658; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[2,8,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
659; AVX512BW-FCP-NEXT:    vpor %xmm4, %xmm5, %xmm4
660; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
661; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm0[3,9,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
662; AVX512BW-FCP-NEXT:    vpor %xmm5, %xmm6, %xmm5
663; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
664; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
665; AVX512BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
666; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
667; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
668; AVX512BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
669; AVX512BW-FCP-NEXT:    vmovd %xmm2, (%rsi)
670; AVX512BW-FCP-NEXT:    vmovd %xmm3, (%rdx)
671; AVX512BW-FCP-NEXT:    vmovd %xmm4, (%rcx)
672; AVX512BW-FCP-NEXT:    vmovd %xmm5, (%r8)
673; AVX512BW-FCP-NEXT:    vmovd %xmm6, (%r9)
674; AVX512BW-FCP-NEXT:    vmovd %xmm0, (%rax)
675; AVX512BW-FCP-NEXT:    retq
676;
677; AVX512DQ-BW-LABEL: load_i8_stride6_vf4:
678; AVX512DQ-BW:       # %bb.0:
679; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
680; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %xmm0
681; AVX512DQ-BW-NEXT:    vmovdqa 16(%rdi), %xmm1
682; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[2,u,u,u,u,u,u,u,u,u,u,u,u]
683; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[0,6,12],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
684; AVX512DQ-BW-NEXT:    vpor %xmm2, %xmm3, %xmm2
685; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[3,u,u,u,u,u,u,u,u,u,u,u,u]
686; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[1,7,13],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
687; AVX512DQ-BW-NEXT:    vpor %xmm3, %xmm4, %xmm3
688; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[4,u,u,u,u,u,u,u,u,u,u,u,u]
689; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[2,8,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
690; AVX512DQ-BW-NEXT:    vpor %xmm4, %xmm5, %xmm4
691; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
692; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm6 = xmm0[3,9,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
693; AVX512DQ-BW-NEXT:    vpor %xmm5, %xmm6, %xmm5
694; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
695; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm7 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
696; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
697; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
698; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
699; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
700; AVX512DQ-BW-NEXT:    vmovd %xmm2, (%rsi)
701; AVX512DQ-BW-NEXT:    vmovd %xmm3, (%rdx)
702; AVX512DQ-BW-NEXT:    vmovd %xmm4, (%rcx)
703; AVX512DQ-BW-NEXT:    vmovd %xmm5, (%r8)
704; AVX512DQ-BW-NEXT:    vmovd %xmm6, (%r9)
705; AVX512DQ-BW-NEXT:    vmovd %xmm0, (%rax)
706; AVX512DQ-BW-NEXT:    retq
707;
708; AVX512DQ-BW-FCP-LABEL: load_i8_stride6_vf4:
709; AVX512DQ-BW-FCP:       # %bb.0:
710; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
711; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
712; AVX512DQ-BW-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
713; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[2,u,u,u,u,u,u,u,u,u,u,u,u]
714; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[0,6,12],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
715; AVX512DQ-BW-FCP-NEXT:    vpor %xmm2, %xmm3, %xmm2
716; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[3,u,u,u,u,u,u,u,u,u,u,u,u]
717; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[1,7,13],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
718; AVX512DQ-BW-FCP-NEXT:    vpor %xmm3, %xmm4, %xmm3
719; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[4,u,u,u,u,u,u,u,u,u,u,u,u]
720; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[2,8,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
721; AVX512DQ-BW-FCP-NEXT:    vpor %xmm4, %xmm5, %xmm4
722; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
723; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm0[3,9,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
724; AVX512DQ-BW-FCP-NEXT:    vpor %xmm5, %xmm6, %xmm5
725; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
726; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
727; AVX512DQ-BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
728; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
729; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
730; AVX512DQ-BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
731; AVX512DQ-BW-FCP-NEXT:    vmovd %xmm2, (%rsi)
732; AVX512DQ-BW-FCP-NEXT:    vmovd %xmm3, (%rdx)
733; AVX512DQ-BW-FCP-NEXT:    vmovd %xmm4, (%rcx)
734; AVX512DQ-BW-FCP-NEXT:    vmovd %xmm5, (%r8)
735; AVX512DQ-BW-FCP-NEXT:    vmovd %xmm6, (%r9)
736; AVX512DQ-BW-FCP-NEXT:    vmovd %xmm0, (%rax)
737; AVX512DQ-BW-FCP-NEXT:    retq
738  %wide.vec = load <24 x i8>, ptr %in.vec, align 64
739  %strided.vec0 = shufflevector <24 x i8> %wide.vec, <24 x i8> poison, <4 x i32> <i32 0, i32 6, i32 12, i32 18>
740  %strided.vec1 = shufflevector <24 x i8> %wide.vec, <24 x i8> poison, <4 x i32> <i32 1, i32 7, i32 13, i32 19>
741  %strided.vec2 = shufflevector <24 x i8> %wide.vec, <24 x i8> poison, <4 x i32> <i32 2, i32 8, i32 14, i32 20>
742  %strided.vec3 = shufflevector <24 x i8> %wide.vec, <24 x i8> poison, <4 x i32> <i32 3, i32 9, i32 15, i32 21>
743  %strided.vec4 = shufflevector <24 x i8> %wide.vec, <24 x i8> poison, <4 x i32> <i32 4, i32 10, i32 16, i32 22>
744  %strided.vec5 = shufflevector <24 x i8> %wide.vec, <24 x i8> poison, <4 x i32> <i32 5, i32 11, i32 17, i32 23>
745  store <4 x i8> %strided.vec0, ptr %out.vec0, align 64
746  store <4 x i8> %strided.vec1, ptr %out.vec1, align 64
747  store <4 x i8> %strided.vec2, ptr %out.vec2, align 64
748  store <4 x i8> %strided.vec3, ptr %out.vec3, align 64
749  store <4 x i8> %strided.vec4, ptr %out.vec4, align 64
750  store <4 x i8> %strided.vec5, ptr %out.vec5, align 64
751  ret void
752}
753
754define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind {
755; SSE-LABEL: load_i8_stride6_vf8:
756; SSE:       # %bb.0:
757; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
758; SSE-NEXT:    movdqa (%rdi), %xmm4
759; SSE-NEXT:    movdqa 16(%rdi), %xmm3
760; SSE-NEXT:    movdqa 32(%rdi), %xmm0
761; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [65535,0,65535,65535,0,65535,65535,0]
762; SSE-NEXT:    movdqa %xmm4, %xmm1
763; SSE-NEXT:    pand %xmm8, %xmm1
764; SSE-NEXT:    pandn %xmm3, %xmm8
765; SSE-NEXT:    por %xmm1, %xmm8
766; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm8[0,2,1,3]
767; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [16711935,16711935,16711935,16711935]
768; SSE-NEXT:    pand %xmm5, %xmm1
769; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
770; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
771; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
772; SSE-NEXT:    pshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,7,6,7]
773; SSE-NEXT:    packuswb %xmm6, %xmm6
774; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535]
775; SSE-NEXT:    pand %xmm1, %xmm6
776; SSE-NEXT:    movdqa %xmm0, %xmm7
777; SSE-NEXT:    pand %xmm5, %xmm7
778; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm7[0,1,2,1]
779; SSE-NEXT:    pshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,4,5,6,5]
780; SSE-NEXT:    packuswb %xmm9, %xmm9
781; SSE-NEXT:    movdqa %xmm1, %xmm2
782; SSE-NEXT:    pandn %xmm9, %xmm2
783; SSE-NEXT:    por %xmm6, %xmm2
784; SSE-NEXT:    pxor %xmm6, %xmm6
785; SSE-NEXT:    movdqa %xmm8, %xmm9
786; SSE-NEXT:    punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15]
787; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[2,1,0,3]
788; SSE-NEXT:    pshuflw {{.*#+}} xmm9 = xmm9[1,1,1,1,4,5,6,7]
789; SSE-NEXT:    pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,7,6,7]
790; SSE-NEXT:    movdqa {{.*#+}} xmm10 = [65535,65535,0,65535,0,0,65535,65535]
791; SSE-NEXT:    punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
792; SSE-NEXT:    pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7]
793; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3]
794; SSE-NEXT:    pshuflw {{.*#+}} xmm8 = xmm8[1,3,2,0,4,5,6,7]
795; SSE-NEXT:    pand %xmm10, %xmm8
796; SSE-NEXT:    pandn %xmm9, %xmm10
797; SSE-NEXT:    por %xmm8, %xmm10
798; SSE-NEXT:    packuswb %xmm10, %xmm10
799; SSE-NEXT:    pand %xmm1, %xmm10
800; SSE-NEXT:    movdqa %xmm0, %xmm8
801; SSE-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15]
802; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
803; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm0[2,2,3,3]
804; SSE-NEXT:    punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
805; SSE-NEXT:    packuswb %xmm9, %xmm9
806; SSE-NEXT:    pandn %xmm9, %xmm1
807; SSE-NEXT:    por %xmm10, %xmm1
808; SSE-NEXT:    movdqa {{.*#+}} xmm11 = [65535,65535,0,65535,65535,0,65535,65535]
809; SSE-NEXT:    movdqa %xmm11, %xmm9
810; SSE-NEXT:    pandn %xmm3, %xmm9
811; SSE-NEXT:    movdqa %xmm4, %xmm12
812; SSE-NEXT:    pand %xmm11, %xmm12
813; SSE-NEXT:    por %xmm9, %xmm12
814; SSE-NEXT:    pshuflw {{.*#+}} xmm9 = xmm12[2,1,2,3,4,5,6,7]
815; SSE-NEXT:    pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7]
816; SSE-NEXT:    pand %xmm5, %xmm9
817; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,3,2,3]
818; SSE-NEXT:    pshuflw {{.*#+}} xmm9 = xmm9[1,2,3,0,4,5,6,7]
819; SSE-NEXT:    pshufhw {{.*#+}} xmm13 = xmm9[0,1,2,3,5,5,5,5]
820; SSE-NEXT:    packuswb %xmm13, %xmm13
821; SSE-NEXT:    movdqa {{.*#+}} xmm9 = [255,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255]
822; SSE-NEXT:    pand %xmm9, %xmm13
823; SSE-NEXT:    pshuflw {{.*#+}} xmm10 = xmm7[0,3,2,3,4,5,6,7]
824; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[0,1,0,3]
825; SSE-NEXT:    pshufhw {{.*#+}} xmm14 = xmm10[0,1,2,3,4,4,5,6]
826; SSE-NEXT:    packuswb %xmm14, %xmm14
827; SSE-NEXT:    movdqa %xmm9, %xmm10
828; SSE-NEXT:    pandn %xmm14, %xmm10
829; SSE-NEXT:    por %xmm13, %xmm10
830; SSE-NEXT:    movdqa %xmm12, %xmm13
831; SSE-NEXT:    punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3],xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7]
832; SSE-NEXT:    pshufd {{.*#+}} xmm13 = xmm13[2,1,2,3]
833; SSE-NEXT:    pshuflw {{.*#+}} xmm13 = xmm13[3,1,2,1,4,5,6,7]
834; SSE-NEXT:    movdqa {{.*#+}} xmm14 = [0,65535,65535,0,65535,65535,65535,65535]
835; SSE-NEXT:    punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm6[8],xmm12[9],xmm6[9],xmm12[10],xmm6[10],xmm12[11],xmm6[11],xmm12[12],xmm6[12],xmm12[13],xmm6[13],xmm12[14],xmm6[14],xmm12[15],xmm6[15]
836; SSE-NEXT:    pshufd {{.*#+}} xmm12 = xmm12[0,3,2,1]
837; SSE-NEXT:    pshuflw {{.*#+}} xmm12 = xmm12[0,1,3,3,4,5,6,7]
838; SSE-NEXT:    pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,7,7,7]
839; SSE-NEXT:    pand %xmm14, %xmm12
840; SSE-NEXT:    pandn %xmm13, %xmm14
841; SSE-NEXT:    por %xmm12, %xmm14
842; SSE-NEXT:    packuswb %xmm14, %xmm14
843; SSE-NEXT:    pand %xmm9, %xmm14
844; SSE-NEXT:    movdqa %xmm8, %xmm12
845; SSE-NEXT:    shufps {{.*#+}} xmm12 = xmm12[2,0],xmm0[3,0]
846; SSE-NEXT:    movaps %xmm0, %xmm13
847; SSE-NEXT:    shufps {{.*#+}} xmm13 = xmm13[0,1],xmm12[0,2]
848; SSE-NEXT:    pshufhw {{.*#+}} xmm12 = xmm13[0,1,2,3,7,5,6,7]
849; SSE-NEXT:    pshufd {{.*#+}} xmm13 = xmm12[0,1,0,2]
850; SSE-NEXT:    packuswb %xmm13, %xmm13
851; SSE-NEXT:    movdqa %xmm9, %xmm12
852; SSE-NEXT:    pandn %xmm13, %xmm12
853; SSE-NEXT:    por %xmm14, %xmm12
854; SSE-NEXT:    pand %xmm11, %xmm3
855; SSE-NEXT:    pandn %xmm4, %xmm11
856; SSE-NEXT:    por %xmm3, %xmm11
857; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm11[3,1,2,0]
858; SSE-NEXT:    pand %xmm5, %xmm3
859; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7]
860; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,1,0,3]
861; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm3[2,1,0,3,4,5,6,7]
862; SSE-NEXT:    packuswb %xmm4, %xmm4
863; SSE-NEXT:    pand %xmm9, %xmm4
864; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,7,6,7]
865; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,1,0,2]
866; SSE-NEXT:    packuswb %xmm5, %xmm5
867; SSE-NEXT:    movdqa %xmm9, %xmm3
868; SSE-NEXT:    pandn %xmm5, %xmm3
869; SSE-NEXT:    por %xmm4, %xmm3
870; SSE-NEXT:    movdqa %xmm11, %xmm4
871; SSE-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15]
872; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
873; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
874; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,65535]
875; SSE-NEXT:    punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3],xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7]
876; SSE-NEXT:    pshufhw {{.*#+}} xmm6 = xmm11[0,1,2,3,7,5,6,7]
877; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
878; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,1,1,2,4,5,6,7]
879; SSE-NEXT:    pand %xmm5, %xmm6
880; SSE-NEXT:    pandn %xmm4, %xmm5
881; SSE-NEXT:    por %xmm6, %xmm5
882; SSE-NEXT:    packuswb %xmm5, %xmm5
883; SSE-NEXT:    pand %xmm9, %xmm5
884; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm8[0,0]
885; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm8[2,3]
886; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
887; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
888; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,7]
889; SSE-NEXT:    packuswb %xmm0, %xmm0
890; SSE-NEXT:    pandn %xmm0, %xmm9
891; SSE-NEXT:    por %xmm5, %xmm9
892; SSE-NEXT:    movq %xmm2, (%rsi)
893; SSE-NEXT:    movq %xmm1, (%rdx)
894; SSE-NEXT:    movq %xmm10, (%rcx)
895; SSE-NEXT:    movq %xmm12, (%r8)
896; SSE-NEXT:    movq %xmm3, (%r9)
897; SSE-NEXT:    movq %xmm9, (%rax)
898; SSE-NEXT:    retq
899;
900; AVX-LABEL: load_i8_stride6_vf8:
901; AVX:       # %bb.0:
902; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
903; AVX-NEXT:    vmovdqa (%rdi), %xmm1
904; AVX-NEXT:    vmovdqa 16(%rdi), %xmm2
905; AVX-NEXT:    vmovdqa 32(%rdi), %xmm0
906; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm2[2,8,14,u,u,u,u,u,u,u,u,u,u]
907; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u]
908; AVX-NEXT:    vpor %xmm3, %xmm4, %xmm3
909; AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
910; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5,6,7]
911; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,xmm0[4,10,u,u,u,u,u,u,u,u]
912; AVX-NEXT:    vpor %xmm5, %xmm3, %xmm3
913; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm2[3,9,15,u,u,u,u,u,u,u,u,u,u]
914; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u]
915; AVX-NEXT:    vpor %xmm5, %xmm6, %xmm5
916; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4,5,6,7]
917; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,xmm0[5,11,u,u,u,u,u,u,u,u]
918; AVX-NEXT:    vpor %xmm5, %xmm4, %xmm4
919; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u,u,u,u]
920; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[2,8,14],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
921; AVX-NEXT:    vpor %xmm5, %xmm6, %xmm5
922; AVX-NEXT:    vmovq {{.*#+}} xmm6 = [0,1,2,3,4,128,128,128,0,0,0,0,0,0,0,0]
923; AVX-NEXT:    vpshufb %xmm6, %xmm5, %xmm5
924; AVX-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,xmm0[0,6,12,u,u,u,u,u,u,u,u]
925; AVX-NEXT:    vpor %xmm7, %xmm5, %xmm5
926; AVX-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u,u,u,u]
927; AVX-NEXT:    vpshufb {{.*#+}} xmm8 = xmm1[3,9,15],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
928; AVX-NEXT:    vpor %xmm7, %xmm8, %xmm7
929; AVX-NEXT:    vpshufb %xmm6, %xmm7, %xmm7
930; AVX-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,xmm0[1,7,13,u,u,u,u,u,u,u,u]
931; AVX-NEXT:    vpor %xmm7, %xmm8, %xmm7
932; AVX-NEXT:    vpshufb {{.*#+}} xmm8 = xmm1[4,10],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
933; AVX-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u,u,u,u]
934; AVX-NEXT:    vpor %xmm8, %xmm9, %xmm8
935; AVX-NEXT:    vpshufb %xmm6, %xmm8, %xmm8
936; AVX-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
937; AVX-NEXT:    vpor %xmm9, %xmm8, %xmm8
938; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[5,11],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
939; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u,u,u,u]
940; AVX-NEXT:    vpor %xmm1, %xmm2, %xmm1
941; AVX-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
942; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u]
943; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
944; AVX-NEXT:    vmovq %xmm3, (%rsi)
945; AVX-NEXT:    vmovq %xmm4, (%rdx)
946; AVX-NEXT:    vmovq %xmm5, (%rcx)
947; AVX-NEXT:    vmovq %xmm7, (%r8)
948; AVX-NEXT:    vmovq %xmm8, (%r9)
949; AVX-NEXT:    vmovq %xmm0, (%rax)
950; AVX-NEXT:    retq
951;
952; AVX2-LABEL: load_i8_stride6_vf8:
953; AVX2:       # %bb.0:
954; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
955; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
956; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
957; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
958; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
959; AVX2-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u]
960; AVX2-NEXT:    vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u]
961; AVX2-NEXT:    vpor %xmm4, %xmm5, %xmm4
962; AVX2-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u]
963; AVX2-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u]
964; AVX2-NEXT:    vpor %xmm3, %xmm2, %xmm2
965; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
966; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm5
967; AVX2-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
968; AVX2-NEXT:    vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u]
969; AVX2-NEXT:    vpor %xmm6, %xmm7, %xmm6
970; AVX2-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
971; AVX2-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u]
972; AVX2-NEXT:    vpor %xmm5, %xmm3, %xmm3
973; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15]
974; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
975; AVX2-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
976; AVX2-NEXT:    vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
977; AVX2-NEXT:    vpor %xmm5, %xmm7, %xmm5
978; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
979; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u]
980; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
981; AVX2-NEXT:    vmovq %xmm4, (%rsi)
982; AVX2-NEXT:    vmovq %xmm2, (%rdx)
983; AVX2-NEXT:    vmovq %xmm6, (%rcx)
984; AVX2-NEXT:    vmovq %xmm3, (%r8)
985; AVX2-NEXT:    vmovq %xmm5, (%r9)
986; AVX2-NEXT:    vmovq %xmm0, (%rax)
987; AVX2-NEXT:    vzeroupper
988; AVX2-NEXT:    retq
989;
990; AVX2-FP-LABEL: load_i8_stride6_vf8:
991; AVX2-FP:       # %bb.0:
992; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
993; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm0
994; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm1
995; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
996; AVX2-FP-NEXT:    vextracti128 $1, %ymm2, %xmm3
997; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u]
998; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u]
999; AVX2-FP-NEXT:    vpor %xmm4, %xmm5, %xmm4
1000; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u]
1001; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u]
1002; AVX2-FP-NEXT:    vpor %xmm3, %xmm2, %xmm2
1003; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
1004; AVX2-FP-NEXT:    vextracti128 $1, %ymm3, %xmm5
1005; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
1006; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u]
1007; AVX2-FP-NEXT:    vpor %xmm6, %xmm7, %xmm6
1008; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
1009; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u]
1010; AVX2-FP-NEXT:    vpor %xmm5, %xmm3, %xmm3
1011; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15]
1012; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm1
1013; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
1014; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
1015; AVX2-FP-NEXT:    vpor %xmm5, %xmm7, %xmm5
1016; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
1017; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u]
1018; AVX2-FP-NEXT:    vpor %xmm1, %xmm0, %xmm0
1019; AVX2-FP-NEXT:    vmovq %xmm4, (%rsi)
1020; AVX2-FP-NEXT:    vmovq %xmm2, (%rdx)
1021; AVX2-FP-NEXT:    vmovq %xmm6, (%rcx)
1022; AVX2-FP-NEXT:    vmovq %xmm3, (%r8)
1023; AVX2-FP-NEXT:    vmovq %xmm5, (%r9)
1024; AVX2-FP-NEXT:    vmovq %xmm0, (%rax)
1025; AVX2-FP-NEXT:    vzeroupper
1026; AVX2-FP-NEXT:    retq
1027;
1028; AVX2-FCP-LABEL: load_i8_stride6_vf8:
1029; AVX2-FCP:       # %bb.0:
1030; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1031; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm0
1032; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
1033; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
1034; AVX2-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
1035; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u]
1036; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u]
1037; AVX2-FCP-NEXT:    vpor %xmm4, %xmm5, %xmm4
1038; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u]
1039; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u]
1040; AVX2-FCP-NEXT:    vpor %xmm3, %xmm2, %xmm2
1041; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
1042; AVX2-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm5
1043; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
1044; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u]
1045; AVX2-FCP-NEXT:    vpor %xmm6, %xmm7, %xmm6
1046; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
1047; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u]
1048; AVX2-FCP-NEXT:    vpor %xmm5, %xmm3, %xmm3
1049; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15]
1050; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
1051; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
1052; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
1053; AVX2-FCP-NEXT:    vpor %xmm5, %xmm7, %xmm5
1054; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
1055; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u]
1056; AVX2-FCP-NEXT:    vpor %xmm1, %xmm0, %xmm0
1057; AVX2-FCP-NEXT:    vmovq %xmm4, (%rsi)
1058; AVX2-FCP-NEXT:    vmovq %xmm2, (%rdx)
1059; AVX2-FCP-NEXT:    vmovq %xmm6, (%rcx)
1060; AVX2-FCP-NEXT:    vmovq %xmm3, (%r8)
1061; AVX2-FCP-NEXT:    vmovq %xmm5, (%r9)
1062; AVX2-FCP-NEXT:    vmovq %xmm0, (%rax)
1063; AVX2-FCP-NEXT:    vzeroupper
1064; AVX2-FCP-NEXT:    retq
1065;
1066; AVX512-LABEL: load_i8_stride6_vf8:
1067; AVX512:       # %bb.0:
1068; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1069; AVX512-NEXT:    vmovdqa (%rdi), %ymm0
1070; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm1
1071; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
1072; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
1073; AVX512-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u]
1074; AVX512-NEXT:    vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u]
1075; AVX512-NEXT:    vpor %xmm4, %xmm5, %xmm4
1076; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u]
1077; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u]
1078; AVX512-NEXT:    vpor %xmm3, %xmm2, %xmm2
1079; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
1080; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm5
1081; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
1082; AVX512-NEXT:    vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u]
1083; AVX512-NEXT:    vpor %xmm6, %xmm7, %xmm6
1084; AVX512-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
1085; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u]
1086; AVX512-NEXT:    vpor %xmm5, %xmm3, %xmm3
1087; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15]
1088; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
1089; AVX512-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
1090; AVX512-NEXT:    vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
1091; AVX512-NEXT:    vpor %xmm5, %xmm7, %xmm5
1092; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
1093; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u]
1094; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
1095; AVX512-NEXT:    vmovq %xmm4, (%rsi)
1096; AVX512-NEXT:    vmovq %xmm2, (%rdx)
1097; AVX512-NEXT:    vmovq %xmm6, (%rcx)
1098; AVX512-NEXT:    vmovq %xmm3, (%r8)
1099; AVX512-NEXT:    vmovq %xmm5, (%r9)
1100; AVX512-NEXT:    vmovq %xmm0, (%rax)
1101; AVX512-NEXT:    vzeroupper
1102; AVX512-NEXT:    retq
1103;
1104; AVX512-FCP-LABEL: load_i8_stride6_vf8:
1105; AVX512-FCP:       # %bb.0:
1106; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1107; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm0
1108; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
1109; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
1110; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
1111; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u]
1112; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u]
1113; AVX512-FCP-NEXT:    vpor %xmm4, %xmm5, %xmm4
1114; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u]
1115; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u]
1116; AVX512-FCP-NEXT:    vpor %xmm3, %xmm2, %xmm2
1117; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
1118; AVX512-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm5
1119; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
1120; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u]
1121; AVX512-FCP-NEXT:    vpor %xmm6, %xmm7, %xmm6
1122; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
1123; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u]
1124; AVX512-FCP-NEXT:    vpor %xmm5, %xmm3, %xmm3
1125; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15]
1126; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
1127; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
1128; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
1129; AVX512-FCP-NEXT:    vpor %xmm5, %xmm7, %xmm5
1130; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
1131; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u]
1132; AVX512-FCP-NEXT:    vpor %xmm1, %xmm0, %xmm0
1133; AVX512-FCP-NEXT:    vmovq %xmm4, (%rsi)
1134; AVX512-FCP-NEXT:    vmovq %xmm2, (%rdx)
1135; AVX512-FCP-NEXT:    vmovq %xmm6, (%rcx)
1136; AVX512-FCP-NEXT:    vmovq %xmm3, (%r8)
1137; AVX512-FCP-NEXT:    vmovq %xmm5, (%r9)
1138; AVX512-FCP-NEXT:    vmovq %xmm0, (%rax)
1139; AVX512-FCP-NEXT:    vzeroupper
1140; AVX512-FCP-NEXT:    retq
1141;
1142; AVX512DQ-LABEL: load_i8_stride6_vf8:
1143; AVX512DQ:       # %bb.0:
1144; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1145; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
1146; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm1
1147; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
1148; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
1149; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u]
1150; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u]
1151; AVX512DQ-NEXT:    vpor %xmm4, %xmm5, %xmm4
1152; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u]
1153; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u]
1154; AVX512DQ-NEXT:    vpor %xmm3, %xmm2, %xmm2
1155; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
1156; AVX512DQ-NEXT:    vextracti128 $1, %ymm3, %xmm5
1157; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
1158; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u]
1159; AVX512DQ-NEXT:    vpor %xmm6, %xmm7, %xmm6
1160; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
1161; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u]
1162; AVX512DQ-NEXT:    vpor %xmm5, %xmm3, %xmm3
1163; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15]
1164; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
1165; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
1166; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
1167; AVX512DQ-NEXT:    vpor %xmm5, %xmm7, %xmm5
1168; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
1169; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u]
1170; AVX512DQ-NEXT:    vpor %xmm1, %xmm0, %xmm0
1171; AVX512DQ-NEXT:    vmovq %xmm4, (%rsi)
1172; AVX512DQ-NEXT:    vmovq %xmm2, (%rdx)
1173; AVX512DQ-NEXT:    vmovq %xmm6, (%rcx)
1174; AVX512DQ-NEXT:    vmovq %xmm3, (%r8)
1175; AVX512DQ-NEXT:    vmovq %xmm5, (%r9)
1176; AVX512DQ-NEXT:    vmovq %xmm0, (%rax)
1177; AVX512DQ-NEXT:    vzeroupper
1178; AVX512DQ-NEXT:    retq
1179;
1180; AVX512DQ-FCP-LABEL: load_i8_stride6_vf8:
1181; AVX512DQ-FCP:       # %bb.0:
1182; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1183; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm0
1184; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
1185; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
1186; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
1187; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u]
1188; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u]
1189; AVX512DQ-FCP-NEXT:    vpor %xmm4, %xmm5, %xmm4
1190; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u]
1191; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u]
1192; AVX512DQ-FCP-NEXT:    vpor %xmm3, %xmm2, %xmm2
1193; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
1194; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm5
1195; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
1196; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u]
1197; AVX512DQ-FCP-NEXT:    vpor %xmm6, %xmm7, %xmm6
1198; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
1199; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u]
1200; AVX512DQ-FCP-NEXT:    vpor %xmm5, %xmm3, %xmm3
1201; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15]
1202; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
1203; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
1204; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
1205; AVX512DQ-FCP-NEXT:    vpor %xmm5, %xmm7, %xmm5
1206; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
1207; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u]
1208; AVX512DQ-FCP-NEXT:    vpor %xmm1, %xmm0, %xmm0
1209; AVX512DQ-FCP-NEXT:    vmovq %xmm4, (%rsi)
1210; AVX512DQ-FCP-NEXT:    vmovq %xmm2, (%rdx)
1211; AVX512DQ-FCP-NEXT:    vmovq %xmm6, (%rcx)
1212; AVX512DQ-FCP-NEXT:    vmovq %xmm3, (%r8)
1213; AVX512DQ-FCP-NEXT:    vmovq %xmm5, (%r9)
1214; AVX512DQ-FCP-NEXT:    vmovq %xmm0, (%rax)
1215; AVX512DQ-FCP-NEXT:    vzeroupper
1216; AVX512DQ-FCP-NEXT:    retq
1217;
1218; AVX512BW-LABEL: load_i8_stride6_vf8:
1219; AVX512BW:       # %bb.0:
1220; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1221; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
1222; AVX512BW-NEXT:    vmovdqa 32(%rdi), %ymm1
1223; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
1224; AVX512BW-NEXT:    vextracti128 $1, %ymm2, %xmm3
1225; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u]
1226; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u]
1227; AVX512BW-NEXT:    vpor %xmm4, %xmm5, %xmm4
1228; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u]
1229; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u]
1230; AVX512BW-NEXT:    vpor %xmm3, %xmm2, %xmm2
1231; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
1232; AVX512BW-NEXT:    vextracti128 $1, %ymm3, %xmm5
1233; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
1234; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u]
1235; AVX512BW-NEXT:    vpor %xmm6, %xmm7, %xmm6
1236; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
1237; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u]
1238; AVX512BW-NEXT:    vpor %xmm5, %xmm3, %xmm3
1239; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15]
1240; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
1241; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
1242; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
1243; AVX512BW-NEXT:    vpor %xmm5, %xmm7, %xmm5
1244; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
1245; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u]
1246; AVX512BW-NEXT:    vpor %xmm1, %xmm0, %xmm0
1247; AVX512BW-NEXT:    vmovq %xmm4, (%rsi)
1248; AVX512BW-NEXT:    vmovq %xmm2, (%rdx)
1249; AVX512BW-NEXT:    vmovq %xmm6, (%rcx)
1250; AVX512BW-NEXT:    vmovq %xmm3, (%r8)
1251; AVX512BW-NEXT:    vmovq %xmm5, (%r9)
1252; AVX512BW-NEXT:    vmovq %xmm0, (%rax)
1253; AVX512BW-NEXT:    vzeroupper
1254; AVX512BW-NEXT:    retq
1255;
1256; AVX512BW-FCP-LABEL: load_i8_stride6_vf8:
1257; AVX512BW-FCP:       # %bb.0:
1258; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1259; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %ymm0
1260; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
1261; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
1262; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
1263; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u]
1264; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u]
1265; AVX512BW-FCP-NEXT:    vpor %xmm4, %xmm5, %xmm4
1266; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u]
1267; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u]
1268; AVX512BW-FCP-NEXT:    vpor %xmm3, %xmm2, %xmm2
1269; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
1270; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm5
1271; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
1272; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u]
1273; AVX512BW-FCP-NEXT:    vpor %xmm6, %xmm7, %xmm6
1274; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
1275; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u]
1276; AVX512BW-FCP-NEXT:    vpor %xmm5, %xmm3, %xmm3
1277; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15]
1278; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
1279; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
1280; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
1281; AVX512BW-FCP-NEXT:    vpor %xmm5, %xmm7, %xmm5
1282; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
1283; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u]
1284; AVX512BW-FCP-NEXT:    vpor %xmm1, %xmm0, %xmm0
1285; AVX512BW-FCP-NEXT:    vmovq %xmm4, (%rsi)
1286; AVX512BW-FCP-NEXT:    vmovq %xmm2, (%rdx)
1287; AVX512BW-FCP-NEXT:    vmovq %xmm6, (%rcx)
1288; AVX512BW-FCP-NEXT:    vmovq %xmm3, (%r8)
1289; AVX512BW-FCP-NEXT:    vmovq %xmm5, (%r9)
1290; AVX512BW-FCP-NEXT:    vmovq %xmm0, (%rax)
1291; AVX512BW-FCP-NEXT:    vzeroupper
1292; AVX512BW-FCP-NEXT:    retq
1293;
1294; AVX512DQ-BW-LABEL: load_i8_stride6_vf8:
1295; AVX512DQ-BW:       # %bb.0:
1296; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1297; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %ymm0
1298; AVX512DQ-BW-NEXT:    vmovdqa 32(%rdi), %ymm1
1299; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
1300; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm2, %xmm3
1301; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u]
1302; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u]
1303; AVX512DQ-BW-NEXT:    vpor %xmm4, %xmm5, %xmm4
1304; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u]
1305; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u]
1306; AVX512DQ-BW-NEXT:    vpor %xmm3, %xmm2, %xmm2
1307; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
1308; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm3, %xmm5
1309; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
1310; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u]
1311; AVX512DQ-BW-NEXT:    vpor %xmm6, %xmm7, %xmm6
1312; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
1313; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u]
1314; AVX512DQ-BW-NEXT:    vpor %xmm5, %xmm3, %xmm3
1315; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15]
1316; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
1317; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
1318; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
1319; AVX512DQ-BW-NEXT:    vpor %xmm5, %xmm7, %xmm5
1320; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
1321; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u]
1322; AVX512DQ-BW-NEXT:    vpor %xmm1, %xmm0, %xmm0
1323; AVX512DQ-BW-NEXT:    vmovq %xmm4, (%rsi)
1324; AVX512DQ-BW-NEXT:    vmovq %xmm2, (%rdx)
1325; AVX512DQ-BW-NEXT:    vmovq %xmm6, (%rcx)
1326; AVX512DQ-BW-NEXT:    vmovq %xmm3, (%r8)
1327; AVX512DQ-BW-NEXT:    vmovq %xmm5, (%r9)
1328; AVX512DQ-BW-NEXT:    vmovq %xmm0, (%rax)
1329; AVX512DQ-BW-NEXT:    vzeroupper
1330; AVX512DQ-BW-NEXT:    retq
1331;
1332; AVX512DQ-BW-FCP-LABEL: load_i8_stride6_vf8:
1333; AVX512DQ-BW-FCP:       # %bb.0:
1334; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1335; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %ymm0
1336; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
1337; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
1338; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
1339; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u]
1340; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u]
1341; AVX512DQ-BW-FCP-NEXT:    vpor %xmm4, %xmm5, %xmm4
1342; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u]
1343; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u]
1344; AVX512DQ-BW-FCP-NEXT:    vpor %xmm3, %xmm2, %xmm2
1345; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
1346; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm5
1347; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
1348; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u]
1349; AVX512DQ-BW-FCP-NEXT:    vpor %xmm6, %xmm7, %xmm6
1350; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
1351; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u]
1352; AVX512DQ-BW-FCP-NEXT:    vpor %xmm5, %xmm3, %xmm3
1353; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15]
1354; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
1355; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
1356; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
1357; AVX512DQ-BW-FCP-NEXT:    vpor %xmm5, %xmm7, %xmm5
1358; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
1359; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u]
1360; AVX512DQ-BW-FCP-NEXT:    vpor %xmm1, %xmm0, %xmm0
1361; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm4, (%rsi)
1362; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm2, (%rdx)
1363; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm6, (%rcx)
1364; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm3, (%r8)
1365; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm5, (%r9)
1366; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm0, (%rax)
1367; AVX512DQ-BW-FCP-NEXT:    vzeroupper
1368; AVX512DQ-BW-FCP-NEXT:    retq
1369  %wide.vec = load <48 x i8>, ptr %in.vec, align 64
1370  %strided.vec0 = shufflevector <48 x i8> %wide.vec, <48 x i8> poison, <8 x i32> <i32 0, i32 6, i32 12, i32 18, i32 24, i32 30, i32 36, i32 42>
1371  %strided.vec1 = shufflevector <48 x i8> %wide.vec, <48 x i8> poison, <8 x i32> <i32 1, i32 7, i32 13, i32 19, i32 25, i32 31, i32 37, i32 43>
1372  %strided.vec2 = shufflevector <48 x i8> %wide.vec, <48 x i8> poison, <8 x i32> <i32 2, i32 8, i32 14, i32 20, i32 26, i32 32, i32 38, i32 44>
1373  %strided.vec3 = shufflevector <48 x i8> %wide.vec, <48 x i8> poison, <8 x i32> <i32 3, i32 9, i32 15, i32 21, i32 27, i32 33, i32 39, i32 45>
1374  %strided.vec4 = shufflevector <48 x i8> %wide.vec, <48 x i8> poison, <8 x i32> <i32 4, i32 10, i32 16, i32 22, i32 28, i32 34, i32 40, i32 46>
1375  %strided.vec5 = shufflevector <48 x i8> %wide.vec, <48 x i8> poison, <8 x i32> <i32 5, i32 11, i32 17, i32 23, i32 29, i32 35, i32 41, i32 47>
1376  store <8 x i8> %strided.vec0, ptr %out.vec0, align 64
1377  store <8 x i8> %strided.vec1, ptr %out.vec1, align 64
1378  store <8 x i8> %strided.vec2, ptr %out.vec2, align 64
1379  store <8 x i8> %strided.vec3, ptr %out.vec3, align 64
1380  store <8 x i8> %strided.vec4, ptr %out.vec4, align 64
1381  store <8 x i8> %strided.vec5, ptr %out.vec5, align 64
1382  ret void
1383}
1384
1385define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind {
1386; SSE-LABEL: load_i8_stride6_vf16:
1387; SSE:       # %bb.0:
1388; SSE-NEXT:    movdqa 64(%rdi), %xmm10
1389; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1390; SSE-NEXT:    movdqa (%rdi), %xmm5
1391; SSE-NEXT:    movdqa 16(%rdi), %xmm1
1392; SSE-NEXT:    movdqa 32(%rdi), %xmm7
1393; SSE-NEXT:    movdqa 48(%rdi), %xmm6
1394; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,0,65535,65535]
1395; SSE-NEXT:    movdqa %xmm4, %xmm0
1396; SSE-NEXT:    pandn %xmm7, %xmm0
1397; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,0,65535,65535,0]
1398; SSE-NEXT:    movdqa %xmm2, %xmm3
1399; SSE-NEXT:    pandn %xmm6, %xmm3
1400; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1401; SSE-NEXT:    movdqa %xmm4, %xmm3
1402; SSE-NEXT:    pandn %xmm6, %xmm3
1403; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1404; SSE-NEXT:    pand %xmm4, %xmm6
1405; SSE-NEXT:    por %xmm0, %xmm6
1406; SSE-NEXT:    movdqa %xmm6, %xmm0
1407; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1408; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm0[0,3,2,3,4,5,6,7]
1409; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
1410; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
1411; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5]
1412; SSE-NEXT:    packuswb %xmm3, %xmm0
1413; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,0,0,65535,65535]
1414; SSE-NEXT:    movdqa %xmm8, %xmm9
1415; SSE-NEXT:    pandn %xmm0, %xmm9
1416; SSE-NEXT:    movdqa %xmm2, %xmm0
1417; SSE-NEXT:    movdqa %xmm2, %xmm11
1418; SSE-NEXT:    pandn %xmm1, %xmm11
1419; SSE-NEXT:    pand %xmm4, %xmm10
1420; SSE-NEXT:    movdqa %xmm4, %xmm2
1421; SSE-NEXT:    pandn %xmm1, %xmm2
1422; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1423; SSE-NEXT:    movdqa %xmm1, %xmm2
1424; SSE-NEXT:    movdqa %xmm5, %xmm14
1425; SSE-NEXT:    pand %xmm4, %xmm14
1426; SSE-NEXT:    movdqa 80(%rdi), %xmm3
1427; SSE-NEXT:    movdqa %xmm3, %xmm13
1428; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1429; SSE-NEXT:    pand %xmm4, %xmm13
1430; SSE-NEXT:    movdqa %xmm7, %xmm15
1431; SSE-NEXT:    pand %xmm4, %xmm7
1432; SSE-NEXT:    pand %xmm4, %xmm2
1433; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1434; SSE-NEXT:    movdqa %xmm4, %xmm12
1435; SSE-NEXT:    movdqa %xmm4, %xmm2
1436; SSE-NEXT:    pandn %xmm5, %xmm4
1437; SSE-NEXT:    pand %xmm0, %xmm5
1438; SSE-NEXT:    por %xmm11, %xmm5
1439; SSE-NEXT:    pshufd {{.*#+}} xmm11 = xmm5[0,2,1,3]
1440; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
1441; SSE-NEXT:    pand %xmm1, %xmm11
1442; SSE-NEXT:    pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,6,5,6,7]
1443; SSE-NEXT:    pshufd {{.*#+}} xmm11 = xmm11[0,2,1,3]
1444; SSE-NEXT:    pshuflw {{.*#+}} xmm11 = xmm11[0,3,2,1,4,5,6,7]
1445; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,7,6,7]
1446; SSE-NEXT:    packuswb %xmm0, %xmm0
1447; SSE-NEXT:    pand %xmm8, %xmm0
1448; SSE-NEXT:    por %xmm9, %xmm0
1449; SSE-NEXT:    pandn %xmm3, %xmm12
1450; SSE-NEXT:    por %xmm12, %xmm10
1451; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm10[3,1,2,0]
1452; SSE-NEXT:    pand %xmm1, %xmm9
1453; SSE-NEXT:    movdqa %xmm1, %xmm3
1454; SSE-NEXT:    pshuflw {{.*#+}} xmm9 = xmm9[2,1,2,3,4,5,6,7]
1455; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,3,2,0]
1456; SSE-NEXT:    pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,5]
1457; SSE-NEXT:    packuswb %xmm9, %xmm9
1458; SSE-NEXT:    movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
1459; SSE-NEXT:    movdqa %xmm11, %xmm12
1460; SSE-NEXT:    pandn %xmm9, %xmm12
1461; SSE-NEXT:    pand %xmm11, %xmm0
1462; SSE-NEXT:    por %xmm0, %xmm12
1463; SSE-NEXT:    pxor %xmm9, %xmm9
1464; SSE-NEXT:    movdqa %xmm6, %xmm0
1465; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15]
1466; SSE-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7]
1467; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm6[2,2,3,3]
1468; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1469; SSE-NEXT:    psrld $16, %xmm0
1470; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3]
1471; SSE-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7]
1472; SSE-NEXT:    punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3]
1473; SSE-NEXT:    packuswb %xmm6, %xmm1
1474; SSE-NEXT:    movdqa %xmm5, %xmm0
1475; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15]
1476; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
1477; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
1478; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7]
1479; SSE-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7]
1480; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
1481; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3]
1482; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[1,3,2,0,4,5,6,7]
1483; SSE-NEXT:    movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,0,0,65535,65535]
1484; SSE-NEXT:    pand %xmm6, %xmm5
1485; SSE-NEXT:    pandn %xmm0, %xmm6
1486; SSE-NEXT:    por %xmm5, %xmm6
1487; SSE-NEXT:    packuswb %xmm6, %xmm6
1488; SSE-NEXT:    pand %xmm8, %xmm6
1489; SSE-NEXT:    pandn %xmm1, %xmm8
1490; SSE-NEXT:    por %xmm8, %xmm6
1491; SSE-NEXT:    movdqa %xmm10, %xmm0
1492; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
1493; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
1494; SSE-NEXT:    punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
1495; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm10[3,1,2,3,4,5,6,7]
1496; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
1497; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,4]
1498; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,0,65535]
1499; SSE-NEXT:    pand %xmm5, %xmm1
1500; SSE-NEXT:    pandn %xmm0, %xmm5
1501; SSE-NEXT:    por %xmm1, %xmm5
1502; SSE-NEXT:    packuswb %xmm5, %xmm0
1503; SSE-NEXT:    movdqa %xmm11, %xmm10
1504; SSE-NEXT:    pandn %xmm0, %xmm10
1505; SSE-NEXT:    pand %xmm11, %xmm6
1506; SSE-NEXT:    por %xmm6, %xmm10
1507; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm15
1508; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
1509; SSE-NEXT:    movdqa %xmm15, %xmm0
1510; SSE-NEXT:    pand %xmm3, %xmm0
1511; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,7,6,7]
1512; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1513; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7]
1514; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
1515; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
1516; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6]
1517; SSE-NEXT:    packuswb %xmm1, %xmm0
1518; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
1519; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm14[2,1,2,3,4,5,6,7]
1520; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
1521; SSE-NEXT:    pand %xmm3, %xmm1
1522; SSE-NEXT:    movdqa %xmm3, %xmm8
1523; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
1524; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7]
1525; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
1526; SSE-NEXT:    packuswb %xmm1, %xmm1
1527; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255]
1528; SSE-NEXT:    movdqa %xmm3, %xmm5
1529; SSE-NEXT:    pandn %xmm1, %xmm5
1530; SSE-NEXT:    pand %xmm3, %xmm0
1531; SSE-NEXT:    por %xmm0, %xmm5
1532; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
1533; SSE-NEXT:    pandn %xmm6, %xmm2
1534; SSE-NEXT:    por %xmm2, %xmm13
1535; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm13[0,3,2,3,4,5,6,7]
1536; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
1537; SSE-NEXT:    pand %xmm8, %xmm0
1538; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
1539; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
1540; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
1541; SSE-NEXT:    packuswb %xmm0, %xmm0
1542; SSE-NEXT:    movdqa %xmm11, %xmm8
1543; SSE-NEXT:    pandn %xmm0, %xmm8
1544; SSE-NEXT:    pand %xmm11, %xmm5
1545; SSE-NEXT:    por %xmm5, %xmm8
1546; SSE-NEXT:    movdqa %xmm15, %xmm0
1547; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
1548; SSE-NEXT:    punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm9[8],xmm15[9],xmm9[9],xmm15[10],xmm9[10],xmm15[11],xmm9[11],xmm15[12],xmm9[12],xmm15[13],xmm9[13],xmm15[14],xmm9[14],xmm15[15],xmm9[15]
1549; SSE-NEXT:    movdqa %xmm15, %xmm1
1550; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
1551; SSE-NEXT:    movaps %xmm0, %xmm2
1552; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2]
1553; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm15[0,0]
1554; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm15[2,3]
1555; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,7,5,6,7]
1556; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
1557; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
1558; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
1559; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7]
1560; SSE-NEXT:    packuswb %xmm0, %xmm1
1561; SSE-NEXT:    movdqa %xmm14, %xmm0
1562; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
1563; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
1564; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7]
1565; SSE-NEXT:    punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm9[8],xmm14[9],xmm9[9],xmm14[10],xmm9[10],xmm14[11],xmm9[11],xmm14[12],xmm9[12],xmm14[13],xmm9[13],xmm14[14],xmm9[14],xmm14[15],xmm9[15]
1566; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm14[0,3,2,1]
1567; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7]
1568; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
1569; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [0,65535,65535,0,65535,65535,65535,65535]
1570; SSE-NEXT:    pand %xmm5, %xmm2
1571; SSE-NEXT:    pandn %xmm0, %xmm5
1572; SSE-NEXT:    por %xmm2, %xmm5
1573; SSE-NEXT:    pand %xmm3, %xmm1
1574; SSE-NEXT:    packuswb %xmm5, %xmm5
1575; SSE-NEXT:    pandn %xmm5, %xmm3
1576; SSE-NEXT:    por %xmm1, %xmm3
1577; SSE-NEXT:    movdqa %xmm13, %xmm0
1578; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15]
1579; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
1580; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,5]
1581; SSE-NEXT:    punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3],xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7]
1582; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm13[0,2,0,3]
1583; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7]
1584; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,0,65535,65535,0]
1585; SSE-NEXT:    pand %xmm2, %xmm1
1586; SSE-NEXT:    pandn %xmm0, %xmm2
1587; SSE-NEXT:    por %xmm1, %xmm2
1588; SSE-NEXT:    pand %xmm11, %xmm3
1589; SSE-NEXT:    packuswb %xmm2, %xmm0
1590; SSE-NEXT:    pandn %xmm0, %xmm11
1591; SSE-NEXT:    por %xmm3, %xmm11
1592; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
1593; SSE-NEXT:    movdqa %xmm7, %xmm0
1594; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
1595; SSE-NEXT:    pand %xmm5, %xmm0
1596; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,1,2,3]
1597; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
1598; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
1599; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
1600; SSE-NEXT:    packuswb %xmm1, %xmm0
1601; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255]
1602; SSE-NEXT:    movdqa %xmm3, %xmm1
1603; SSE-NEXT:    pandn %xmm0, %xmm1
1604; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
1605; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[3,1,2,0]
1606; SSE-NEXT:    pand %xmm5, %xmm0
1607; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
1608; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
1609; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm0[2,1,0,3,4,5,6,7]
1610; SSE-NEXT:    packuswb %xmm2, %xmm2
1611; SSE-NEXT:    pand %xmm3, %xmm2
1612; SSE-NEXT:    por %xmm1, %xmm2
1613; SSE-NEXT:    movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,0,65535,65535,0]
1614; SSE-NEXT:    movdqa %xmm6, %xmm1
1615; SSE-NEXT:    pand %xmm13, %xmm1
1616; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
1617; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0]
1618; SSE-NEXT:    pand %xmm0, %xmm2
1619; SSE-NEXT:    por %xmm1, %xmm13
1620; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm13[0,2,1,3]
1621; SSE-NEXT:    pand %xmm5, %xmm1
1622; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7]
1623; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
1624; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
1625; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
1626; SSE-NEXT:    packuswb %xmm1, %xmm1
1627; SSE-NEXT:    movdqa %xmm0, %xmm6
1628; SSE-NEXT:    pandn %xmm1, %xmm6
1629; SSE-NEXT:    por %xmm2, %xmm6
1630; SSE-NEXT:    movdqa %xmm7, %xmm1
1631; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15]
1632; SSE-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7]
1633; SSE-NEXT:    movdqa %xmm7, %xmm2
1634; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[1,0],xmm1[0,0]
1635; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[2,0],xmm1[2,3]
1636; SSE-NEXT:    psrlq $48, %xmm1
1637; SSE-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1638; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
1639; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm7[3,1,2,3,4,5,6,7]
1640; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
1641; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,7]
1642; SSE-NEXT:    packuswb %xmm2, %xmm1
1643; SSE-NEXT:    movdqa %xmm4, %xmm2
1644; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15]
1645; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
1646; SSE-NEXT:    pshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,5,5,5,5]
1647; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,0,65535,65535,65535]
1648; SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
1649; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7]
1650; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
1651; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,1,1,2,4,5,6,7]
1652; SSE-NEXT:    pand %xmm2, %xmm4
1653; SSE-NEXT:    pandn %xmm5, %xmm2
1654; SSE-NEXT:    por %xmm4, %xmm2
1655; SSE-NEXT:    packuswb %xmm2, %xmm2
1656; SSE-NEXT:    pand %xmm3, %xmm2
1657; SSE-NEXT:    pandn %xmm1, %xmm3
1658; SSE-NEXT:    por %xmm3, %xmm2
1659; SSE-NEXT:    movdqa %xmm13, %xmm1
1660; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
1661; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,3,1,1]
1662; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
1663; SSE-NEXT:    punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15]
1664; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,0,65535,0,0]
1665; SSE-NEXT:    pand %xmm3, %xmm1
1666; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm13[0,1,2,3,7,5,6,7]
1667; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0]
1668; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,7,4]
1669; SSE-NEXT:    pandn %xmm4, %xmm3
1670; SSE-NEXT:    por %xmm1, %xmm3
1671; SSE-NEXT:    pand %xmm0, %xmm2
1672; SSE-NEXT:    packuswb %xmm3, %xmm1
1673; SSE-NEXT:    pandn %xmm1, %xmm0
1674; SSE-NEXT:    por %xmm2, %xmm0
1675; SSE-NEXT:    movdqa %xmm12, (%rsi)
1676; SSE-NEXT:    movdqa %xmm10, (%rdx)
1677; SSE-NEXT:    movdqa %xmm8, (%rcx)
1678; SSE-NEXT:    movdqa %xmm11, (%r8)
1679; SSE-NEXT:    movdqa %xmm6, (%r9)
1680; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1681; SSE-NEXT:    movdqa %xmm0, (%rax)
1682; SSE-NEXT:    retq
1683;
1684; AVX-LABEL: load_i8_stride6_vf16:
1685; AVX:       # %bb.0:
1686; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1687; AVX-NEXT:    vmovdqa (%rdi), %xmm1
1688; AVX-NEXT:    vmovdqa 16(%rdi), %xmm2
1689; AVX-NEXT:    vmovdqa 32(%rdi), %xmm0
1690; AVX-NEXT:    vmovdqa 48(%rdi), %xmm3
1691; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[u,u,4,10,u,u,u,u,u,u,u,u,u,u,u,u]
1692; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm3[u,u,u,u,0,6,12,u,u,u,u,u,u,u,u,u]
1693; AVX-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
1694; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm2[2,8,14,u,u,u,u,u,u,u,u,u,u]
1695; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u]
1696; AVX-NEXT:    vpor %xmm5, %xmm6, %xmm5
1697; AVX-NEXT:    vpblendw {{.*#+}} xmm6 = xmm5[0,1,2],xmm4[3,4,5],xmm5[6,7]
1698; AVX-NEXT:    vmovdqa 80(%rdi), %xmm4
1699; AVX-NEXT:    vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[4,10]
1700; AVX-NEXT:    vmovdqa 64(%rdi), %xmm5
1701; AVX-NEXT:    vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero
1702; AVX-NEXT:    vpor %xmm7, %xmm8, %xmm7
1703; AVX-NEXT:    vpmovsxdq {{.*#+}} xmm9 = [18446744073709551615,16777215]
1704; AVX-NEXT:    vpblendvb %xmm9, %xmm6, %xmm7, %xmm6
1705; AVX-NEXT:    vpshufb {{.*#+}} xmm7 = xmm0[u,u,5,11,u,u,u,u,u,u,u,u,u,u,u,u]
1706; AVX-NEXT:    vpshufb {{.*#+}} xmm8 = xmm3[u,u,u,u,1,7,13,u,u,u,u,u,u,u,u,u]
1707; AVX-NEXT:    vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
1708; AVX-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm2[3,9,15,u,u,u,u,u,u,u,u,u,u]
1709; AVX-NEXT:    vpshufb {{.*#+}} xmm10 = xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u]
1710; AVX-NEXT:    vpor %xmm8, %xmm10, %xmm8
1711; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4,5],xmm8[6,7]
1712; AVX-NEXT:    vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[5,11]
1713; AVX-NEXT:    vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero
1714; AVX-NEXT:    vpor %xmm8, %xmm10, %xmm8
1715; AVX-NEXT:    vpblendvb %xmm9, %xmm7, %xmm8, %xmm7
1716; AVX-NEXT:    vpshufb {{.*#+}} xmm8 = xmm3[2,8,14,u,u,u,u,u,u,u,u,u,u,u,u,u]
1717; AVX-NEXT:    vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,0,6,12,u,u,u,u,u,u,u,u]
1718; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm8 = xmm10[0],xmm8[0]
1719; AVX-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u,u,u,u]
1720; AVX-NEXT:    vpshufb {{.*#+}} xmm11 = xmm1[2,8,14],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
1721; AVX-NEXT:    vpor %xmm10, %xmm11, %xmm10
1722; AVX-NEXT:    vmovdqa {{.*#+}} xmm11 = [0,0,0,0,0,255,255,255,255,255,255,u,u,u,u,u]
1723; AVX-NEXT:    vpblendvb %xmm11, %xmm8, %xmm10, %xmm8
1724; AVX-NEXT:    vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero
1725; AVX-NEXT:    vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm4[0,6,12]
1726; AVX-NEXT:    vpor %xmm10, %xmm12, %xmm10
1727; AVX-NEXT:    vpblendvb %xmm9, %xmm8, %xmm10, %xmm8
1728; AVX-NEXT:    vpshufb {{.*#+}} xmm10 = xmm3[3,9,15,u,u,u,u,u,u,u,u,u,u,u,u,u]
1729; AVX-NEXT:    vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,1,7,13,u,u,u,u,u,u,u,u]
1730; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm10 = xmm12[0],xmm10[0]
1731; AVX-NEXT:    vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u,u,u,u]
1732; AVX-NEXT:    vpshufb {{.*#+}} xmm13 = xmm1[3,9,15],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
1733; AVX-NEXT:    vpor %xmm12, %xmm13, %xmm12
1734; AVX-NEXT:    vpblendvb %xmm11, %xmm10, %xmm12, %xmm10
1735; AVX-NEXT:    vpshufb {{.*#+}} xmm11 = xmm5[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero
1736; AVX-NEXT:    vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm4[1,7,13]
1737; AVX-NEXT:    vpor %xmm11, %xmm12, %xmm11
1738; AVX-NEXT:    vpblendvb %xmm9, %xmm10, %xmm11, %xmm9
1739; AVX-NEXT:    vpshufb {{.*#+}} xmm10 = xmm1[4,10],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
1740; AVX-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u,u,u,u]
1741; AVX-NEXT:    vpor %xmm10, %xmm11, %xmm10
1742; AVX-NEXT:    vpshufb {{.*#+}} xmm11 = xmm3[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u]
1743; AVX-NEXT:    vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14]
1744; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm11 = xmm12[1],xmm11[1]
1745; AVX-NEXT:    vmovq {{.*#+}} xmm12 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
1746; AVX-NEXT:    vpblendvb %xmm12, %xmm10, %xmm11, %xmm10
1747; AVX-NEXT:    vpshufb {{.*#+}} xmm11 = xmm4[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[2,8,14]
1748; AVX-NEXT:    vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,u,u,u,u,0,6,12],zero,zero,zero
1749; AVX-NEXT:    vpor %xmm11, %xmm13, %xmm11
1750; AVX-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm11[5,6,7]
1751; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[5,11],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
1752; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u,u,u,u]
1753; AVX-NEXT:    vpor %xmm1, %xmm2, %xmm1
1754; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,5,11,u,u,u,u,u,u]
1755; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,3,9,15]
1756; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1]
1757; AVX-NEXT:    vpblendvb %xmm12, %xmm1, %xmm0, %xmm0
1758; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[3,9,15]
1759; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,u,u,u,u,u,u,1,7,13],zero,zero,zero
1760; AVX-NEXT:    vpor %xmm1, %xmm2, %xmm1
1761; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
1762; AVX-NEXT:    vmovdqa %xmm6, (%rsi)
1763; AVX-NEXT:    vmovdqa %xmm7, (%rdx)
1764; AVX-NEXT:    vmovdqa %xmm8, (%rcx)
1765; AVX-NEXT:    vmovdqa %xmm9, (%r8)
1766; AVX-NEXT:    vmovdqa %xmm10, (%r9)
1767; AVX-NEXT:    vmovdqa %xmm0, (%rax)
1768; AVX-NEXT:    retq
1769;
1770; AVX2-LABEL: load_i8_stride6_vf16:
1771; AVX2:       # %bb.0:
1772; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1773; AVX2-NEXT:    vmovdqa (%rdi), %ymm3
1774; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm4
1775; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
1776; AVX2-NEXT:    vpblendvb %ymm0, %ymm3, %ymm4, %ymm5
1777; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm5[0,6,12],zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u]
1778; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm6
1779; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm6[2,8,14],zero,zero,xmm6[0,6,12,u,u,u,u,u]
1780; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm2
1781; AVX2-NEXT:    vmovdqa 80(%rdi), %xmm0
1782; AVX2-NEXT:    vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[4,10]
1783; AVX2-NEXT:    vmovdqa 64(%rdi), %xmm1
1784; AVX2-NEXT:    vpshufb {{.*#+}} xmm8 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero
1785; AVX2-NEXT:    vpor %xmm7, %xmm8, %xmm7
1786; AVX2-NEXT:    vpmovsxdq {{.*#+}} xmm8 = [18446744073709551615,16777215]
1787; AVX2-NEXT:    vpblendvb %xmm8, %xmm2, %xmm7, %xmm2
1788; AVX2-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u]
1789; AVX2-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,9,15],zero,zero,xmm6[1,7,13,u,u,u,u,u]
1790; AVX2-NEXT:    vpor %xmm5, %xmm6, %xmm5
1791; AVX2-NEXT:    vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,11]
1792; AVX2-NEXT:    vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero
1793; AVX2-NEXT:    vpor %xmm6, %xmm7, %xmm6
1794; AVX2-NEXT:    vpblendvb %xmm8, %xmm5, %xmm6, %xmm5
1795; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
1796; AVX2-NEXT:    vpblendvb %ymm6, %ymm4, %ymm3, %ymm6
1797; AVX2-NEXT:    vextracti128 $1, %ymm6, %xmm7
1798; AVX2-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[2,8,14,u,u,u,u,u]
1799; AVX2-NEXT:    vpshufb {{.*#+}} xmm10 = xmm6[2,8,14],zero,zero,xmm6[0,6,12],zero,zero,zero,xmm6[u,u,u,u,u]
1800; AVX2-NEXT:    vpor %xmm9, %xmm10, %xmm9
1801; AVX2-NEXT:    vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero
1802; AVX2-NEXT:    vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,6,12]
1803; AVX2-NEXT:    vpor %xmm10, %xmm11, %xmm10
1804; AVX2-NEXT:    vpblendvb %xmm8, %xmm9, %xmm10, %xmm9
1805; AVX2-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[3,9,15,u,u,u,u,u]
1806; AVX2-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[3,9,15],zero,zero,xmm6[1,7,13],zero,zero,zero,xmm6[u,u,u,u,u]
1807; AVX2-NEXT:    vpor %xmm7, %xmm6, %xmm6
1808; AVX2-NEXT:    vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero
1809; AVX2-NEXT:    vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,7,13]
1810; AVX2-NEXT:    vpor %xmm7, %xmm10, %xmm7
1811; AVX2-NEXT:    vpblendvb %xmm8, %xmm6, %xmm7, %xmm6
1812; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0]
1813; AVX2-NEXT:    vpblendvb %ymm7, %ymm4, %ymm3, %ymm3
1814; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4
1815; AVX2-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[4,10,u,u,u,u,u,u]
1816; AVX2-NEXT:    vpshufb {{.*#+}} xmm8 = xmm3[4,10],zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u]
1817; AVX2-NEXT:    vpor %xmm7, %xmm8, %xmm7
1818; AVX2-NEXT:    vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[2,8,14]
1819; AVX2-NEXT:    vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,u,u,u,u,0,6,12],zero,zero,zero
1820; AVX2-NEXT:    vpor %xmm8, %xmm10, %xmm8
1821; AVX2-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm8[5,6,7]
1822; AVX2-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,7,13],zero,zero,zero,xmm4[5,11,u,u,u,u,u,u]
1823; AVX2-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[5,11],zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u]
1824; AVX2-NEXT:    vpor %xmm4, %xmm3, %xmm3
1825; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[3,9,15]
1826; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,1,7,13],zero,zero,zero
1827; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
1828; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7]
1829; AVX2-NEXT:    vmovdqa %xmm2, (%rsi)
1830; AVX2-NEXT:    vmovdqa %xmm5, (%rdx)
1831; AVX2-NEXT:    vmovdqa %xmm9, (%rcx)
1832; AVX2-NEXT:    vmovdqa %xmm6, (%r8)
1833; AVX2-NEXT:    vmovdqa %xmm7, (%r9)
1834; AVX2-NEXT:    vmovdqa %xmm0, (%rax)
1835; AVX2-NEXT:    vzeroupper
1836; AVX2-NEXT:    retq
1837;
1838; AVX2-FP-LABEL: load_i8_stride6_vf16:
1839; AVX2-FP:       # %bb.0:
1840; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1841; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm3
1842; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm4
1843; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
1844; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm3, %ymm4, %ymm5
1845; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm5[0,6,12],zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u]
1846; AVX2-FP-NEXT:    vextracti128 $1, %ymm5, %xmm6
1847; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm6[2,8,14],zero,zero,xmm6[0,6,12,u,u,u,u,u]
1848; AVX2-FP-NEXT:    vpor %xmm0, %xmm1, %xmm2
1849; AVX2-FP-NEXT:    vmovdqa 80(%rdi), %xmm0
1850; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[4,10]
1851; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %xmm1
1852; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero
1853; AVX2-FP-NEXT:    vpor %xmm7, %xmm8, %xmm7
1854; AVX2-FP-NEXT:    vpmovsxdq {{.*#+}} xmm8 = [18446744073709551615,16777215]
1855; AVX2-FP-NEXT:    vpblendvb %xmm8, %xmm2, %xmm7, %xmm2
1856; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u]
1857; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,9,15],zero,zero,xmm6[1,7,13,u,u,u,u,u]
1858; AVX2-FP-NEXT:    vpor %xmm5, %xmm6, %xmm5
1859; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,11]
1860; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero
1861; AVX2-FP-NEXT:    vpor %xmm6, %xmm7, %xmm6
1862; AVX2-FP-NEXT:    vpblendvb %xmm8, %xmm5, %xmm6, %xmm5
1863; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
1864; AVX2-FP-NEXT:    vpblendvb %ymm6, %ymm4, %ymm3, %ymm6
1865; AVX2-FP-NEXT:    vextracti128 $1, %ymm6, %xmm7
1866; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[2,8,14,u,u,u,u,u]
1867; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm6[2,8,14],zero,zero,xmm6[0,6,12],zero,zero,zero,xmm6[u,u,u,u,u]
1868; AVX2-FP-NEXT:    vpor %xmm9, %xmm10, %xmm9
1869; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero
1870; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,6,12]
1871; AVX2-FP-NEXT:    vpor %xmm10, %xmm11, %xmm10
1872; AVX2-FP-NEXT:    vpblendvb %xmm8, %xmm9, %xmm10, %xmm9
1873; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[3,9,15,u,u,u,u,u]
1874; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[3,9,15],zero,zero,xmm6[1,7,13],zero,zero,zero,xmm6[u,u,u,u,u]
1875; AVX2-FP-NEXT:    vpor %xmm7, %xmm6, %xmm6
1876; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero
1877; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,7,13]
1878; AVX2-FP-NEXT:    vpor %xmm7, %xmm10, %xmm7
1879; AVX2-FP-NEXT:    vpblendvb %xmm8, %xmm6, %xmm7, %xmm6
1880; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0]
1881; AVX2-FP-NEXT:    vpblendvb %ymm7, %ymm4, %ymm3, %ymm3
1882; AVX2-FP-NEXT:    vextracti128 $1, %ymm3, %xmm4
1883; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[4,10,u,u,u,u,u,u]
1884; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm3[4,10],zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u]
1885; AVX2-FP-NEXT:    vpor %xmm7, %xmm8, %xmm7
1886; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[2,8,14]
1887; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,u,u,u,u,0,6,12],zero,zero,zero
1888; AVX2-FP-NEXT:    vpor %xmm8, %xmm10, %xmm8
1889; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm8[5,6,7]
1890; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,7,13],zero,zero,zero,xmm4[5,11,u,u,u,u,u,u]
1891; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[5,11],zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u]
1892; AVX2-FP-NEXT:    vpor %xmm4, %xmm3, %xmm3
1893; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[3,9,15]
1894; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,1,7,13],zero,zero,zero
1895; AVX2-FP-NEXT:    vpor %xmm0, %xmm1, %xmm0
1896; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7]
1897; AVX2-FP-NEXT:    vmovdqa %xmm2, (%rsi)
1898; AVX2-FP-NEXT:    vmovdqa %xmm5, (%rdx)
1899; AVX2-FP-NEXT:    vmovdqa %xmm9, (%rcx)
1900; AVX2-FP-NEXT:    vmovdqa %xmm6, (%r8)
1901; AVX2-FP-NEXT:    vmovdqa %xmm7, (%r9)
1902; AVX2-FP-NEXT:    vmovdqa %xmm0, (%rax)
1903; AVX2-FP-NEXT:    vzeroupper
1904; AVX2-FP-NEXT:    retq
1905;
1906; AVX2-FCP-LABEL: load_i8_stride6_vf16:
1907; AVX2-FCP:       # %bb.0:
1908; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1909; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm3
1910; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm4
1911; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
1912; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm3, %ymm4, %ymm5
1913; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm5[0,6,12],zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u]
1914; AVX2-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm6
1915; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm6[2,8,14],zero,zero,xmm6[0,6,12,u,u,u,u,u]
1916; AVX2-FCP-NEXT:    vpor %xmm0, %xmm1, %xmm2
1917; AVX2-FCP-NEXT:    vmovdqa 80(%rdi), %xmm0
1918; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[4,10]
1919; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %xmm1
1920; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero
1921; AVX2-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm7
1922; AVX2-FCP-NEXT:    vpmovsxdq {{.*#+}} xmm8 = [18446744073709551615,16777215]
1923; AVX2-FCP-NEXT:    vpblendvb %xmm8, %xmm2, %xmm7, %xmm2
1924; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u]
1925; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,9,15],zero,zero,xmm6[1,7,13,u,u,u,u,u]
1926; AVX2-FCP-NEXT:    vpor %xmm5, %xmm6, %xmm5
1927; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,11]
1928; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero
1929; AVX2-FCP-NEXT:    vpor %xmm6, %xmm7, %xmm6
1930; AVX2-FCP-NEXT:    vpblendvb %xmm8, %xmm5, %xmm6, %xmm5
1931; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
1932; AVX2-FCP-NEXT:    vpblendvb %ymm6, %ymm4, %ymm3, %ymm6
1933; AVX2-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm7
1934; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[2,8,14,u,u,u,u,u]
1935; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm6[2,8,14],zero,zero,xmm6[0,6,12],zero,zero,zero,xmm6[u,u,u,u,u]
1936; AVX2-FCP-NEXT:    vpor %xmm9, %xmm10, %xmm9
1937; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero
1938; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,6,12]
1939; AVX2-FCP-NEXT:    vpor %xmm10, %xmm11, %xmm10
1940; AVX2-FCP-NEXT:    vpblendvb %xmm8, %xmm9, %xmm10, %xmm9
1941; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[3,9,15,u,u,u,u,u]
1942; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[3,9,15],zero,zero,xmm6[1,7,13],zero,zero,zero,xmm6[u,u,u,u,u]
1943; AVX2-FCP-NEXT:    vpor %xmm7, %xmm6, %xmm6
1944; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero
1945; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,7,13]
1946; AVX2-FCP-NEXT:    vpor %xmm7, %xmm10, %xmm7
1947; AVX2-FCP-NEXT:    vpblendvb %xmm8, %xmm6, %xmm7, %xmm6
1948; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0]
1949; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm4, %ymm3, %ymm3
1950; AVX2-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm4
1951; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[4,10,u,u,u,u,u,u]
1952; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm3[4,10],zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u]
1953; AVX2-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm7
1954; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[2,8,14]
1955; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,u,u,u,u,0,6,12],zero,zero,zero
1956; AVX2-FCP-NEXT:    vpor %xmm8, %xmm10, %xmm8
1957; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm8[5,6,7]
1958; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,7,13],zero,zero,zero,xmm4[5,11,u,u,u,u,u,u]
1959; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[5,11],zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u]
1960; AVX2-FCP-NEXT:    vpor %xmm4, %xmm3, %xmm3
1961; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[3,9,15]
1962; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,1,7,13],zero,zero,zero
1963; AVX2-FCP-NEXT:    vpor %xmm0, %xmm1, %xmm0
1964; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7]
1965; AVX2-FCP-NEXT:    vmovdqa %xmm2, (%rsi)
1966; AVX2-FCP-NEXT:    vmovdqa %xmm5, (%rdx)
1967; AVX2-FCP-NEXT:    vmovdqa %xmm9, (%rcx)
1968; AVX2-FCP-NEXT:    vmovdqa %xmm6, (%r8)
1969; AVX2-FCP-NEXT:    vmovdqa %xmm7, (%r9)
1970; AVX2-FCP-NEXT:    vmovdqa %xmm0, (%rax)
1971; AVX2-FCP-NEXT:    vzeroupper
1972; AVX2-FCP-NEXT:    retq
1973;
1974; AVX512-LABEL: load_i8_stride6_vf16:
1975; AVX512:       # %bb.0:
1976; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1977; AVX512-NEXT:    vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
1978; AVX512-NEXT:    vmovdqa (%rdi), %ymm3
1979; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm4
1980; AVX512-NEXT:    vmovdqa %ymm0, %ymm5
1981; AVX512-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm4 ^ (ymm5 & (ymm3 ^ ymm4))
1982; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm5[0,6,12],zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u]
1983; AVX512-NEXT:    vextracti128 $1, %ymm5, %xmm6
1984; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,8,14],zero,zero,xmm6[0,6,12,u,u,u,u,u]
1985; AVX512-NEXT:    vpor %xmm1, %xmm2, %xmm7
1986; AVX512-NEXT:    vmovdqa 80(%rdi), %xmm2
1987; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[4,10]
1988; AVX512-NEXT:    vmovdqa 64(%rdi), %xmm1
1989; AVX512-NEXT:    vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero
1990; AVX512-NEXT:    vpor %xmm8, %xmm9, %xmm8
1991; AVX512-NEXT:    vpmovsxdq {{.*#+}} xmm9 = [18446744073709551615,16777215]
1992; AVX512-NEXT:    vpternlogq {{.*#+}} xmm8 = xmm8 ^ (xmm9 & (xmm8 ^ xmm7))
1993; AVX512-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u]
1994; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,9,15],zero,zero,xmm6[1,7,13,u,u,u,u,u]
1995; AVX512-NEXT:    vpor %xmm5, %xmm6, %xmm5
1996; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,11]
1997; AVX512-NEXT:    vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero
1998; AVX512-NEXT:    vpor %xmm6, %xmm7, %xmm6
1999; AVX512-NEXT:    vpternlogq {{.*#+}} xmm6 = xmm6 ^ (xmm9 & (xmm6 ^ xmm5))
2000; AVX512-NEXT:    vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero
2001; AVX512-NEXT:    vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,6,12]
2002; AVX512-NEXT:    vpor %xmm5, %xmm7, %xmm5
2003; AVX512-NEXT:    vmovdqa {{.*#+}} ymm7 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
2004; AVX512-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm3 ^ (ymm7 & (ymm4 ^ ymm3))
2005; AVX512-NEXT:    vextracti128 $1, %ymm7, %xmm10
2006; AVX512-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm10[4,10],zero,zero,zero,xmm10[2,8,14,u,u,u,u,u]
2007; AVX512-NEXT:    vpshufb {{.*#+}} xmm12 = xmm7[2,8,14],zero,zero,xmm7[0,6,12],zero,zero,zero,xmm7[u,u,u,u,u]
2008; AVX512-NEXT:    vpor %xmm11, %xmm12, %xmm11
2009; AVX512-NEXT:    vpternlogq {{.*#+}} xmm11 = xmm5 ^ (xmm9 & (xmm11 ^ xmm5))
2010; AVX512-NEXT:    vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero
2011; AVX512-NEXT:    vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,7,13]
2012; AVX512-NEXT:    vpor %xmm5, %xmm12, %xmm5
2013; AVX512-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[5,11],zero,zero,zero,xmm10[3,9,15,u,u,u,u,u]
2014; AVX512-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[3,9,15],zero,zero,xmm7[1,7,13],zero,zero,zero,xmm7[u,u,u,u,u]
2015; AVX512-NEXT:    vpor %xmm7, %xmm10, %xmm7
2016; AVX512-NEXT:    vpternlogq {{.*#+}} xmm7 = xmm5 ^ (xmm9 & (xmm7 ^ xmm5))
2017; AVX512-NEXT:    vmovdqa {{.*#+}} xmm5 = [128,128,0,6,12,128,128,128,4,10,128,128,128,2,8,14]
2018; AVX512-NEXT:    vpshufb %xmm5, %xmm2, %xmm9
2019; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm3 ^ (ymm0 & (ymm4 ^ ymm3))
2020; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm3
2021; AVX512-NEXT:    vpshufb %xmm5, %xmm3, %xmm4
2022; AVX512-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm9[5,6,7]
2023; AVX512-NEXT:    vmovdqa {{.*#+}} xmm5 = [4,10,128,128,128,2,8,14,128,128,0,6,12,128,128,128]
2024; AVX512-NEXT:    vpshufb %xmm5, %xmm1, %xmm9
2025; AVX512-NEXT:    vpshufb %xmm5, %xmm0, %xmm5
2026; AVX512-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm9[5,6,7]
2027; AVX512-NEXT:    vpor %xmm4, %xmm5, %xmm4
2028; AVX512-NEXT:    vmovdqa {{.*#+}} xmm5 = [128,128,1,7,13,128,128,128,5,11,128,128,128,3,9,15]
2029; AVX512-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
2030; AVX512-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
2031; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7]
2032; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = [5,11,128,128,128,3,9,15,128,128,1,7,13,128,128,128]
2033; AVX512-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
2034; AVX512-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
2035; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
2036; AVX512-NEXT:    vpor %xmm2, %xmm0, %xmm0
2037; AVX512-NEXT:    vmovdqa %xmm8, (%rsi)
2038; AVX512-NEXT:    vmovdqa %xmm6, (%rdx)
2039; AVX512-NEXT:    vmovdqa %xmm11, (%rcx)
2040; AVX512-NEXT:    vmovdqa %xmm7, (%r8)
2041; AVX512-NEXT:    vmovdqa %xmm4, (%r9)
2042; AVX512-NEXT:    vmovdqa %xmm0, (%rax)
2043; AVX512-NEXT:    vzeroupper
2044; AVX512-NEXT:    retq
2045;
2046; AVX512-FCP-LABEL: load_i8_stride6_vf16:
2047; AVX512-FCP:       # %bb.0:
2048; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2049; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
2050; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm3
2051; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm4
2052; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm5
2053; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm4 ^ (ymm5 & (ymm3 ^ ymm4))
2054; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm5[0,6,12],zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u]
2055; AVX512-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm6
2056; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,8,14],zero,zero,xmm6[0,6,12,u,u,u,u,u]
2057; AVX512-FCP-NEXT:    vpor %xmm1, %xmm2, %xmm7
2058; AVX512-FCP-NEXT:    vmovdqa 80(%rdi), %xmm2
2059; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[4,10]
2060; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %xmm1
2061; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero
2062; AVX512-FCP-NEXT:    vpor %xmm8, %xmm9, %xmm8
2063; AVX512-FCP-NEXT:    vpmovsxdq {{.*#+}} xmm9 = [18446744073709551615,16777215]
2064; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} xmm8 = xmm8 ^ (xmm9 & (xmm8 ^ xmm7))
2065; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u]
2066; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,9,15],zero,zero,xmm6[1,7,13,u,u,u,u,u]
2067; AVX512-FCP-NEXT:    vpor %xmm5, %xmm6, %xmm5
2068; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,11]
2069; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero
2070; AVX512-FCP-NEXT:    vpor %xmm6, %xmm7, %xmm6
2071; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} xmm6 = xmm6 ^ (xmm9 & (xmm6 ^ xmm5))
2072; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero
2073; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,6,12]
2074; AVX512-FCP-NEXT:    vpor %xmm5, %xmm7, %xmm5
2075; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
2076; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm3 ^ (ymm7 & (ymm4 ^ ymm3))
2077; AVX512-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm10
2078; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm10[4,10],zero,zero,zero,xmm10[2,8,14,u,u,u,u,u]
2079; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm7[2,8,14],zero,zero,xmm7[0,6,12],zero,zero,zero,xmm7[u,u,u,u,u]
2080; AVX512-FCP-NEXT:    vpor %xmm11, %xmm12, %xmm11
2081; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} xmm11 = xmm5 ^ (xmm9 & (xmm11 ^ xmm5))
2082; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero
2083; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,7,13]
2084; AVX512-FCP-NEXT:    vpor %xmm5, %xmm12, %xmm5
2085; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[5,11],zero,zero,zero,xmm10[3,9,15,u,u,u,u,u]
2086; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[3,9,15],zero,zero,xmm7[1,7,13],zero,zero,zero,xmm7[u,u,u,u,u]
2087; AVX512-FCP-NEXT:    vpor %xmm7, %xmm10, %xmm7
2088; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} xmm7 = xmm5 ^ (xmm9 & (xmm7 ^ xmm5))
2089; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [128,128,0,6,12,128,128,128,4,10,128,128,128,2,8,14]
2090; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm2, %xmm9
2091; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm3 ^ (ymm0 & (ymm4 ^ ymm3))
2092; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm3
2093; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm3, %xmm4
2094; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm9[5,6,7]
2095; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [4,10,128,128,128,2,8,14,128,128,0,6,12,128,128,128]
2096; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm1, %xmm9
2097; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm5
2098; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm9[5,6,7]
2099; AVX512-FCP-NEXT:    vpor %xmm4, %xmm5, %xmm4
2100; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [128,128,1,7,13,128,128,128,5,11,128,128,128,3,9,15]
2101; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
2102; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
2103; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7]
2104; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [5,11,128,128,128,3,9,15,128,128,1,7,13,128,128,128]
2105; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
2106; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
2107; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
2108; AVX512-FCP-NEXT:    vpor %xmm2, %xmm0, %xmm0
2109; AVX512-FCP-NEXT:    vmovdqa %xmm8, (%rsi)
2110; AVX512-FCP-NEXT:    vmovdqa %xmm6, (%rdx)
2111; AVX512-FCP-NEXT:    vmovdqa %xmm11, (%rcx)
2112; AVX512-FCP-NEXT:    vmovdqa %xmm7, (%r8)
2113; AVX512-FCP-NEXT:    vmovdqa %xmm4, (%r9)
2114; AVX512-FCP-NEXT:    vmovdqa %xmm0, (%rax)
2115; AVX512-FCP-NEXT:    vzeroupper
2116; AVX512-FCP-NEXT:    retq
2117;
2118; AVX512DQ-LABEL: load_i8_stride6_vf16:
2119; AVX512DQ:       # %bb.0:
2120; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2121; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
2122; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm3
2123; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm4
2124; AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm5
2125; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm4 ^ (ymm5 & (ymm3 ^ ymm4))
2126; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm5[0,6,12],zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u]
2127; AVX512DQ-NEXT:    vextracti128 $1, %ymm5, %xmm6
2128; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,8,14],zero,zero,xmm6[0,6,12,u,u,u,u,u]
2129; AVX512DQ-NEXT:    vpor %xmm1, %xmm2, %xmm7
2130; AVX512DQ-NEXT:    vmovdqa 80(%rdi), %xmm2
2131; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm8 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[4,10]
2132; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %xmm1
2133; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero
2134; AVX512DQ-NEXT:    vpor %xmm8, %xmm9, %xmm8
2135; AVX512DQ-NEXT:    vpmovsxdq {{.*#+}} xmm9 = [18446744073709551615,16777215]
2136; AVX512DQ-NEXT:    vpternlogq {{.*#+}} xmm8 = xmm8 ^ (xmm9 & (xmm8 ^ xmm7))
2137; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u]
2138; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,9,15],zero,zero,xmm6[1,7,13,u,u,u,u,u]
2139; AVX512DQ-NEXT:    vpor %xmm5, %xmm6, %xmm5
2140; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm6 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,11]
2141; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero
2142; AVX512DQ-NEXT:    vpor %xmm6, %xmm7, %xmm6
2143; AVX512DQ-NEXT:    vpternlogq {{.*#+}} xmm6 = xmm6 ^ (xmm9 & (xmm6 ^ xmm5))
2144; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero
2145; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,6,12]
2146; AVX512DQ-NEXT:    vpor %xmm5, %xmm7, %xmm5
2147; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm7 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
2148; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm3 ^ (ymm7 & (ymm4 ^ ymm3))
2149; AVX512DQ-NEXT:    vextracti128 $1, %ymm7, %xmm10
2150; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm10[4,10],zero,zero,zero,xmm10[2,8,14,u,u,u,u,u]
2151; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm12 = xmm7[2,8,14],zero,zero,xmm7[0,6,12],zero,zero,zero,xmm7[u,u,u,u,u]
2152; AVX512DQ-NEXT:    vpor %xmm11, %xmm12, %xmm11
2153; AVX512DQ-NEXT:    vpternlogq {{.*#+}} xmm11 = xmm5 ^ (xmm9 & (xmm11 ^ xmm5))
2154; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero
2155; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,7,13]
2156; AVX512DQ-NEXT:    vpor %xmm5, %xmm12, %xmm5
2157; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[5,11],zero,zero,zero,xmm10[3,9,15,u,u,u,u,u]
2158; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[3,9,15],zero,zero,xmm7[1,7,13],zero,zero,zero,xmm7[u,u,u,u,u]
2159; AVX512DQ-NEXT:    vpor %xmm7, %xmm10, %xmm7
2160; AVX512DQ-NEXT:    vpternlogq {{.*#+}} xmm7 = xmm5 ^ (xmm9 & (xmm7 ^ xmm5))
2161; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm5 = [128,128,0,6,12,128,128,128,4,10,128,128,128,2,8,14]
2162; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm2, %xmm9
2163; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm3 ^ (ymm0 & (ymm4 ^ ymm3))
2164; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm3
2165; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm3, %xmm4
2166; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm9[5,6,7]
2167; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm5 = [4,10,128,128,128,2,8,14,128,128,0,6,12,128,128,128]
2168; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm1, %xmm9
2169; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm0, %xmm5
2170; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm9[5,6,7]
2171; AVX512DQ-NEXT:    vpor %xmm4, %xmm5, %xmm4
2172; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm5 = [128,128,1,7,13,128,128,128,5,11,128,128,128,3,9,15]
2173; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
2174; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
2175; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7]
2176; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm3 = [5,11,128,128,128,3,9,15,128,128,1,7,13,128,128,128]
2177; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
2178; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
2179; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
2180; AVX512DQ-NEXT:    vpor %xmm2, %xmm0, %xmm0
2181; AVX512DQ-NEXT:    vmovdqa %xmm8, (%rsi)
2182; AVX512DQ-NEXT:    vmovdqa %xmm6, (%rdx)
2183; AVX512DQ-NEXT:    vmovdqa %xmm11, (%rcx)
2184; AVX512DQ-NEXT:    vmovdqa %xmm7, (%r8)
2185; AVX512DQ-NEXT:    vmovdqa %xmm4, (%r9)
2186; AVX512DQ-NEXT:    vmovdqa %xmm0, (%rax)
2187; AVX512DQ-NEXT:    vzeroupper
2188; AVX512DQ-NEXT:    retq
2189;
2190; AVX512DQ-FCP-LABEL: load_i8_stride6_vf16:
2191; AVX512DQ-FCP:       # %bb.0:
2192; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2193; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
2194; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm3
2195; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm4
2196; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm5
2197; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm4 ^ (ymm5 & (ymm3 ^ ymm4))
2198; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm5[0,6,12],zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u]
2199; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm6
2200; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,8,14],zero,zero,xmm6[0,6,12,u,u,u,u,u]
2201; AVX512DQ-FCP-NEXT:    vpor %xmm1, %xmm2, %xmm7
2202; AVX512DQ-FCP-NEXT:    vmovdqa 80(%rdi), %xmm2
2203; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[4,10]
2204; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %xmm1
2205; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero
2206; AVX512DQ-FCP-NEXT:    vpor %xmm8, %xmm9, %xmm8
2207; AVX512DQ-FCP-NEXT:    vpmovsxdq {{.*#+}} xmm9 = [18446744073709551615,16777215]
2208; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} xmm8 = xmm8 ^ (xmm9 & (xmm8 ^ xmm7))
2209; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u]
2210; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,9,15],zero,zero,xmm6[1,7,13,u,u,u,u,u]
2211; AVX512DQ-FCP-NEXT:    vpor %xmm5, %xmm6, %xmm5
2212; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,11]
2213; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero
2214; AVX512DQ-FCP-NEXT:    vpor %xmm6, %xmm7, %xmm6
2215; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} xmm6 = xmm6 ^ (xmm9 & (xmm6 ^ xmm5))
2216; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero
2217; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,6,12]
2218; AVX512DQ-FCP-NEXT:    vpor %xmm5, %xmm7, %xmm5
2219; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
2220; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm3 ^ (ymm7 & (ymm4 ^ ymm3))
2221; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm10
2222; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm10[4,10],zero,zero,zero,xmm10[2,8,14,u,u,u,u,u]
2223; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm7[2,8,14],zero,zero,xmm7[0,6,12],zero,zero,zero,xmm7[u,u,u,u,u]
2224; AVX512DQ-FCP-NEXT:    vpor %xmm11, %xmm12, %xmm11
2225; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} xmm11 = xmm5 ^ (xmm9 & (xmm11 ^ xmm5))
2226; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero
2227; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,7,13]
2228; AVX512DQ-FCP-NEXT:    vpor %xmm5, %xmm12, %xmm5
2229; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[5,11],zero,zero,zero,xmm10[3,9,15,u,u,u,u,u]
2230; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[3,9,15],zero,zero,xmm7[1,7,13],zero,zero,zero,xmm7[u,u,u,u,u]
2231; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm10, %xmm7
2232; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} xmm7 = xmm5 ^ (xmm9 & (xmm7 ^ xmm5))
2233; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [128,128,0,6,12,128,128,128,4,10,128,128,128,2,8,14]
2234; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm2, %xmm9
2235; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm3 ^ (ymm0 & (ymm4 ^ ymm3))
2236; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm3
2237; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm3, %xmm4
2238; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm9[5,6,7]
2239; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [4,10,128,128,128,2,8,14,128,128,0,6,12,128,128,128]
2240; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm1, %xmm9
2241; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm5
2242; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm9[5,6,7]
2243; AVX512DQ-FCP-NEXT:    vpor %xmm4, %xmm5, %xmm4
2244; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [128,128,1,7,13,128,128,128,5,11,128,128,128,3,9,15]
2245; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
2246; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
2247; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7]
2248; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [5,11,128,128,128,3,9,15,128,128,1,7,13,128,128,128]
2249; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
2250; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
2251; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
2252; AVX512DQ-FCP-NEXT:    vpor %xmm2, %xmm0, %xmm0
2253; AVX512DQ-FCP-NEXT:    vmovdqa %xmm8, (%rsi)
2254; AVX512DQ-FCP-NEXT:    vmovdqa %xmm6, (%rdx)
2255; AVX512DQ-FCP-NEXT:    vmovdqa %xmm11, (%rcx)
2256; AVX512DQ-FCP-NEXT:    vmovdqa %xmm7, (%r8)
2257; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, (%r9)
2258; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, (%rax)
2259; AVX512DQ-FCP-NEXT:    vzeroupper
2260; AVX512DQ-FCP-NEXT:    retq
2261;
2262; AVX512BW-LABEL: load_i8_stride6_vf16:
2263; AVX512BW:       # %bb.0:
2264; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2265; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm1
2266; AVX512BW-NEXT:    vmovdqa 32(%rdi), %ymm0
2267; AVX512BW-NEXT:    movw $18724, %r10w # imm = 0x4924
2268; AVX512BW-NEXT:    kmovd %r10d, %k1
2269; AVX512BW-NEXT:    vpblendmw %ymm0, %ymm1, %ymm2 {%k1}
2270; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10],zero,zero,zero,xmm2[u,u,u,u,u]
2271; AVX512BW-NEXT:    vextracti128 $1, %ymm2, %xmm4
2272; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm4[2,8,14],zero,zero,xmm4[0,6,12,u,u,u,u,u]
2273; AVX512BW-NEXT:    vpor %xmm3, %xmm5, %xmm3
2274; AVX512BW-NEXT:    vmovdqa 80(%rdi), %xmm5
2275; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[4,10]
2276; AVX512BW-NEXT:    vmovdqa 64(%rdi), %xmm7
2277; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero
2278; AVX512BW-NEXT:    vpor %xmm6, %xmm8, %xmm6
2279; AVX512BW-NEXT:    movw $-2048, %di # imm = 0xF800
2280; AVX512BW-NEXT:    kmovd %edi, %k2
2281; AVX512BW-NEXT:    vmovdqu8 %xmm6, %xmm3 {%k2}
2282; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11],zero,zero,zero,xmm2[u,u,u,u,u]
2283; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[3,9,15],zero,zero,xmm4[1,7,13,u,u,u,u,u]
2284; AVX512BW-NEXT:    vpor %xmm2, %xmm4, %xmm2
2285; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,11]
2286; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm6 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero
2287; AVX512BW-NEXT:    vpor %xmm4, %xmm6, %xmm4
2288; AVX512BW-NEXT:    vmovdqu8 %xmm4, %xmm2 {%k2}
2289; AVX512BW-NEXT:    movw $9362, %di # imm = 0x2492
2290; AVX512BW-NEXT:    kmovd %edi, %k3
2291; AVX512BW-NEXT:    vpblendmw %ymm1, %ymm0, %ymm4 {%k3}
2292; AVX512BW-NEXT:    vextracti128 $1, %ymm4, %xmm6
2293; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14,u,u,u,u,u]
2294; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm9 = xmm4[2,8,14],zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[u,u,u,u,u]
2295; AVX512BW-NEXT:    vpor %xmm8, %xmm9, %xmm8
2296; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm9 = xmm7[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero
2297; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,6,12]
2298; AVX512BW-NEXT:    vpor %xmm9, %xmm10, %xmm9
2299; AVX512BW-NEXT:    vmovdqu8 %xmm9, %xmm8 {%k2}
2300; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[5,11],zero,zero,zero,xmm6[3,9,15,u,u,u,u,u]
2301; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[3,9,15],zero,zero,xmm4[1,7,13],zero,zero,zero,xmm4[u,u,u,u,u]
2302; AVX512BW-NEXT:    vpor %xmm6, %xmm4, %xmm4
2303; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm6 = xmm7[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero
2304; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,7,13]
2305; AVX512BW-NEXT:    vpor %xmm6, %xmm9, %xmm6
2306; AVX512BW-NEXT:    vmovdqu8 %xmm6, %xmm4 {%k2}
2307; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [128,128,0,6,12,128,128,128,4,10,128,128,128,2,8,14]
2308; AVX512BW-NEXT:    vpshufb %xmm6, %xmm5, %xmm9
2309; AVX512BW-NEXT:    vmovdqu16 %ymm1, %ymm0 {%k1}
2310; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
2311; AVX512BW-NEXT:    vpshufb %xmm6, %xmm1, %xmm6
2312; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm9[5,6,7]
2313; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm9 = [4,10,128,128,128,2,8,14,128,128,0,6,12,128,128,128]
2314; AVX512BW-NEXT:    vpshufb %xmm9, %xmm7, %xmm10
2315; AVX512BW-NEXT:    vpshufb %xmm9, %xmm0, %xmm9
2316; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5,6,7]
2317; AVX512BW-NEXT:    vpor %xmm6, %xmm9, %xmm6
2318; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm9 = [128,128,1,7,13,128,128,128,5,11,128,128,128,3,9,15]
2319; AVX512BW-NEXT:    vpshufb %xmm9, %xmm5, %xmm5
2320; AVX512BW-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
2321; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7]
2322; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm5 = [5,11,128,128,128,3,9,15,128,128,1,7,13,128,128,128]
2323; AVX512BW-NEXT:    vpshufb %xmm5, %xmm7, %xmm7
2324; AVX512BW-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
2325; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm7[5,6,7]
2326; AVX512BW-NEXT:    vpor %xmm1, %xmm0, %xmm0
2327; AVX512BW-NEXT:    vmovdqa %xmm3, (%rsi)
2328; AVX512BW-NEXT:    vmovdqa %xmm2, (%rdx)
2329; AVX512BW-NEXT:    vmovdqa %xmm8, (%rcx)
2330; AVX512BW-NEXT:    vmovdqa %xmm4, (%r8)
2331; AVX512BW-NEXT:    vmovdqa %xmm6, (%r9)
2332; AVX512BW-NEXT:    vmovdqa %xmm0, (%rax)
2333; AVX512BW-NEXT:    vzeroupper
2334; AVX512BW-NEXT:    retq
2335;
2336; AVX512BW-FCP-LABEL: load_i8_stride6_vf16:
2337; AVX512BW-FCP:       # %bb.0:
2338; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2339; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %ymm1
2340; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm0
2341; AVX512BW-FCP-NEXT:    movw $18724, %r10w # imm = 0x4924
2342; AVX512BW-FCP-NEXT:    kmovd %r10d, %k1
2343; AVX512BW-FCP-NEXT:    vpblendmw %ymm0, %ymm1, %ymm2 {%k1}
2344; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10],zero,zero,zero,xmm2[u,u,u,u,u]
2345; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm4
2346; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm4[2,8,14],zero,zero,xmm4[0,6,12,u,u,u,u,u]
2347; AVX512BW-FCP-NEXT:    vpor %xmm3, %xmm5, %xmm3
2348; AVX512BW-FCP-NEXT:    vmovdqa 80(%rdi), %xmm5
2349; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[4,10]
2350; AVX512BW-FCP-NEXT:    vmovdqa 64(%rdi), %xmm7
2351; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero
2352; AVX512BW-FCP-NEXT:    vpor %xmm6, %xmm8, %xmm6
2353; AVX512BW-FCP-NEXT:    movw $-2048, %di # imm = 0xF800
2354; AVX512BW-FCP-NEXT:    kmovd %edi, %k2
2355; AVX512BW-FCP-NEXT:    vmovdqu8 %xmm6, %xmm3 {%k2}
2356; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11],zero,zero,zero,xmm2[u,u,u,u,u]
2357; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[3,9,15],zero,zero,xmm4[1,7,13,u,u,u,u,u]
2358; AVX512BW-FCP-NEXT:    vpor %xmm2, %xmm4, %xmm2
2359; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,11]
2360; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero
2361; AVX512BW-FCP-NEXT:    vpor %xmm4, %xmm6, %xmm4
2362; AVX512BW-FCP-NEXT:    vmovdqu8 %xmm4, %xmm2 {%k2}
2363; AVX512BW-FCP-NEXT:    movw $9362, %di # imm = 0x2492
2364; AVX512BW-FCP-NEXT:    kmovd %edi, %k3
2365; AVX512BW-FCP-NEXT:    vpblendmw %ymm1, %ymm0, %ymm4 {%k3}
2366; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm6
2367; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14,u,u,u,u,u]
2368; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm4[2,8,14],zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[u,u,u,u,u]
2369; AVX512BW-FCP-NEXT:    vpor %xmm8, %xmm9, %xmm8
2370; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm7[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero
2371; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,6,12]
2372; AVX512BW-FCP-NEXT:    vpor %xmm9, %xmm10, %xmm9
2373; AVX512BW-FCP-NEXT:    vmovdqu8 %xmm9, %xmm8 {%k2}
2374; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[5,11],zero,zero,zero,xmm6[3,9,15,u,u,u,u,u]
2375; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[3,9,15],zero,zero,xmm4[1,7,13],zero,zero,zero,xmm4[u,u,u,u,u]
2376; AVX512BW-FCP-NEXT:    vpor %xmm6, %xmm4, %xmm4
2377; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm7[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero
2378; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,7,13]
2379; AVX512BW-FCP-NEXT:    vpor %xmm6, %xmm9, %xmm6
2380; AVX512BW-FCP-NEXT:    vmovdqu8 %xmm6, %xmm4 {%k2}
2381; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [128,128,0,6,12,128,128,128,4,10,128,128,128,2,8,14]
2382; AVX512BW-FCP-NEXT:    vpshufb %xmm6, %xmm5, %xmm9
2383; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm1, %ymm0 {%k1}
2384; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
2385; AVX512BW-FCP-NEXT:    vpshufb %xmm6, %xmm1, %xmm6
2386; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm9[5,6,7]
2387; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [4,10,128,128,128,2,8,14,128,128,0,6,12,128,128,128]
2388; AVX512BW-FCP-NEXT:    vpshufb %xmm9, %xmm7, %xmm10
2389; AVX512BW-FCP-NEXT:    vpshufb %xmm9, %xmm0, %xmm9
2390; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5,6,7]
2391; AVX512BW-FCP-NEXT:    vpor %xmm6, %xmm9, %xmm6
2392; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [128,128,1,7,13,128,128,128,5,11,128,128,128,3,9,15]
2393; AVX512BW-FCP-NEXT:    vpshufb %xmm9, %xmm5, %xmm5
2394; AVX512BW-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
2395; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7]
2396; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [5,11,128,128,128,3,9,15,128,128,1,7,13,128,128,128]
2397; AVX512BW-FCP-NEXT:    vpshufb %xmm5, %xmm7, %xmm7
2398; AVX512BW-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
2399; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm7[5,6,7]
2400; AVX512BW-FCP-NEXT:    vpor %xmm1, %xmm0, %xmm0
2401; AVX512BW-FCP-NEXT:    vmovdqa %xmm3, (%rsi)
2402; AVX512BW-FCP-NEXT:    vmovdqa %xmm2, (%rdx)
2403; AVX512BW-FCP-NEXT:    vmovdqa %xmm8, (%rcx)
2404; AVX512BW-FCP-NEXT:    vmovdqa %xmm4, (%r8)
2405; AVX512BW-FCP-NEXT:    vmovdqa %xmm6, (%r9)
2406; AVX512BW-FCP-NEXT:    vmovdqa %xmm0, (%rax)
2407; AVX512BW-FCP-NEXT:    vzeroupper
2408; AVX512BW-FCP-NEXT:    retq
2409;
2410; AVX512DQ-BW-LABEL: load_i8_stride6_vf16:
2411; AVX512DQ-BW:       # %bb.0:
2412; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2413; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %ymm1
2414; AVX512DQ-BW-NEXT:    vmovdqa 32(%rdi), %ymm0
2415; AVX512DQ-BW-NEXT:    movw $18724, %r10w # imm = 0x4924
2416; AVX512DQ-BW-NEXT:    kmovd %r10d, %k1
2417; AVX512DQ-BW-NEXT:    vpblendmw %ymm0, %ymm1, %ymm2 {%k1}
2418; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10],zero,zero,zero,xmm2[u,u,u,u,u]
2419; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm2, %xmm4
2420; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm4[2,8,14],zero,zero,xmm4[0,6,12,u,u,u,u,u]
2421; AVX512DQ-BW-NEXT:    vpor %xmm3, %xmm5, %xmm3
2422; AVX512DQ-BW-NEXT:    vmovdqa 80(%rdi), %xmm5
2423; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[4,10]
2424; AVX512DQ-BW-NEXT:    vmovdqa 64(%rdi), %xmm7
2425; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero
2426; AVX512DQ-BW-NEXT:    vpor %xmm6, %xmm8, %xmm6
2427; AVX512DQ-BW-NEXT:    movw $-2048, %di # imm = 0xF800
2428; AVX512DQ-BW-NEXT:    kmovd %edi, %k2
2429; AVX512DQ-BW-NEXT:    vmovdqu8 %xmm6, %xmm3 {%k2}
2430; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11],zero,zero,zero,xmm2[u,u,u,u,u]
2431; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[3,9,15],zero,zero,xmm4[1,7,13,u,u,u,u,u]
2432; AVX512DQ-BW-NEXT:    vpor %xmm2, %xmm4, %xmm2
2433; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,11]
2434; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm6 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero
2435; AVX512DQ-BW-NEXT:    vpor %xmm4, %xmm6, %xmm4
2436; AVX512DQ-BW-NEXT:    vmovdqu8 %xmm4, %xmm2 {%k2}
2437; AVX512DQ-BW-NEXT:    movw $9362, %di # imm = 0x2492
2438; AVX512DQ-BW-NEXT:    kmovd %edi, %k3
2439; AVX512DQ-BW-NEXT:    vpblendmw %ymm1, %ymm0, %ymm4 {%k3}
2440; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm4, %xmm6
2441; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14,u,u,u,u,u]
2442; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm9 = xmm4[2,8,14],zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[u,u,u,u,u]
2443; AVX512DQ-BW-NEXT:    vpor %xmm8, %xmm9, %xmm8
2444; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm9 = xmm7[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero
2445; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,6,12]
2446; AVX512DQ-BW-NEXT:    vpor %xmm9, %xmm10, %xmm9
2447; AVX512DQ-BW-NEXT:    vmovdqu8 %xmm9, %xmm8 {%k2}
2448; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[5,11],zero,zero,zero,xmm6[3,9,15,u,u,u,u,u]
2449; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[3,9,15],zero,zero,xmm4[1,7,13],zero,zero,zero,xmm4[u,u,u,u,u]
2450; AVX512DQ-BW-NEXT:    vpor %xmm6, %xmm4, %xmm4
2451; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm6 = xmm7[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero
2452; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,7,13]
2453; AVX512DQ-BW-NEXT:    vpor %xmm6, %xmm9, %xmm6
2454; AVX512DQ-BW-NEXT:    vmovdqu8 %xmm6, %xmm4 {%k2}
2455; AVX512DQ-BW-NEXT:    vmovdqa {{.*#+}} xmm6 = [128,128,0,6,12,128,128,128,4,10,128,128,128,2,8,14]
2456; AVX512DQ-BW-NEXT:    vpshufb %xmm6, %xmm5, %xmm9
2457; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm1, %ymm0 {%k1}
2458; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
2459; AVX512DQ-BW-NEXT:    vpshufb %xmm6, %xmm1, %xmm6
2460; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm9[5,6,7]
2461; AVX512DQ-BW-NEXT:    vmovdqa {{.*#+}} xmm9 = [4,10,128,128,128,2,8,14,128,128,0,6,12,128,128,128]
2462; AVX512DQ-BW-NEXT:    vpshufb %xmm9, %xmm7, %xmm10
2463; AVX512DQ-BW-NEXT:    vpshufb %xmm9, %xmm0, %xmm9
2464; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5,6,7]
2465; AVX512DQ-BW-NEXT:    vpor %xmm6, %xmm9, %xmm6
2466; AVX512DQ-BW-NEXT:    vmovdqa {{.*#+}} xmm9 = [128,128,1,7,13,128,128,128,5,11,128,128,128,3,9,15]
2467; AVX512DQ-BW-NEXT:    vpshufb %xmm9, %xmm5, %xmm5
2468; AVX512DQ-BW-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
2469; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7]
2470; AVX512DQ-BW-NEXT:    vmovdqa {{.*#+}} xmm5 = [5,11,128,128,128,3,9,15,128,128,1,7,13,128,128,128]
2471; AVX512DQ-BW-NEXT:    vpshufb %xmm5, %xmm7, %xmm7
2472; AVX512DQ-BW-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
2473; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm7[5,6,7]
2474; AVX512DQ-BW-NEXT:    vpor %xmm1, %xmm0, %xmm0
2475; AVX512DQ-BW-NEXT:    vmovdqa %xmm3, (%rsi)
2476; AVX512DQ-BW-NEXT:    vmovdqa %xmm2, (%rdx)
2477; AVX512DQ-BW-NEXT:    vmovdqa %xmm8, (%rcx)
2478; AVX512DQ-BW-NEXT:    vmovdqa %xmm4, (%r8)
2479; AVX512DQ-BW-NEXT:    vmovdqa %xmm6, (%r9)
2480; AVX512DQ-BW-NEXT:    vmovdqa %xmm0, (%rax)
2481; AVX512DQ-BW-NEXT:    vzeroupper
2482; AVX512DQ-BW-NEXT:    retq
2483;
2484; AVX512DQ-BW-FCP-LABEL: load_i8_stride6_vf16:
2485; AVX512DQ-BW-FCP:       # %bb.0:
2486; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2487; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %ymm1
2488; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm0
2489; AVX512DQ-BW-FCP-NEXT:    movw $18724, %r10w # imm = 0x4924
2490; AVX512DQ-BW-FCP-NEXT:    kmovd %r10d, %k1
2491; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm0, %ymm1, %ymm2 {%k1}
2492; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10],zero,zero,zero,xmm2[u,u,u,u,u]
2493; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm4
2494; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm4[2,8,14],zero,zero,xmm4[0,6,12,u,u,u,u,u]
2495; AVX512DQ-BW-FCP-NEXT:    vpor %xmm3, %xmm5, %xmm3
2496; AVX512DQ-BW-FCP-NEXT:    vmovdqa 80(%rdi), %xmm5
2497; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[4,10]
2498; AVX512DQ-BW-FCP-NEXT:    vmovdqa 64(%rdi), %xmm7
2499; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero
2500; AVX512DQ-BW-FCP-NEXT:    vpor %xmm6, %xmm8, %xmm6
2501; AVX512DQ-BW-FCP-NEXT:    movw $-2048, %di # imm = 0xF800
2502; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k2
2503; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %xmm6, %xmm3 {%k2}
2504; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11],zero,zero,zero,xmm2[u,u,u,u,u]
2505; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[3,9,15],zero,zero,xmm4[1,7,13,u,u,u,u,u]
2506; AVX512DQ-BW-FCP-NEXT:    vpor %xmm2, %xmm4, %xmm2
2507; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,11]
2508; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero
2509; AVX512DQ-BW-FCP-NEXT:    vpor %xmm4, %xmm6, %xmm4
2510; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %xmm4, %xmm2 {%k2}
2511; AVX512DQ-BW-FCP-NEXT:    movw $9362, %di # imm = 0x2492
2512; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k3
2513; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm1, %ymm0, %ymm4 {%k3}
2514; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm6
2515; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14,u,u,u,u,u]
2516; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm4[2,8,14],zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[u,u,u,u,u]
2517; AVX512DQ-BW-FCP-NEXT:    vpor %xmm8, %xmm9, %xmm8
2518; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm7[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero
2519; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,6,12]
2520; AVX512DQ-BW-FCP-NEXT:    vpor %xmm9, %xmm10, %xmm9
2521; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %xmm9, %xmm8 {%k2}
2522; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[5,11],zero,zero,zero,xmm6[3,9,15,u,u,u,u,u]
2523; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[3,9,15],zero,zero,xmm4[1,7,13],zero,zero,zero,xmm4[u,u,u,u,u]
2524; AVX512DQ-BW-FCP-NEXT:    vpor %xmm6, %xmm4, %xmm4
2525; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm7[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero
2526; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,7,13]
2527; AVX512DQ-BW-FCP-NEXT:    vpor %xmm6, %xmm9, %xmm6
2528; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %xmm6, %xmm4 {%k2}
2529; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [128,128,0,6,12,128,128,128,4,10,128,128,128,2,8,14]
2530; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm6, %xmm5, %xmm9
2531; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm1, %ymm0 {%k1}
2532; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
2533; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm6, %xmm1, %xmm6
2534; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm9[5,6,7]
2535; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [4,10,128,128,128,2,8,14,128,128,0,6,12,128,128,128]
2536; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm9, %xmm7, %xmm10
2537; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm9, %xmm0, %xmm9
2538; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5,6,7]
2539; AVX512DQ-BW-FCP-NEXT:    vpor %xmm6, %xmm9, %xmm6
2540; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [128,128,1,7,13,128,128,128,5,11,128,128,128,3,9,15]
2541; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm9, %xmm5, %xmm5
2542; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
2543; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7]
2544; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [5,11,128,128,128,3,9,15,128,128,1,7,13,128,128,128]
2545; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm5, %xmm7, %xmm7
2546; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
2547; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm7[5,6,7]
2548; AVX512DQ-BW-FCP-NEXT:    vpor %xmm1, %xmm0, %xmm0
2549; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm3, (%rsi)
2550; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm2, (%rdx)
2551; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm8, (%rcx)
2552; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm4, (%r8)
2553; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm6, (%r9)
2554; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm0, (%rax)
2555; AVX512DQ-BW-FCP-NEXT:    vzeroupper
2556; AVX512DQ-BW-FCP-NEXT:    retq
2557  %wide.vec = load <96 x i8>, ptr %in.vec, align 64
2558  %strided.vec0 = shufflevector <96 x i8> %wide.vec, <96 x i8> poison, <16 x i32> <i32 0, i32 6, i32 12, i32 18, i32 24, i32 30, i32 36, i32 42, i32 48, i32 54, i32 60, i32 66, i32 72, i32 78, i32 84, i32 90>
2559  %strided.vec1 = shufflevector <96 x i8> %wide.vec, <96 x i8> poison, <16 x i32> <i32 1, i32 7, i32 13, i32 19, i32 25, i32 31, i32 37, i32 43, i32 49, i32 55, i32 61, i32 67, i32 73, i32 79, i32 85, i32 91>
2560  %strided.vec2 = shufflevector <96 x i8> %wide.vec, <96 x i8> poison, <16 x i32> <i32 2, i32 8, i32 14, i32 20, i32 26, i32 32, i32 38, i32 44, i32 50, i32 56, i32 62, i32 68, i32 74, i32 80, i32 86, i32 92>
2561  %strided.vec3 = shufflevector <96 x i8> %wide.vec, <96 x i8> poison, <16 x i32> <i32 3, i32 9, i32 15, i32 21, i32 27, i32 33, i32 39, i32 45, i32 51, i32 57, i32 63, i32 69, i32 75, i32 81, i32 87, i32 93>
2562  %strided.vec4 = shufflevector <96 x i8> %wide.vec, <96 x i8> poison, <16 x i32> <i32 4, i32 10, i32 16, i32 22, i32 28, i32 34, i32 40, i32 46, i32 52, i32 58, i32 64, i32 70, i32 76, i32 82, i32 88, i32 94>
2563  %strided.vec5 = shufflevector <96 x i8> %wide.vec, <96 x i8> poison, <16 x i32> <i32 5, i32 11, i32 17, i32 23, i32 29, i32 35, i32 41, i32 47, i32 53, i32 59, i32 65, i32 71, i32 77, i32 83, i32 89, i32 95>
2564  store <16 x i8> %strided.vec0, ptr %out.vec0, align 64
2565  store <16 x i8> %strided.vec1, ptr %out.vec1, align 64
2566  store <16 x i8> %strided.vec2, ptr %out.vec2, align 64
2567  store <16 x i8> %strided.vec3, ptr %out.vec3, align 64
2568  store <16 x i8> %strided.vec4, ptr %out.vec4, align 64
2569  store <16 x i8> %strided.vec5, ptr %out.vec5, align 64
2570  ret void
2571}
2572
2573define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind {
2574; SSE-LABEL: load_i8_stride6_vf32:
2575; SSE:       # %bb.0:
2576; SSE-NEXT:    subq $264, %rsp # imm = 0x108
2577; SSE-NEXT:    movdqa 64(%rdi), %xmm7
2578; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2579; SSE-NEXT:    movdqa 80(%rdi), %xmm9
2580; SSE-NEXT:    movdqa (%rdi), %xmm12
2581; SSE-NEXT:    movdqa 16(%rdi), %xmm14
2582; SSE-NEXT:    movdqa 32(%rdi), %xmm1
2583; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2584; SSE-NEXT:    movdqa 48(%rdi), %xmm15
2585; SSE-NEXT:    movdqa {{.*#+}} xmm10 = [65535,65535,0,65535,65535,0,65535,65535]
2586; SSE-NEXT:    movdqa %xmm10, %xmm0
2587; SSE-NEXT:    pandn %xmm1, %xmm0
2588; SSE-NEXT:    movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,0,65535,65535,0]
2589; SSE-NEXT:    movdqa %xmm11, %xmm1
2590; SSE-NEXT:    pandn %xmm15, %xmm1
2591; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2592; SSE-NEXT:    movdqa %xmm10, %xmm1
2593; SSE-NEXT:    pandn %xmm15, %xmm1
2594; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2595; SSE-NEXT:    pand %xmm10, %xmm15
2596; SSE-NEXT:    por %xmm0, %xmm15
2597; SSE-NEXT:    movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
2598; SSE-NEXT:    movdqa %xmm15, %xmm0
2599; SSE-NEXT:    pand %xmm6, %xmm0
2600; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7]
2601; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
2602; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
2603; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5]
2604; SSE-NEXT:    packuswb %xmm1, %xmm0
2605; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,0,65535,65535]
2606; SSE-NEXT:    movdqa %xmm11, %xmm1
2607; SSE-NEXT:    pandn %xmm14, %xmm1
2608; SSE-NEXT:    movdqa %xmm12, %xmm8
2609; SSE-NEXT:    pand %xmm11, %xmm8
2610; SSE-NEXT:    por %xmm1, %xmm8
2611; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm8[0,2,1,3]
2612; SSE-NEXT:    pand %xmm6, %xmm1
2613; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
2614; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
2615; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
2616; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
2617; SSE-NEXT:    packuswb %xmm1, %xmm1
2618; SSE-NEXT:    pand %xmm2, %xmm1
2619; SSE-NEXT:    movdqa %xmm2, %xmm3
2620; SSE-NEXT:    movdqa %xmm2, %xmm5
2621; SSE-NEXT:    pandn %xmm0, %xmm3
2622; SSE-NEXT:    por %xmm3, %xmm1
2623; SSE-NEXT:    movdqa %xmm10, %xmm0
2624; SSE-NEXT:    pandn %xmm9, %xmm0
2625; SSE-NEXT:    pand %xmm10, %xmm7
2626; SSE-NEXT:    por %xmm0, %xmm7
2627; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[3,1,2,0]
2628; SSE-NEXT:    pand %xmm6, %xmm0
2629; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
2630; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0]
2631; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
2632; SSE-NEXT:    packuswb %xmm0, %xmm0
2633; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
2634; SSE-NEXT:    movdqa %xmm3, %xmm2
2635; SSE-NEXT:    pandn %xmm0, %xmm2
2636; SSE-NEXT:    pand %xmm3, %xmm1
2637; SSE-NEXT:    por %xmm1, %xmm2
2638; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2639; SSE-NEXT:    movdqa 128(%rdi), %xmm1
2640; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2641; SSE-NEXT:    movdqa %xmm10, %xmm0
2642; SSE-NEXT:    pandn %xmm1, %xmm0
2643; SSE-NEXT:    movdqa 144(%rdi), %xmm1
2644; SSE-NEXT:    movdqa %xmm11, %xmm2
2645; SSE-NEXT:    pandn %xmm1, %xmm2
2646; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2647; SSE-NEXT:    movdqa %xmm10, %xmm2
2648; SSE-NEXT:    pandn %xmm1, %xmm2
2649; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2650; SSE-NEXT:    movdqa %xmm1, %xmm2
2651; SSE-NEXT:    pand %xmm10, %xmm2
2652; SSE-NEXT:    por %xmm0, %xmm2
2653; SSE-NEXT:    movdqa %xmm2, %xmm0
2654; SSE-NEXT:    pand %xmm6, %xmm0
2655; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm0[0,3,2,3,4,5,6,7]
2656; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
2657; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
2658; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5]
2659; SSE-NEXT:    packuswb %xmm3, %xmm0
2660; SSE-NEXT:    movdqa %xmm5, %xmm6
2661; SSE-NEXT:    pandn %xmm0, %xmm6
2662; SSE-NEXT:    movdqa %xmm10, %xmm1
2663; SSE-NEXT:    movdqa %xmm10, %xmm0
2664; SSE-NEXT:    pandn %xmm12, %xmm0
2665; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2666; SSE-NEXT:    movdqa 112(%rdi), %xmm0
2667; SSE-NEXT:    movdqa %xmm11, %xmm3
2668; SSE-NEXT:    pandn %xmm0, %xmm3
2669; SSE-NEXT:    movdqa 160(%rdi), %xmm5
2670; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2671; SSE-NEXT:    pand %xmm10, %xmm5
2672; SSE-NEXT:    movdqa %xmm10, %xmm4
2673; SSE-NEXT:    pandn %xmm14, %xmm4
2674; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2675; SSE-NEXT:    pand %xmm10, %xmm12
2676; SSE-NEXT:    movdqa %xmm11, %xmm4
2677; SSE-NEXT:    pandn %xmm9, %xmm4
2678; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2679; SSE-NEXT:    movdqa %xmm9, %xmm11
2680; SSE-NEXT:    pand %xmm10, %xmm11
2681; SSE-NEXT:    movdqa %xmm10, %xmm4
2682; SSE-NEXT:    pandn %xmm0, %xmm4
2683; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2684; SSE-NEXT:    movdqa 96(%rdi), %xmm13
2685; SSE-NEXT:    movdqa %xmm13, %xmm4
2686; SSE-NEXT:    pand %xmm10, %xmm4
2687; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2688; SSE-NEXT:    movdqa 176(%rdi), %xmm4
2689; SSE-NEXT:    movdqa %xmm4, %xmm10
2690; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2691; SSE-NEXT:    pand %xmm1, %xmm10
2692; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2693; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
2694; SSE-NEXT:    movdqa %xmm9, %xmm10
2695; SSE-NEXT:    pand %xmm1, %xmm9
2696; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2697; SSE-NEXT:    pand %xmm1, %xmm14
2698; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2699; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
2700; SSE-NEXT:    movdqa %xmm14, %xmm9
2701; SSE-NEXT:    pand %xmm1, %xmm14
2702; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2703; SSE-NEXT:    pand %xmm1, %xmm0
2704; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2705; SSE-NEXT:    movdqa %xmm1, %xmm14
2706; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2707; SSE-NEXT:    movdqa %xmm1, (%rsp) # 16-byte Spill
2708; SSE-NEXT:    pandn %xmm13, %xmm1
2709; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2710; SSE-NEXT:    movdqa %xmm13, %xmm1
2711; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2712; SSE-NEXT:    por %xmm3, %xmm1
2713; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,2,1,3]
2714; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
2715; SSE-NEXT:    pand %xmm0, %xmm3
2716; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7]
2717; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3]
2718; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7]
2719; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7]
2720; SSE-NEXT:    packuswb %xmm3, %xmm3
2721; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
2722; SSE-NEXT:    por %xmm6, %xmm3
2723; SSE-NEXT:    pandn %xmm4, %xmm14
2724; SSE-NEXT:    por %xmm14, %xmm5
2725; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[3,1,2,0]
2726; SSE-NEXT:    pand %xmm0, %xmm4
2727; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7]
2728; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,3,2,0]
2729; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5]
2730; SSE-NEXT:    packuswb %xmm4, %xmm4
2731; SSE-NEXT:    movdqa {{.*#+}} xmm13 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
2732; SSE-NEXT:    movdqa %xmm13, %xmm0
2733; SSE-NEXT:    pandn %xmm4, %xmm0
2734; SSE-NEXT:    pand %xmm13, %xmm3
2735; SSE-NEXT:    por %xmm3, %xmm0
2736; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2737; SSE-NEXT:    pxor %xmm4, %xmm4
2738; SSE-NEXT:    movdqa %xmm15, %xmm3
2739; SSE-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
2740; SSE-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3],xmm15[4],xmm4[4],xmm15[5],xmm4[5],xmm15[6],xmm4[6],xmm15[7],xmm4[7]
2741; SSE-NEXT:    pshufd {{.*#+}} xmm14 = xmm15[2,2,3,3]
2742; SSE-NEXT:    punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1],xmm14[2],xmm3[2],xmm14[3],xmm3[3]
2743; SSE-NEXT:    psrld $16, %xmm3
2744; SSE-NEXT:    pshufd {{.*#+}} xmm15 = xmm15[0,1,0,3]
2745; SSE-NEXT:    pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,7,6,7]
2746; SSE-NEXT:    punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm3[2],xmm15[3],xmm3[3]
2747; SSE-NEXT:    packuswb %xmm15, %xmm14
2748; SSE-NEXT:    movdqa {{.*#+}} xmm6 = [65535,65535,65535,0,0,0,65535,65535]
2749; SSE-NEXT:    movdqa %xmm6, %xmm3
2750; SSE-NEXT:    pandn %xmm14, %xmm3
2751; SSE-NEXT:    movdqa %xmm8, %xmm14
2752; SSE-NEXT:    punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm4[8],xmm14[9],xmm4[9],xmm14[10],xmm4[10],xmm14[11],xmm4[11],xmm14[12],xmm4[12],xmm14[13],xmm4[13],xmm14[14],xmm4[14],xmm14[15],xmm4[15]
2753; SSE-NEXT:    pshufd {{.*#+}} xmm14 = xmm14[2,1,0,3]
2754; SSE-NEXT:    pshuflw {{.*#+}} xmm14 = xmm14[1,1,1,1,4,5,6,7]
2755; SSE-NEXT:    pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,7,6,7]
2756; SSE-NEXT:    movdqa {{.*#+}} xmm15 = [65535,65535,0,65535,0,0,65535,65535]
2757; SSE-NEXT:    movdqa %xmm15, %xmm0
2758; SSE-NEXT:    pandn %xmm14, %xmm0
2759; SSE-NEXT:    punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
2760; SSE-NEXT:    pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7]
2761; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3]
2762; SSE-NEXT:    pshuflw {{.*#+}} xmm14 = xmm8[1,3,2,0,4,5,6,7]
2763; SSE-NEXT:    pand %xmm15, %xmm14
2764; SSE-NEXT:    por %xmm0, %xmm14
2765; SSE-NEXT:    packuswb %xmm14, %xmm14
2766; SSE-NEXT:    pand %xmm6, %xmm14
2767; SSE-NEXT:    por %xmm3, %xmm14
2768; SSE-NEXT:    movdqa %xmm7, %xmm0
2769; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
2770; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
2771; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,65535,65535,0,65535]
2772; SSE-NEXT:    movdqa %xmm8, %xmm3
2773; SSE-NEXT:    pandn %xmm0, %xmm3
2774; SSE-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
2775; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm7[3,1,2,3,4,5,6,7]
2776; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
2777; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,4]
2778; SSE-NEXT:    pand %xmm8, %xmm0
2779; SSE-NEXT:    por %xmm3, %xmm0
2780; SSE-NEXT:    packuswb %xmm0, %xmm0
2781; SSE-NEXT:    movdqa %xmm13, %xmm3
2782; SSE-NEXT:    pandn %xmm0, %xmm3
2783; SSE-NEXT:    pand %xmm13, %xmm14
2784; SSE-NEXT:    por %xmm14, %xmm3
2785; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2786; SSE-NEXT:    movdqa %xmm2, %xmm0
2787; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
2788; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
2789; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,2,3,3]
2790; SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
2791; SSE-NEXT:    psrld $16, %xmm0
2792; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
2793; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7]
2794; SSE-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
2795; SSE-NEXT:    packuswb %xmm2, %xmm3
2796; SSE-NEXT:    movdqa %xmm1, %xmm0
2797; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
2798; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
2799; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
2800; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7]
2801; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
2802; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
2803; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
2804; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,3,2,0,4,5,6,7]
2805; SSE-NEXT:    pand %xmm15, %xmm1
2806; SSE-NEXT:    pandn %xmm0, %xmm15
2807; SSE-NEXT:    por %xmm1, %xmm15
2808; SSE-NEXT:    packuswb %xmm15, %xmm15
2809; SSE-NEXT:    pand %xmm6, %xmm15
2810; SSE-NEXT:    pandn %xmm3, %xmm6
2811; SSE-NEXT:    por %xmm6, %xmm15
2812; SSE-NEXT:    movdqa %xmm5, %xmm0
2813; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
2814; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
2815; SSE-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
2816; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm5[3,1,2,3,4,5,6,7]
2817; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
2818; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,4]
2819; SSE-NEXT:    pand %xmm8, %xmm1
2820; SSE-NEXT:    pandn %xmm0, %xmm8
2821; SSE-NEXT:    por %xmm1, %xmm8
2822; SSE-NEXT:    packuswb %xmm8, %xmm0
2823; SSE-NEXT:    movdqa %xmm13, %xmm1
2824; SSE-NEXT:    pandn %xmm0, %xmm1
2825; SSE-NEXT:    pand %xmm13, %xmm15
2826; SSE-NEXT:    movdqa %xmm13, %xmm7
2827; SSE-NEXT:    por %xmm15, %xmm1
2828; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2829; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,0]
2830; SSE-NEXT:    pand %xmm5, %xmm10
2831; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
2832; SSE-NEXT:    movdqa %xmm10, %xmm0
2833; SSE-NEXT:    movdqa {{.*#+}} xmm15 = [255,255,255,255,255,255,255,255]
2834; SSE-NEXT:    pand %xmm15, %xmm0
2835; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,7,6,7]
2836; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2837; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7]
2838; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
2839; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
2840; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,5,6]
2841; SSE-NEXT:    packuswb %xmm1, %xmm2
2842; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
2843; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm12[2,1,2,3,4,5,6,7]
2844; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
2845; SSE-NEXT:    pand %xmm15, %xmm0
2846; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
2847; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7]
2848; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,5,5,5,5]
2849; SSE-NEXT:    packuswb %xmm1, %xmm1
2850; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255]
2851; SSE-NEXT:    movdqa %xmm0, %xmm3
2852; SSE-NEXT:    pandn %xmm1, %xmm3
2853; SSE-NEXT:    pand %xmm0, %xmm2
2854; SSE-NEXT:    por %xmm2, %xmm3
2855; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
2856; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2857; SSE-NEXT:    pandn %xmm14, %xmm1
2858; SSE-NEXT:    por %xmm1, %xmm11
2859; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm11[0,3,2,3,4,5,6,7]
2860; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
2861; SSE-NEXT:    pand %xmm15, %xmm1
2862; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
2863; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7]
2864; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4]
2865; SSE-NEXT:    packuswb %xmm1, %xmm1
2866; SSE-NEXT:    movdqa %xmm13, %xmm2
2867; SSE-NEXT:    pandn %xmm1, %xmm2
2868; SSE-NEXT:    pand %xmm13, %xmm3
2869; SSE-NEXT:    por %xmm3, %xmm2
2870; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2871; SSE-NEXT:    pand %xmm5, %xmm9
2872; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
2873; SSE-NEXT:    movdqa %xmm9, %xmm1
2874; SSE-NEXT:    pand %xmm15, %xmm1
2875; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,7,6,7]
2876; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2877; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,3,4,5,6,7]
2878; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
2879; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
2880; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,6]
2881; SSE-NEXT:    packuswb %xmm2, %xmm1
2882; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
2883; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
2884; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm13[2,1,2,3,4,5,6,7]
2885; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
2886; SSE-NEXT:    pand %xmm15, %xmm2
2887; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
2888; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7]
2889; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
2890; SSE-NEXT:    packuswb %xmm2, %xmm2
2891; SSE-NEXT:    movdqa %xmm0, %xmm3
2892; SSE-NEXT:    pandn %xmm2, %xmm3
2893; SSE-NEXT:    pand %xmm0, %xmm1
2894; SSE-NEXT:    por %xmm1, %xmm3
2895; SSE-NEXT:    movdqa (%rsp), %xmm1 # 16-byte Reload
2896; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2897; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
2898; SSE-NEXT:    por %xmm1, %xmm8
2899; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm8[0,3,2,3,4,5,6,7]
2900; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
2901; SSE-NEXT:    pand %xmm15, %xmm1
2902; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
2903; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7]
2904; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4]
2905; SSE-NEXT:    packuswb %xmm1, %xmm1
2906; SSE-NEXT:    movdqa %xmm7, %xmm2
2907; SSE-NEXT:    pandn %xmm1, %xmm2
2908; SSE-NEXT:    pand %xmm7, %xmm3
2909; SSE-NEXT:    por %xmm3, %xmm2
2910; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2911; SSE-NEXT:    movdqa %xmm10, %xmm1
2912; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
2913; SSE-NEXT:    punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15]
2914; SSE-NEXT:    movdqa %xmm10, %xmm2
2915; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0]
2916; SSE-NEXT:    movaps %xmm1, %xmm3
2917; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[0,2]
2918; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm10[0,0]
2919; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm10[2,3]
2920; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,7,5,6,7]
2921; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2]
2922; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
2923; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
2924; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7]
2925; SSE-NEXT:    packuswb %xmm1, %xmm2
2926; SSE-NEXT:    movdqa %xmm12, %xmm1
2927; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
2928; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
2929; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[3,1,2,1,4,5,6,7]
2930; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,65535,65535,0,65535,65535,65535,65535]
2931; SSE-NEXT:    movdqa %xmm1, %xmm5
2932; SSE-NEXT:    pandn %xmm3, %xmm5
2933; SSE-NEXT:    punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm4[8],xmm12[9],xmm4[9],xmm12[10],xmm4[10],xmm12[11],xmm4[11],xmm12[12],xmm4[12],xmm12[13],xmm4[13],xmm12[14],xmm4[14],xmm12[15],xmm4[15]
2934; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm12[0,3,2,1]
2935; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7]
2936; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7]
2937; SSE-NEXT:    pand %xmm1, %xmm3
2938; SSE-NEXT:    por %xmm5, %xmm3
2939; SSE-NEXT:    packuswb %xmm3, %xmm3
2940; SSE-NEXT:    movdqa %xmm0, %xmm5
2941; SSE-NEXT:    pandn %xmm3, %xmm5
2942; SSE-NEXT:    pand %xmm0, %xmm2
2943; SSE-NEXT:    por %xmm2, %xmm5
2944; SSE-NEXT:    movdqa %xmm11, %xmm2
2945; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
2946; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
2947; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,7,5,6,5]
2948; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,0,65535,65535,0]
2949; SSE-NEXT:    movdqa %xmm2, %xmm6
2950; SSE-NEXT:    pandn %xmm3, %xmm6
2951; SSE-NEXT:    punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
2952; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm11[0,2,0,3]
2953; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7]
2954; SSE-NEXT:    pand %xmm2, %xmm3
2955; SSE-NEXT:    por %xmm6, %xmm3
2956; SSE-NEXT:    packuswb %xmm3, %xmm3
2957; SSE-NEXT:    movdqa %xmm7, %xmm6
2958; SSE-NEXT:    pandn %xmm3, %xmm6
2959; SSE-NEXT:    pand %xmm7, %xmm5
2960; SSE-NEXT:    por %xmm5, %xmm6
2961; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2962; SSE-NEXT:    movdqa %xmm9, %xmm3
2963; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
2964; SSE-NEXT:    punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm4[8],xmm9[9],xmm4[9],xmm9[10],xmm4[10],xmm9[11],xmm4[11],xmm9[12],xmm4[12],xmm9[13],xmm4[13],xmm9[14],xmm4[14],xmm9[15],xmm4[15]
2965; SSE-NEXT:    movdqa %xmm9, %xmm5
2966; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[3,0]
2967; SSE-NEXT:    movaps %xmm3, %xmm6
2968; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[0,2]
2969; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm9[0,0]
2970; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm9[2,3]
2971; SSE-NEXT:    pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,7,5,6,7]
2972; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,1,0,2]
2973; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
2974; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
2975; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7]
2976; SSE-NEXT:    packuswb %xmm3, %xmm5
2977; SSE-NEXT:    movdqa %xmm13, %xmm3
2978; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
2979; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
2980; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,1,4,5,6,7]
2981; SSE-NEXT:    punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm4[8],xmm13[9],xmm4[9],xmm13[10],xmm4[10],xmm13[11],xmm4[11],xmm13[12],xmm4[12],xmm13[13],xmm4[13],xmm13[14],xmm4[14],xmm13[15],xmm4[15]
2982; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm13[0,3,2,1]
2983; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7]
2984; SSE-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7]
2985; SSE-NEXT:    pand %xmm1, %xmm6
2986; SSE-NEXT:    pandn %xmm3, %xmm1
2987; SSE-NEXT:    por %xmm6, %xmm1
2988; SSE-NEXT:    pand %xmm0, %xmm5
2989; SSE-NEXT:    packuswb %xmm1, %xmm1
2990; SSE-NEXT:    pandn %xmm1, %xmm0
2991; SSE-NEXT:    por %xmm5, %xmm0
2992; SSE-NEXT:    movdqa %xmm8, %xmm1
2993; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
2994; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
2995; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,5]
2996; SSE-NEXT:    punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
2997; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm8[0,2,0,3]
2998; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7]
2999; SSE-NEXT:    pand %xmm2, %xmm3
3000; SSE-NEXT:    pandn %xmm1, %xmm2
3001; SSE-NEXT:    por %xmm3, %xmm2
3002; SSE-NEXT:    movdqa %xmm7, %xmm13
3003; SSE-NEXT:    pand %xmm7, %xmm0
3004; SSE-NEXT:    packuswb %xmm2, %xmm1
3005; SSE-NEXT:    pandn %xmm1, %xmm13
3006; SSE-NEXT:    por %xmm0, %xmm13
3007; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
3008; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
3009; SSE-NEXT:    movdqa %xmm7, %xmm0
3010; SSE-NEXT:    pand %xmm15, %xmm0
3011; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,1,2,3]
3012; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
3013; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
3014; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
3015; SSE-NEXT:    packuswb %xmm1, %xmm0
3016; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255]
3017; SSE-NEXT:    movdqa %xmm2, %xmm1
3018; SSE-NEXT:    pandn %xmm0, %xmm1
3019; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
3020; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
3021; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[3,1,2,0]
3022; SSE-NEXT:    pand %xmm15, %xmm0
3023; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
3024; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
3025; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm0[2,1,0,3,4,5,6,7]
3026; SSE-NEXT:    packuswb %xmm3, %xmm3
3027; SSE-NEXT:    pand %xmm2, %xmm3
3028; SSE-NEXT:    por %xmm1, %xmm3
3029; SSE-NEXT:    movdqa %xmm14, %xmm11
3030; SSE-NEXT:    movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,0,65535,65535,0]
3031; SSE-NEXT:    pand %xmm12, %xmm11
3032; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
3033; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm11[0,2,1,3]
3034; SSE-NEXT:    pand %xmm15, %xmm0
3035; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
3036; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
3037; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7]
3038; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
3039; SSE-NEXT:    packuswb %xmm0, %xmm5
3040; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0]
3041; SSE-NEXT:    movdqa %xmm0, %xmm8
3042; SSE-NEXT:    pandn %xmm5, %xmm8
3043; SSE-NEXT:    pand %xmm0, %xmm3
3044; SSE-NEXT:    por %xmm3, %xmm8
3045; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3046; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
3047; SSE-NEXT:    movdqa %xmm14, %xmm3
3048; SSE-NEXT:    pand %xmm15, %xmm3
3049; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[2,1,2,3]
3050; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7]
3051; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7]
3052; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2]
3053; SSE-NEXT:    packuswb %xmm5, %xmm3
3054; SSE-NEXT:    movdqa %xmm2, %xmm5
3055; SSE-NEXT:    pandn %xmm3, %xmm5
3056; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3057; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3058; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[3,1,2,0]
3059; SSE-NEXT:    pand %xmm15, %xmm3
3060; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7]
3061; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,1,0,3]
3062; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm3[2,1,0,3,4,5,6,7]
3063; SSE-NEXT:    packuswb %xmm6, %xmm6
3064; SSE-NEXT:    pand %xmm2, %xmm6
3065; SSE-NEXT:    por %xmm5, %xmm6
3066; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3067; SSE-NEXT:    pand %xmm12, %xmm3
3068; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
3069; SSE-NEXT:    por %xmm3, %xmm12
3070; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm12[0,2,1,3]
3071; SSE-NEXT:    pand %xmm15, %xmm3
3072; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7]
3073; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3]
3074; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7]
3075; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7]
3076; SSE-NEXT:    packuswb %xmm3, %xmm5
3077; SSE-NEXT:    movdqa %xmm0, %xmm3
3078; SSE-NEXT:    pandn %xmm5, %xmm3
3079; SSE-NEXT:    pand %xmm0, %xmm6
3080; SSE-NEXT:    por %xmm6, %xmm3
3081; SSE-NEXT:    movdqa %xmm7, %xmm5
3082; SSE-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
3083; SSE-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
3084; SSE-NEXT:    movdqa %xmm7, %xmm6
3085; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[1,0],xmm5[0,0]
3086; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[2,0],xmm5[2,3]
3087; SSE-NEXT:    psrlq $48, %xmm5
3088; SSE-NEXT:    psrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3089; SSE-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
3090; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm7[3,1,2,3,4,5,6,7]
3091; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,1,0,3]
3092; SSE-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,5,7]
3093; SSE-NEXT:    packuswb %xmm6, %xmm5
3094; SSE-NEXT:    movdqa %xmm2, %xmm6
3095; SSE-NEXT:    pandn %xmm5, %xmm6
3096; SSE-NEXT:    movdqa %xmm9, %xmm5
3097; SSE-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
3098; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,2,3]
3099; SSE-NEXT:    pshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,5,5,5,5]
3100; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,65535]
3101; SSE-NEXT:    movdqa %xmm5, %xmm10
3102; SSE-NEXT:    pandn %xmm7, %xmm10
3103; SSE-NEXT:    punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3],xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
3104; SSE-NEXT:    pshufhw {{.*#+}} xmm7 = xmm9[0,1,2,3,7,5,6,7]
3105; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
3106; SSE-NEXT:    pshuflw {{.*#+}} xmm9 = xmm7[3,1,1,2,4,5,6,7]
3107; SSE-NEXT:    pand %xmm5, %xmm9
3108; SSE-NEXT:    por %xmm10, %xmm9
3109; SSE-NEXT:    packuswb %xmm9, %xmm9
3110; SSE-NEXT:    pand %xmm2, %xmm9
3111; SSE-NEXT:    por %xmm6, %xmm9
3112; SSE-NEXT:    movdqa %xmm11, %xmm6
3113; SSE-NEXT:    punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm4[8],xmm11[9],xmm4[9],xmm11[10],xmm4[10],xmm11[11],xmm4[11],xmm11[12],xmm4[12],xmm11[13],xmm4[13],xmm11[14],xmm4[14],xmm11[15],xmm4[15]
3114; SSE-NEXT:    pshufhw {{.*#+}} xmm7 = xmm11[0,1,2,3,7,5,6,7]
3115; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[0,1,2,0]
3116; SSE-NEXT:    pshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,5,5,7,4]
3117; SSE-NEXT:    movdqa {{.*#+}} xmm7 = [65535,65535,65535,65535,0,65535,0,0]
3118; SSE-NEXT:    movdqa %xmm7, %xmm11
3119; SSE-NEXT:    pandn %xmm10, %xmm11
3120; SSE-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
3121; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,3,1,1]
3122; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7]
3123; SSE-NEXT:    pand %xmm7, %xmm6
3124; SSE-NEXT:    por %xmm6, %xmm11
3125; SSE-NEXT:    packuswb %xmm11, %xmm10
3126; SSE-NEXT:    movdqa %xmm0, %xmm6
3127; SSE-NEXT:    pandn %xmm10, %xmm6
3128; SSE-NEXT:    pand %xmm0, %xmm9
3129; SSE-NEXT:    por %xmm9, %xmm6
3130; SSE-NEXT:    movdqa %xmm14, %xmm11
3131; SSE-NEXT:    movdqa %xmm14, %xmm9
3132; SSE-NEXT:    punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm4[8],xmm9[9],xmm4[9],xmm9[10],xmm4[10],xmm9[11],xmm4[11],xmm9[12],xmm4[12],xmm9[13],xmm4[13],xmm9[14],xmm4[14],xmm9[15],xmm4[15]
3133; SSE-NEXT:    punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
3134; SSE-NEXT:    movdqa %xmm11, %xmm10
3135; SSE-NEXT:    shufps {{.*#+}} xmm11 = xmm11[1,0],xmm9[0,0]
3136; SSE-NEXT:    shufps {{.*#+}} xmm11 = xmm11[2,0],xmm9[2,3]
3137; SSE-NEXT:    psrlq $48, %xmm9
3138; SSE-NEXT:    psrldq {{.*#+}} xmm10 = xmm10[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3139; SSE-NEXT:    punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
3140; SSE-NEXT:    pshuflw {{.*#+}} xmm9 = xmm11[3,1,2,3,4,5,6,7]
3141; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,1,0,3]
3142; SSE-NEXT:    pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,5,7]
3143; SSE-NEXT:    packuswb %xmm10, %xmm9
3144; SSE-NEXT:    movdqa %xmm1, %xmm10
3145; SSE-NEXT:    punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15]
3146; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[1,1,2,3]
3147; SSE-NEXT:    pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5]
3148; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
3149; SSE-NEXT:    pshufhw {{.*#+}} xmm11 = xmm1[0,1,2,3,7,5,6,7]
3150; SSE-NEXT:    pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3]
3151; SSE-NEXT:    pshuflw {{.*#+}} xmm11 = xmm11[3,1,1,2,4,5,6,7]
3152; SSE-NEXT:    pand %xmm5, %xmm11
3153; SSE-NEXT:    pandn %xmm10, %xmm5
3154; SSE-NEXT:    por %xmm11, %xmm5
3155; SSE-NEXT:    packuswb %xmm5, %xmm5
3156; SSE-NEXT:    pand %xmm2, %xmm5
3157; SSE-NEXT:    pandn %xmm9, %xmm2
3158; SSE-NEXT:    por %xmm2, %xmm5
3159; SSE-NEXT:    movdqa %xmm12, %xmm2
3160; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
3161; SSE-NEXT:    punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm4[8],xmm12[9],xmm4[9],xmm12[10],xmm4[10],xmm12[11],xmm4[11],xmm12[12],xmm4[12],xmm12[13],xmm4[13],xmm12[14],xmm4[14],xmm12[15],xmm4[15]
3162; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,3,1,1]
3163; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
3164; SSE-NEXT:    pand %xmm7, %xmm2
3165; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm12[0,1,2,3,7,5,6,7]
3166; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0]
3167; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,7,4]
3168; SSE-NEXT:    pandn %xmm4, %xmm7
3169; SSE-NEXT:    por %xmm2, %xmm7
3170; SSE-NEXT:    pand %xmm0, %xmm5
3171; SSE-NEXT:    packuswb %xmm7, %xmm2
3172; SSE-NEXT:    pandn %xmm2, %xmm0
3173; SSE-NEXT:    por %xmm5, %xmm0
3174; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3175; SSE-NEXT:    movaps %xmm2, 16(%rsi)
3176; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3177; SSE-NEXT:    movaps %xmm2, (%rsi)
3178; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3179; SSE-NEXT:    movaps %xmm1, 16(%rdx)
3180; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3181; SSE-NEXT:    movaps %xmm1, (%rdx)
3182; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3183; SSE-NEXT:    movaps %xmm1, 16(%rcx)
3184; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3185; SSE-NEXT:    movaps %xmm1, (%rcx)
3186; SSE-NEXT:    movdqa %xmm13, 16(%r8)
3187; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3188; SSE-NEXT:    movaps %xmm1, (%r8)
3189; SSE-NEXT:    movdqa %xmm3, 16(%r9)
3190; SSE-NEXT:    movdqa %xmm8, (%r9)
3191; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3192; SSE-NEXT:    movdqa %xmm0, 16(%rax)
3193; SSE-NEXT:    movdqa %xmm6, (%rax)
3194; SSE-NEXT:    addq $264, %rsp # imm = 0x108
3195; SSE-NEXT:    retq
3196;
3197; AVX-LABEL: load_i8_stride6_vf32:
3198; AVX:       # %bb.0:
3199; AVX-NEXT:    subq $120, %rsp
3200; AVX-NEXT:    vmovdqa (%rdi), %xmm9
3201; AVX-NEXT:    vmovdqa 16(%rdi), %xmm7
3202; AVX-NEXT:    vmovdqa 32(%rdi), %xmm6
3203; AVX-NEXT:    vmovdqa 48(%rdi), %xmm5
3204; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm5[2,8,14,u,u,u,u,u,u,u,u,u,u,u,u,u]
3205; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,u,u,0,6,12,u,u,u,u,u,u,u,u]
3206; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
3207; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm7[4,10,u,u,u,u,u,u,u,u,u,u,u]
3208; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm9[2,8,14],zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u]
3209; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm2
3210; AVX-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,0,0,0,0,255,255,255,255,255,255,u,u,u,u,u]
3211; AVX-NEXT:    vpblendvb %xmm0, %xmm1, %xmm2, %xmm1
3212; AVX-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3213; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm5[3,9,15,u,u,u,u,u,u,u,u,u,u,u,u,u]
3214; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,1,7,13,u,u,u,u,u,u,u,u]
3215; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
3216; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm7[5,11,u,u,u,u,u,u,u,u,u,u,u]
3217; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm9[3,9,15],zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u]
3218; AVX-NEXT:    vpor %xmm2, %xmm3, %xmm2
3219; AVX-NEXT:    vpblendvb %xmm0, %xmm1, %xmm2, %xmm0
3220; AVX-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3221; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm9[4,10],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u]
3222; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,xmm7[0,6,12,u,u,u,u,u,u,u,u,u,u,u]
3223; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm1
3224; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm5[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u]
3225; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14]
3226; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
3227; AVX-NEXT:    vmovq {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
3228; AVX-NEXT:    vpblendvb %xmm0, %xmm1, %xmm2, %xmm1
3229; AVX-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3230; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm9[5,11],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u]
3231; AVX-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3232; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm7[1,7,13,u,u,u,u,u,u,u,u,u,u,u]
3233; AVX-NEXT:    vpor %xmm1, %xmm2, %xmm1
3234; AVX-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3235; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,u,u,u,u,5,11,u,u,u,u,u,u]
3236; AVX-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3237; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,3,9,15]
3238; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
3239; AVX-NEXT:    vpblendvb %xmm0, %xmm1, %xmm2, %xmm0
3240; AVX-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3241; AVX-NEXT:    vmovq {{.*#+}} xmm8 = [128,128,128,2,8,14,0,0,0,0,0,0,0,0,0,0]
3242; AVX-NEXT:    vmovdqa 112(%rdi), %xmm0
3243; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3244; AVX-NEXT:    vpshufb %xmm8, %xmm0, %xmm1
3245; AVX-NEXT:    vmovq {{.*#+}} xmm2 = [0,6,12,128,128,128,0,0,0,0,0,0,0,0,0,0]
3246; AVX-NEXT:    vmovdqa 96(%rdi), %xmm13
3247; AVX-NEXT:    vpshufb %xmm2, %xmm13, %xmm3
3248; AVX-NEXT:    vpor %xmm1, %xmm3, %xmm1
3249; AVX-NEXT:    vmovddup {{.*#+}} xmm11 = [0,0,0,128,128,128,4,10,0,0,0,128,128,128,4,10]
3250; AVX-NEXT:    # xmm11 = mem[0,0]
3251; AVX-NEXT:    vmovdqa 80(%rdi), %xmm12
3252; AVX-NEXT:    vpshufb %xmm11, %xmm12, %xmm4
3253; AVX-NEXT:    vmovddup {{.*#+}} xmm3 = [0,0,0,2,8,14,128,128,0,0,0,2,8,14,128,128]
3254; AVX-NEXT:    # xmm3 = mem[0,0]
3255; AVX-NEXT:    vmovdqa 64(%rdi), %xmm14
3256; AVX-NEXT:    vpshufb %xmm3, %xmm14, %xmm10
3257; AVX-NEXT:    vpor %xmm4, %xmm10, %xmm4
3258; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm4, %ymm1
3259; AVX-NEXT:    vmovd {{.*#+}} xmm15 = [0,0,4,10,0,0,0,0,0,0,0,0,0,0,0,0]
3260; AVX-NEXT:    vpshufb %xmm15, %xmm6, %xmm4
3261; AVX-NEXT:    vbroadcastss {{.*#+}} xmm0 = [0,6,12,0,0,6,12,0,0,6,12,0,0,6,12,0]
3262; AVX-NEXT:    vpshufb %xmm0, %xmm5, %xmm10
3263; AVX-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[1],xmm4[1]
3264; AVX-NEXT:    vpshufb %xmm8, %xmm7, %xmm8
3265; AVX-NEXT:    vmovdqa %xmm7, %xmm10
3266; AVX-NEXT:    vpshufb %xmm2, %xmm9, %xmm2
3267; AVX-NEXT:    vpor %xmm2, %xmm8, %xmm2
3268; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4,5],xmm2[6,7]
3269; AVX-NEXT:    vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255]
3270; AVX-NEXT:    vandnps %ymm1, %ymm4, %ymm1
3271; AVX-NEXT:    vandps %ymm4, %ymm2, %ymm2
3272; AVX-NEXT:    vorps %ymm1, %ymm2, %ymm8
3273; AVX-NEXT:    vmovdqa 128(%rdi), %xmm6
3274; AVX-NEXT:    vpshufb %xmm15, %xmm6, %xmm1
3275; AVX-NEXT:    vmovdqa 144(%rdi), %xmm5
3276; AVX-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
3277; AVX-NEXT:    vpunpckldq {{.*#+}} xmm15 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3278; AVX-NEXT:    vmovdqa 176(%rdi), %xmm4
3279; AVX-NEXT:    vpshufb %xmm11, %xmm4, %xmm11
3280; AVX-NEXT:    vmovdqa 160(%rdi), %xmm2
3281; AVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm3
3282; AVX-NEXT:    vpor %xmm3, %xmm11, %xmm11
3283; AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255]
3284; AVX-NEXT:    vpblendvb %xmm3, %xmm15, %xmm11, %xmm15
3285; AVX-NEXT:    vmovaps {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0]
3286; AVX-NEXT:    vandps %ymm11, %ymm8, %ymm8
3287; AVX-NEXT:    vinsertf128 $1, %xmm15, %ymm0, %ymm15
3288; AVX-NEXT:    vandnps %ymm15, %ymm11, %ymm15
3289; AVX-NEXT:    vorps %ymm15, %ymm8, %ymm0
3290; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3291; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
3292; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm9[3,9,15,u,u,u,u,u,u,u,u,u,u]
3293; AVX-NEXT:    vpshufb {{.*#+}} xmm15 = xmm13[1,7,13],zero,zero,zero,xmm13[u,u,u,u,u,u,u,u,u,u]
3294; AVX-NEXT:    vpor %xmm0, %xmm15, %xmm1
3295; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm12[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[5,11]
3296; AVX-NEXT:    vmovdqa %xmm14, %xmm7
3297; AVX-NEXT:    vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero
3298; AVX-NEXT:    vpor %xmm0, %xmm15, %xmm0
3299; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
3300; AVX-NEXT:    vmovd {{.*#+}} xmm8 = [0,0,5,11,0,0,0,0,0,0,0,0,0,0,0,0]
3301; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3302; AVX-NEXT:    vpshufb %xmm8, %xmm1, %xmm15
3303; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1,7,13,0,1,7,13,0,1,7,13,0,1,7,13,0]
3304; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3305; AVX-NEXT:    vpshufb %xmm1, %xmm14, %xmm14
3306; AVX-NEXT:    vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1]
3307; AVX-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm10[3,9,15,u,u,u,u,u,u,u,u,u,u]
3308; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
3309; AVX-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[1,7,13],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
3310; AVX-NEXT:    vpor %xmm15, %xmm10, %xmm10
3311; AVX-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm14[3,4,5],xmm10[6,7]
3312; AVX-NEXT:    vmovaps {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255]
3313; AVX-NEXT:    vandnps %ymm0, %ymm14, %ymm0
3314; AVX-NEXT:    vandps %ymm14, %ymm10, %ymm10
3315; AVX-NEXT:    vorps %ymm0, %ymm10, %ymm0
3316; AVX-NEXT:    vpshufb %xmm8, %xmm6, %xmm8
3317; AVX-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
3318; AVX-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1]
3319; AVX-NEXT:    vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[5,11]
3320; AVX-NEXT:    vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero
3321; AVX-NEXT:    vpor %xmm8, %xmm10, %xmm8
3322; AVX-NEXT:    vpblendvb %xmm3, %xmm1, %xmm8, %xmm1
3323; AVX-NEXT:    vandps %ymm0, %ymm11, %ymm0
3324; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
3325; AVX-NEXT:    vandnps %ymm1, %ymm11, %ymm1
3326; AVX-NEXT:    vorps %ymm1, %ymm0, %ymm0
3327; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3328; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm9[4,10,u,u,u,u,u,u,u,u,u,u,u]
3329; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm13[2,8,14],zero,zero,xmm13[u,u,u,u,u,u,u,u,u,u,u]
3330; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
3331; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = [0,0,0,4,10,128,128,128,0,0,0,4,10,128,128,128]
3332; AVX-NEXT:    # xmm1 = mem[0,0]
3333; AVX-NEXT:    vpshufb %xmm1, %xmm7, %xmm10
3334; AVX-NEXT:    vmovddup {{.*#+}} xmm11 = [0,0,0,128,128,0,6,12,0,0,0,128,128,0,6,12]
3335; AVX-NEXT:    # xmm11 = mem[0,0]
3336; AVX-NEXT:    vpshufb %xmm11, %xmm12, %xmm14
3337; AVX-NEXT:    vpor %xmm10, %xmm14, %xmm10
3338; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm10, %ymm0
3339; AVX-NEXT:    vandnps %ymm0, %ymm3, %ymm0
3340; AVX-NEXT:    vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload
3341; AVX-NEXT:    vorps %ymm0, %ymm10, %ymm0
3342; AVX-NEXT:    vpshufb {{.*#+}} xmm10 = xmm5[2,8,14,u,u,u,u,u,u,u,u,u,u,u,u,u]
3343; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = xmm6[u,u,u,u,u,0,6,12,u,u,u,u,u,u,u,u]
3344; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm10 = xmm14[0],xmm10[0]
3345; AVX-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
3346; AVX-NEXT:    vpshufb %xmm11, %xmm4, %xmm11
3347; AVX-NEXT:    vpor %xmm1, %xmm11, %xmm1
3348; AVX-NEXT:    vpblendvb %xmm3, %xmm10, %xmm1, %xmm1
3349; AVX-NEXT:    vmovaps {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
3350; AVX-NEXT:    vandps %ymm0, %ymm10, %ymm0
3351; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
3352; AVX-NEXT:    vandnps %ymm1, %ymm10, %ymm1
3353; AVX-NEXT:    vorps %ymm1, %ymm0, %ymm11
3354; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm9[5,11,u,u,u,u,u,u,u,u,u,u,u]
3355; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm13[3,9,15],zero,zero,xmm13[u,u,u,u,u,u,u,u,u,u,u]
3356; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
3357; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = [0,0,0,5,11,128,128,128,0,0,0,5,11,128,128,128]
3358; AVX-NEXT:    # xmm1 = mem[0,0]
3359; AVX-NEXT:    vpshufb %xmm1, %xmm7, %xmm14
3360; AVX-NEXT:    vmovddup {{.*#+}} xmm15 = [0,0,0,128,128,1,7,13,0,0,0,128,128,1,7,13]
3361; AVX-NEXT:    # xmm15 = mem[0,0]
3362; AVX-NEXT:    vpshufb %xmm15, %xmm12, %xmm8
3363; AVX-NEXT:    vpor %xmm14, %xmm8, %xmm8
3364; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm8, %ymm0
3365; AVX-NEXT:    vpshufb {{.*#+}} xmm8 = xmm5[3,9,15,u,u,u,u,u,u,u,u,u,u,u,u,u]
3366; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = xmm6[u,u,u,u,u,1,7,13,u,u,u,u,u,u,u,u]
3367; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm8 = xmm14[0],xmm8[0]
3368; AVX-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
3369; AVX-NEXT:    vpshufb %xmm15, %xmm4, %xmm14
3370; AVX-NEXT:    vpor %xmm1, %xmm14, %xmm1
3371; AVX-NEXT:    vpblendvb %xmm3, %xmm8, %xmm1, %xmm1
3372; AVX-NEXT:    vandnps %ymm0, %ymm3, %ymm0
3373; AVX-NEXT:    vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
3374; AVX-NEXT:    vorps %ymm0, %ymm3, %ymm0
3375; AVX-NEXT:    vandps %ymm0, %ymm10, %ymm0
3376; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
3377; AVX-NEXT:    vandnps %ymm1, %ymm10, %ymm1
3378; AVX-NEXT:    vorps %ymm1, %ymm0, %ymm3
3379; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm13[4,10],zero,zero,zero,xmm13[u,u,u,u,u,u,u,u,u,u,u]
3380; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,xmm9[0,6,12,u,u,u,u,u,u,u,u,u,u,u]
3381; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
3382; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = [0,0,128,128,128,2,8,14,0,0,128,128,128,2,8,14]
3383; AVX-NEXT:    # xmm1 = mem[0,0]
3384; AVX-NEXT:    vpshufb %xmm1, %xmm12, %xmm8
3385; AVX-NEXT:    vmovddup {{.*#+}} xmm14 = [0,0,0,6,12,128,128,128,0,0,0,6,12,128,128,128]
3386; AVX-NEXT:    # xmm14 = mem[0,0]
3387; AVX-NEXT:    vpshufb %xmm14, %xmm7, %xmm15
3388; AVX-NEXT:    vpor %xmm8, %xmm15, %xmm8
3389; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm8, %ymm0
3390; AVX-NEXT:    vmovaps {{.*#+}} ymm8 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
3391; AVX-NEXT:    vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm15 # 32-byte Folded Reload
3392; AVX-NEXT:    vandps %ymm0, %ymm8, %ymm0
3393; AVX-NEXT:    vorps %ymm0, %ymm15, %ymm0
3394; AVX-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
3395; AVX-NEXT:    vpshufb %xmm14, %xmm2, %xmm14
3396; AVX-NEXT:    vpor %xmm1, %xmm14, %xmm1
3397; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u]
3398; AVX-NEXT:    vpshufb {{.*#+}} xmm15 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14]
3399; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm14 = xmm15[1],xmm14[1]
3400; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm14[0,1,2,3,4],xmm1[5,6,7]
3401; AVX-NEXT:    vandps %ymm0, %ymm10, %ymm0
3402; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
3403; AVX-NEXT:    vandnps %ymm1, %ymm10, %ymm1
3404; AVX-NEXT:    vorps %ymm1, %ymm0, %ymm0
3405; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm13[5,11],zero,zero,zero,xmm13[u,u,u,u,u,u,u,u,u,u,u]
3406; AVX-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[1,7,13,u,u,u,u,u,u,u,u,u,u,u]
3407; AVX-NEXT:    vpor %xmm1, %xmm9, %xmm1
3408; AVX-NEXT:    vmovddup {{.*#+}} xmm9 = [0,0,128,128,128,3,9,15,0,0,128,128,128,3,9,15]
3409; AVX-NEXT:    # xmm9 = mem[0,0]
3410; AVX-NEXT:    vpshufb %xmm9, %xmm12, %xmm12
3411; AVX-NEXT:    vmovddup {{.*#+}} xmm13 = [0,0,1,7,13,128,128,128,0,0,1,7,13,128,128,128]
3412; AVX-NEXT:    # xmm13 = mem[0,0]
3413; AVX-NEXT:    vpshufb %xmm13, %xmm7, %xmm7
3414; AVX-NEXT:    vpor %xmm7, %xmm12, %xmm7
3415; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm7, %ymm1
3416; AVX-NEXT:    vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm7 # 32-byte Folded Reload
3417; AVX-NEXT:    vandps %ymm1, %ymm8, %ymm1
3418; AVX-NEXT:    vorps %ymm7, %ymm1, %ymm1
3419; AVX-NEXT:    vpshufb %xmm9, %xmm4, %xmm4
3420; AVX-NEXT:    vpshufb %xmm13, %xmm2, %xmm2
3421; AVX-NEXT:    vpor %xmm4, %xmm2, %xmm2
3422; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,u,u,5,11,u,u,u,u,u,u]
3423; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,3,9,15]
3424; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm5[1],xmm4[1]
3425; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3,4],xmm2[5,6,7]
3426; AVX-NEXT:    vandps %ymm1, %ymm10, %ymm1
3427; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
3428; AVX-NEXT:    vandnps %ymm2, %ymm10, %ymm2
3429; AVX-NEXT:    vorps %ymm2, %ymm1, %ymm1
3430; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3431; AVX-NEXT:    vmovaps %ymm2, (%rsi)
3432; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3433; AVX-NEXT:    vmovaps %ymm2, (%rdx)
3434; AVX-NEXT:    vmovaps %ymm11, (%rcx)
3435; AVX-NEXT:    vmovaps %ymm3, (%r8)
3436; AVX-NEXT:    vmovaps %ymm0, (%r9)
3437; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3438; AVX-NEXT:    vmovaps %ymm1, (%rax)
3439; AVX-NEXT:    addq $120, %rsp
3440; AVX-NEXT:    vzeroupper
3441; AVX-NEXT:    retq
3442;
3443; AVX2-LABEL: load_i8_stride6_vf32:
3444; AVX2:       # %bb.0:
3445; AVX2-NEXT:    vmovdqa 160(%rdi), %ymm4
3446; AVX2-NEXT:    vmovdqa (%rdi), %ymm2
3447; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm3
3448; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm0
3449; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm1
3450; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
3451; AVX2-NEXT:    vpblendvb %ymm8, %ymm2, %ymm3, %ymm9
3452; AVX2-NEXT:    vpshufb {{.*#+}} xmm5 = xmm9[0,6,12],zero,zero,zero,xmm9[4,10],zero,zero,zero,xmm9[u,u,u,u,u]
3453; AVX2-NEXT:    vextracti128 $1, %ymm9, %xmm10
3454; AVX2-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm10[2,8,14],zero,zero,xmm10[0,6,12,u,u,u,u,u]
3455; AVX2-NEXT:    vpor %xmm5, %xmm6, %xmm11
3456; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,0,65535,0,0,65535,65535,0,0,65535,0,0,65535,0]
3457; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm0[0,1],ymm1[0,1]
3458; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm1[2,3]
3459; AVX2-NEXT:    vpblendvb %ymm7, %ymm5, %ymm6, %ymm1
3460; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u]
3461; AVX2-NEXT:    vpmovsxdq {{.*#+}} xmm7 = [18446744073709551615,16777215]
3462; AVX2-NEXT:    vpblendvb %ymm7, %ymm11, %ymm0, %ymm0
3463; AVX2-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[1,7,13],zero,zero,zero,xmm9[5,11],zero,zero,zero,xmm9[u,u,u,u,u]
3464; AVX2-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[3,9,15],zero,zero,xmm10[1,7,13,u,u,u,u,u]
3465; AVX2-NEXT:    vpor %xmm9, %xmm10, %xmm9
3466; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u]
3467; AVX2-NEXT:    vpblendvb %ymm7, %ymm9, %ymm1, %ymm1
3468; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
3469; AVX2-NEXT:    vpblendvb %ymm11, %ymm3, %ymm2, %ymm9
3470; AVX2-NEXT:    vextracti128 $1, %ymm9, %xmm10
3471; AVX2-NEXT:    vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm10[4,10],zero,zero,zero,xmm10[2,8,14,u,u,u,u,u]
3472; AVX2-NEXT:    vpshufb {{.*#+}} xmm13 = xmm9[2,8,14],zero,zero,xmm9[0,6,12],zero,zero,zero,xmm9[u,u,u,u,u]
3473; AVX2-NEXT:    vpor %xmm12, %xmm13, %xmm12
3474; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm13 = [65535,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0]
3475; AVX2-NEXT:    vpblendvb %ymm13, %ymm6, %ymm5, %ymm13
3476; AVX2-NEXT:    vpshufb {{.*#+}} ymm14 = ymm13[u,u,u,u,u,u,u,u,u,u,u,4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u]
3477; AVX2-NEXT:    vpblendvb %ymm7, %ymm12, %ymm14, %ymm12
3478; AVX2-NEXT:    vmovdqa 128(%rdi), %ymm14
3479; AVX2-NEXT:    vpblendvb %ymm8, %ymm14, %ymm4, %ymm8
3480; AVX2-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[5,11],zero,zero,zero,xmm10[3,9,15,u,u,u,u,u]
3481; AVX2-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[3,9,15],zero,zero,xmm9[1,7,13],zero,zero,zero,xmm9[u,u,u,u,u]
3482; AVX2-NEXT:    vpor %xmm10, %xmm9, %xmm9
3483; AVX2-NEXT:    vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,0,6,12],zero,zero,zero,xmm8[4,10],zero,zero,zero
3484; AVX2-NEXT:    vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u]
3485; AVX2-NEXT:    vpblendvb %ymm7, %ymm9, %ymm13, %ymm13
3486; AVX2-NEXT:    vextracti128 $1, %ymm8, %xmm9
3487; AVX2-NEXT:    vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u],zero,zero,zero,xmm9[2,8,14],zero,zero,xmm9[0,6,12]
3488; AVX2-NEXT:    vpor %xmm7, %xmm10, %xmm7
3489; AVX2-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
3490; AVX2-NEXT:    vpmovsxwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10
3491; AVX2-NEXT:    vpblendvb %ymm10, %ymm12, %ymm7, %ymm7
3492; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0]
3493; AVX2-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,1,7,13],zero,zero,zero,xmm8[5,11],zero,zero,zero
3494; AVX2-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u],zero,zero,zero,xmm9[3,9,15],zero,zero,xmm9[1,7,13]
3495; AVX2-NEXT:    vpor %xmm8, %xmm9, %xmm8
3496; AVX2-NEXT:    vpblendvb %ymm12, %ymm4, %ymm14, %ymm9
3497; AVX2-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
3498; AVX2-NEXT:    vpblendvb %ymm10, %ymm13, %ymm8, %ymm8
3499; AVX2-NEXT:    vpblendvb %ymm11, %ymm4, %ymm14, %ymm4
3500; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm11
3501; AVX2-NEXT:    vpshufb {{.*#+}} xmm13 = xmm11[u,u,u,u,u],zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14]
3502; AVX2-NEXT:    vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u,2,8,14],zero,zero,xmm4[0,6,12],zero,zero,zero
3503; AVX2-NEXT:    vpor %xmm13, %xmm14, %xmm13
3504; AVX2-NEXT:    vpblendvb %ymm12, %ymm3, %ymm2, %ymm2
3505; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
3506; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,65535,65535,0,0,65535,0,0,65535,0]
3507; AVX2-NEXT:    vpblendvb %ymm12, %ymm6, %ymm5, %ymm5
3508; AVX2-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10,u,u,u,u,u,u]
3509; AVX2-NEXT:    vpshufb {{.*#+}} xmm12 = xmm2[4,10],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[u,u,u,u,u,u]
3510; AVX2-NEXT:    vpor %xmm6, %xmm12, %xmm6
3511; AVX2-NEXT:    vpshufb {{.*#+}} ymm12 = ymm5[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u]
3512; AVX2-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm12[5,6,7]
3513; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7]
3514; AVX2-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm12
3515; AVX2-NEXT:    vpblendvb %ymm10, %ymm6, %ymm12, %ymm6
3516; AVX2-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u],zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15]
3517; AVX2-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,3,9,15],zero,zero,xmm4[1,7,13],zero,zero,zero
3518; AVX2-NEXT:    vpor %xmm4, %xmm11, %xmm4
3519; AVX2-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11,u,u,u,u,u,u]
3520; AVX2-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[5,11],zero,zero,zero,xmm2[3,9,15],zero,zero,xmm2[u,u,u,u,u,u]
3521; AVX2-NEXT:    vpor %xmm3, %xmm2, %xmm2
3522; AVX2-NEXT:    vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u]
3523; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7]
3524; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
3525; AVX2-NEXT:    vextracti128 $1, %ymm9, %xmm3
3526; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
3527; AVX2-NEXT:    vpblendvb %ymm10, %ymm2, %ymm4, %ymm2
3528; AVX2-NEXT:    vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u],zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10]
3529; AVX2-NEXT:    vpshufb {{.*#+}} xmm5 = xmm9[u,u,u,u,u,u,4,10],zero,zero,zero,xmm9[2,8,14],zero,zero
3530; AVX2-NEXT:    vpor %xmm4, %xmm5, %xmm4
3531; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
3532; AVX2-NEXT:    vpblendw {{.*#+}} ymm4 = ymm0[0,1,2],ymm4[3,4,5,6,7],ymm0[8,9,10],ymm4[11,12,13,14,15]
3533; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
3534; AVX2-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11]
3535; AVX2-NEXT:    vpshufb {{.*#+}} xmm4 = xmm9[u,u,u,u,u,u,5,11],zero,zero,zero,xmm9[3,9,15],zero,zero
3536; AVX2-NEXT:    vpor %xmm3, %xmm4, %xmm3
3537; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
3538; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15]
3539; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
3540; AVX2-NEXT:    vmovdqa %ymm0, (%rsi)
3541; AVX2-NEXT:    vmovdqa %ymm1, (%rdx)
3542; AVX2-NEXT:    vmovdqa %ymm7, (%rcx)
3543; AVX2-NEXT:    vmovdqa %ymm8, (%r8)
3544; AVX2-NEXT:    vmovdqa %ymm6, (%r9)
3545; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3546; AVX2-NEXT:    vmovdqa %ymm2, (%rax)
3547; AVX2-NEXT:    vzeroupper
3548; AVX2-NEXT:    retq
3549;
3550; AVX2-FP-LABEL: load_i8_stride6_vf32:
3551; AVX2-FP:       # %bb.0:
3552; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %ymm4
3553; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm2
3554; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm3
3555; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %ymm0
3556; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm1
3557; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
3558; AVX2-FP-NEXT:    vpblendvb %ymm8, %ymm2, %ymm3, %ymm9
3559; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm9[0,6,12],zero,zero,zero,xmm9[4,10],zero,zero,zero,xmm9[u,u,u,u,u]
3560; AVX2-FP-NEXT:    vextracti128 $1, %ymm9, %xmm10
3561; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm10[2,8,14],zero,zero,xmm10[0,6,12,u,u,u,u,u]
3562; AVX2-FP-NEXT:    vpor %xmm5, %xmm6, %xmm11
3563; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,0,65535,0,0,65535,65535,0,0,65535,0,0,65535,0]
3564; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm0[0,1],ymm1[0,1]
3565; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm1[2,3]
3566; AVX2-FP-NEXT:    vpblendvb %ymm7, %ymm5, %ymm6, %ymm1
3567; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u]
3568; AVX2-FP-NEXT:    vpmovsxdq {{.*#+}} xmm7 = [18446744073709551615,16777215]
3569; AVX2-FP-NEXT:    vpblendvb %ymm7, %ymm11, %ymm0, %ymm0
3570; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[1,7,13],zero,zero,zero,xmm9[5,11],zero,zero,zero,xmm9[u,u,u,u,u]
3571; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[3,9,15],zero,zero,xmm10[1,7,13,u,u,u,u,u]
3572; AVX2-FP-NEXT:    vpor %xmm9, %xmm10, %xmm9
3573; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u]
3574; AVX2-FP-NEXT:    vpblendvb %ymm7, %ymm9, %ymm1, %ymm1
3575; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
3576; AVX2-FP-NEXT:    vpblendvb %ymm11, %ymm3, %ymm2, %ymm9
3577; AVX2-FP-NEXT:    vextracti128 $1, %ymm9, %xmm10
3578; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm10[4,10],zero,zero,zero,xmm10[2,8,14,u,u,u,u,u]
3579; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm9[2,8,14],zero,zero,xmm9[0,6,12],zero,zero,zero,xmm9[u,u,u,u,u]
3580; AVX2-FP-NEXT:    vpor %xmm12, %xmm13, %xmm12
3581; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm13 = [65535,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0]
3582; AVX2-FP-NEXT:    vpblendvb %ymm13, %ymm6, %ymm5, %ymm13
3583; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm14 = ymm13[u,u,u,u,u,u,u,u,u,u,u,4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u]
3584; AVX2-FP-NEXT:    vpblendvb %ymm7, %ymm12, %ymm14, %ymm12
3585; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %ymm14
3586; AVX2-FP-NEXT:    vpblendvb %ymm8, %ymm14, %ymm4, %ymm8
3587; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[5,11],zero,zero,zero,xmm10[3,9,15,u,u,u,u,u]
3588; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[3,9,15],zero,zero,xmm9[1,7,13],zero,zero,zero,xmm9[u,u,u,u,u]
3589; AVX2-FP-NEXT:    vpor %xmm10, %xmm9, %xmm9
3590; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,0,6,12],zero,zero,zero,xmm8[4,10],zero,zero,zero
3591; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u]
3592; AVX2-FP-NEXT:    vpblendvb %ymm7, %ymm9, %ymm13, %ymm13
3593; AVX2-FP-NEXT:    vextracti128 $1, %ymm8, %xmm9
3594; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u],zero,zero,zero,xmm9[2,8,14],zero,zero,xmm9[0,6,12]
3595; AVX2-FP-NEXT:    vpor %xmm7, %xmm10, %xmm7
3596; AVX2-FP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
3597; AVX2-FP-NEXT:    vpmovsxwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10
3598; AVX2-FP-NEXT:    vpblendvb %ymm10, %ymm12, %ymm7, %ymm7
3599; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0]
3600; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,1,7,13],zero,zero,zero,xmm8[5,11],zero,zero,zero
3601; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u],zero,zero,zero,xmm9[3,9,15],zero,zero,xmm9[1,7,13]
3602; AVX2-FP-NEXT:    vpor %xmm8, %xmm9, %xmm8
3603; AVX2-FP-NEXT:    vpblendvb %ymm12, %ymm4, %ymm14, %ymm9
3604; AVX2-FP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
3605; AVX2-FP-NEXT:    vpblendvb %ymm10, %ymm13, %ymm8, %ymm8
3606; AVX2-FP-NEXT:    vpblendvb %ymm11, %ymm4, %ymm14, %ymm4
3607; AVX2-FP-NEXT:    vextracti128 $1, %ymm4, %xmm11
3608; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm11[u,u,u,u,u],zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14]
3609; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u,2,8,14],zero,zero,xmm4[0,6,12],zero,zero,zero
3610; AVX2-FP-NEXT:    vpor %xmm13, %xmm14, %xmm13
3611; AVX2-FP-NEXT:    vpblendvb %ymm12, %ymm3, %ymm2, %ymm2
3612; AVX2-FP-NEXT:    vextracti128 $1, %ymm2, %xmm3
3613; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,65535,65535,0,0,65535,0,0,65535,0]
3614; AVX2-FP-NEXT:    vpblendvb %ymm12, %ymm6, %ymm5, %ymm5
3615; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10,u,u,u,u,u,u]
3616; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm2[4,10],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[u,u,u,u,u,u]
3617; AVX2-FP-NEXT:    vpor %xmm6, %xmm12, %xmm6
3618; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm5[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u]
3619; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm12[5,6,7]
3620; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7]
3621; AVX2-FP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm12
3622; AVX2-FP-NEXT:    vpblendvb %ymm10, %ymm6, %ymm12, %ymm6
3623; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u],zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15]
3624; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,3,9,15],zero,zero,xmm4[1,7,13],zero,zero,zero
3625; AVX2-FP-NEXT:    vpor %xmm4, %xmm11, %xmm4
3626; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11,u,u,u,u,u,u]
3627; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[5,11],zero,zero,zero,xmm2[3,9,15],zero,zero,xmm2[u,u,u,u,u,u]
3628; AVX2-FP-NEXT:    vpor %xmm3, %xmm2, %xmm2
3629; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u]
3630; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7]
3631; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
3632; AVX2-FP-NEXT:    vextracti128 $1, %ymm9, %xmm3
3633; AVX2-FP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
3634; AVX2-FP-NEXT:    vpblendvb %ymm10, %ymm2, %ymm4, %ymm2
3635; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u],zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10]
3636; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm9[u,u,u,u,u,u,4,10],zero,zero,zero,xmm9[2,8,14],zero,zero
3637; AVX2-FP-NEXT:    vpor %xmm4, %xmm5, %xmm4
3638; AVX2-FP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
3639; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm0[0,1,2],ymm4[3,4,5,6,7],ymm0[8,9,10],ymm4[11,12,13,14,15]
3640; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
3641; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11]
3642; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm9[u,u,u,u,u,u,5,11],zero,zero,zero,xmm9[3,9,15],zero,zero
3643; AVX2-FP-NEXT:    vpor %xmm3, %xmm4, %xmm3
3644; AVX2-FP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
3645; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15]
3646; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
3647; AVX2-FP-NEXT:    vmovdqa %ymm0, (%rsi)
3648; AVX2-FP-NEXT:    vmovdqa %ymm1, (%rdx)
3649; AVX2-FP-NEXT:    vmovdqa %ymm7, (%rcx)
3650; AVX2-FP-NEXT:    vmovdqa %ymm8, (%r8)
3651; AVX2-FP-NEXT:    vmovdqa %ymm6, (%r9)
3652; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3653; AVX2-FP-NEXT:    vmovdqa %ymm2, (%rax)
3654; AVX2-FP-NEXT:    vzeroupper
3655; AVX2-FP-NEXT:    retq
3656;
3657; AVX2-FCP-LABEL: load_i8_stride6_vf32:
3658; AVX2-FCP:       # %bb.0:
3659; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %ymm4
3660; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm2
3661; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm3
3662; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %ymm0
3663; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm1
3664; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
3665; AVX2-FCP-NEXT:    vpblendvb %ymm8, %ymm2, %ymm3, %ymm9
3666; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm9[0,6,12],zero,zero,zero,xmm9[4,10],zero,zero,zero,xmm9[u,u,u,u,u]
3667; AVX2-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm10
3668; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm10[2,8,14],zero,zero,xmm10[0,6,12,u,u,u,u,u]
3669; AVX2-FCP-NEXT:    vpor %xmm5, %xmm6, %xmm11
3670; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,0,65535,0,0,65535,65535,0,0,65535,0,0,65535,0]
3671; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm0[0,1],ymm1[0,1]
3672; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm1[2,3]
3673; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm5, %ymm6, %ymm1
3674; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u]
3675; AVX2-FCP-NEXT:    vpmovsxdq {{.*#+}} xmm7 = [18446744073709551615,16777215]
3676; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm11, %ymm0, %ymm0
3677; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[1,7,13],zero,zero,zero,xmm9[5,11],zero,zero,zero,xmm9[u,u,u,u,u]
3678; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[3,9,15],zero,zero,xmm10[1,7,13,u,u,u,u,u]
3679; AVX2-FCP-NEXT:    vpor %xmm9, %xmm10, %xmm9
3680; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u]
3681; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm9, %ymm1, %ymm1
3682; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
3683; AVX2-FCP-NEXT:    vpblendvb %ymm11, %ymm3, %ymm2, %ymm9
3684; AVX2-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm10
3685; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm10[4,10],zero,zero,zero,xmm10[2,8,14,u,u,u,u,u]
3686; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm9[2,8,14],zero,zero,xmm9[0,6,12],zero,zero,zero,xmm9[u,u,u,u,u]
3687; AVX2-FCP-NEXT:    vpor %xmm12, %xmm13, %xmm12
3688; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm13 = [65535,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0]
3689; AVX2-FCP-NEXT:    vpblendvb %ymm13, %ymm6, %ymm5, %ymm13
3690; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = ymm13[u,u,u,u,u,u,u,u,u,u,u,4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u]
3691; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm12, %ymm14, %ymm12
3692; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %ymm14
3693; AVX2-FCP-NEXT:    vpblendvb %ymm8, %ymm14, %ymm4, %ymm8
3694; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[5,11],zero,zero,zero,xmm10[3,9,15,u,u,u,u,u]
3695; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[3,9,15],zero,zero,xmm9[1,7,13],zero,zero,zero,xmm9[u,u,u,u,u]
3696; AVX2-FCP-NEXT:    vpor %xmm10, %xmm9, %xmm9
3697; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,0,6,12],zero,zero,zero,xmm8[4,10],zero,zero,zero
3698; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u]
3699; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm9, %ymm13, %ymm13
3700; AVX2-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm9
3701; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u],zero,zero,zero,xmm9[2,8,14],zero,zero,xmm9[0,6,12]
3702; AVX2-FCP-NEXT:    vpor %xmm7, %xmm10, %xmm7
3703; AVX2-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
3704; AVX2-FCP-NEXT:    vpmovsxwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10
3705; AVX2-FCP-NEXT:    vpblendvb %ymm10, %ymm12, %ymm7, %ymm7
3706; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0]
3707; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,1,7,13],zero,zero,zero,xmm8[5,11],zero,zero,zero
3708; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u],zero,zero,zero,xmm9[3,9,15],zero,zero,xmm9[1,7,13]
3709; AVX2-FCP-NEXT:    vpor %xmm8, %xmm9, %xmm8
3710; AVX2-FCP-NEXT:    vpblendvb %ymm12, %ymm4, %ymm14, %ymm9
3711; AVX2-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
3712; AVX2-FCP-NEXT:    vpblendvb %ymm10, %ymm13, %ymm8, %ymm8
3713; AVX2-FCP-NEXT:    vpblendvb %ymm11, %ymm4, %ymm14, %ymm4
3714; AVX2-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm11
3715; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm11[u,u,u,u,u],zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14]
3716; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u,2,8,14],zero,zero,xmm4[0,6,12],zero,zero,zero
3717; AVX2-FCP-NEXT:    vpor %xmm13, %xmm14, %xmm13
3718; AVX2-FCP-NEXT:    vpblendvb %ymm12, %ymm3, %ymm2, %ymm2
3719; AVX2-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
3720; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,65535,65535,0,0,65535,0,0,65535,0]
3721; AVX2-FCP-NEXT:    vpblendvb %ymm12, %ymm6, %ymm5, %ymm5
3722; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10,u,u,u,u,u,u]
3723; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm2[4,10],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[u,u,u,u,u,u]
3724; AVX2-FCP-NEXT:    vpor %xmm6, %xmm12, %xmm6
3725; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm5[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u]
3726; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm12[5,6,7]
3727; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7]
3728; AVX2-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm12
3729; AVX2-FCP-NEXT:    vpblendvb %ymm10, %ymm6, %ymm12, %ymm6
3730; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u],zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15]
3731; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,3,9,15],zero,zero,xmm4[1,7,13],zero,zero,zero
3732; AVX2-FCP-NEXT:    vpor %xmm4, %xmm11, %xmm4
3733; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11,u,u,u,u,u,u]
3734; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[5,11],zero,zero,zero,xmm2[3,9,15],zero,zero,xmm2[u,u,u,u,u,u]
3735; AVX2-FCP-NEXT:    vpor %xmm3, %xmm2, %xmm2
3736; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u]
3737; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7]
3738; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
3739; AVX2-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm3
3740; AVX2-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
3741; AVX2-FCP-NEXT:    vpblendvb %ymm10, %ymm2, %ymm4, %ymm2
3742; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u],zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10]
3743; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm9[u,u,u,u,u,u,4,10],zero,zero,zero,xmm9[2,8,14],zero,zero
3744; AVX2-FCP-NEXT:    vpor %xmm4, %xmm5, %xmm4
3745; AVX2-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
3746; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm0[0,1,2],ymm4[3,4,5,6,7],ymm0[8,9,10],ymm4[11,12,13,14,15]
3747; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
3748; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11]
3749; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm9[u,u,u,u,u,u,5,11],zero,zero,zero,xmm9[3,9,15],zero,zero
3750; AVX2-FCP-NEXT:    vpor %xmm3, %xmm4, %xmm3
3751; AVX2-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
3752; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15]
3753; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
3754; AVX2-FCP-NEXT:    vmovdqa %ymm0, (%rsi)
3755; AVX2-FCP-NEXT:    vmovdqa %ymm1, (%rdx)
3756; AVX2-FCP-NEXT:    vmovdqa %ymm7, (%rcx)
3757; AVX2-FCP-NEXT:    vmovdqa %ymm8, (%r8)
3758; AVX2-FCP-NEXT:    vmovdqa %ymm6, (%r9)
3759; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3760; AVX2-FCP-NEXT:    vmovdqa %ymm2, (%rax)
3761; AVX2-FCP-NEXT:    vzeroupper
3762; AVX2-FCP-NEXT:    retq
3763;
3764; AVX512-LABEL: load_i8_stride6_vf32:
3765; AVX512:       # %bb.0:
3766; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3767; AVX512-NEXT:    vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
3768; AVX512-NEXT:    vmovdqa64 (%rdi), %ymm17
3769; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm3
3770; AVX512-NEXT:    vmovdqa 64(%rdi), %ymm1
3771; AVX512-NEXT:    vmovdqa 128(%rdi), %ymm6
3772; AVX512-NEXT:    vmovdqa %ymm0, %ymm7
3773; AVX512-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm3 ^ (ymm7 & (ymm17 ^ ymm3))
3774; AVX512-NEXT:    vpshufb {{.*#+}} xmm4 = xmm7[0,6,12],zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[u,u,u,u,u]
3775; AVX512-NEXT:    vextracti128 $1, %ymm7, %xmm8
3776; AVX512-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm8[2,8,14],zero,zero,xmm8[0,6,12,u,u,u,u,u]
3777; AVX512-NEXT:    vpor %xmm4, %xmm5, %xmm4
3778; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm1[2,3],mem[2,3]
3779; AVX512-NEXT:    vinserti128 $1, 96(%rdi), %ymm1, %ymm1
3780; AVX512-NEXT:    vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0]
3781; AVX512-NEXT:    vmovdqa %ymm9, %ymm10
3782; AVX512-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm5 ^ (ymm10 & (ymm1 ^ ymm5))
3783; AVX512-NEXT:    vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3784; AVX512-NEXT:    vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255]
3785; AVX512-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm4 & ymm16)
3786; AVX512-NEXT:    vmovdqa 160(%rdi), %ymm13
3787; AVX512-NEXT:    vmovdqa %ymm0, %ymm14
3788; AVX512-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm6 ^ (ymm14 & (ymm13 ^ ymm6))
3789; AVX512-NEXT:    vextracti128 $1, %ymm14, %xmm15
3790; AVX512-NEXT:    vpshufb {{.*#+}} xmm4 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[0,6,12],zero,zero,zero,xmm15[4,10]
3791; AVX512-NEXT:    vpshufb {{.*#+}} xmm12 = xmm14[u,u,u,u,u,u,4,10],zero,zero,zero,xmm14[2,8,14],zero,zero
3792; AVX512-NEXT:    vpor %xmm4, %xmm12, %xmm4
3793; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
3794; AVX512-NEXT:    vpblendw {{.*#+}} ymm4 = ymm11[0,1,2],ymm4[3,4,5,6,7],ymm11[8,9,10],ymm4[11,12,13,14,15]
3795; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm4[4,5,6,7]
3796; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm18
3797; AVX512-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[1,7,13],zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[u,u,u,u,u]
3798; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[3,9,15],zero,zero,xmm8[1,7,13,u,u,u,u,u]
3799; AVX512-NEXT:    vpor %xmm7, %xmm8, %xmm7
3800; AVX512-NEXT:    vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3801; AVX512-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm7 & ymm16)
3802; AVX512-NEXT:    vpshufb {{.*#+}} xmm7 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[1,7,13],zero,zero,zero,xmm15[5,11]
3803; AVX512-NEXT:    vpshufb {{.*#+}} xmm10 = xmm14[u,u,u,u,u,u,5,11],zero,zero,zero,xmm14[3,9,15],zero,zero
3804; AVX512-NEXT:    vpor %xmm7, %xmm10, %xmm7
3805; AVX512-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
3806; AVX512-NEXT:    vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7],ymm8[8,9,10],ymm7[11,12,13,14,15]
3807; AVX512-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
3808; AVX512-NEXT:    vmovdqa {{.*#+}} ymm8 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
3809; AVX512-NEXT:    vmovdqa %ymm8, %ymm10
3810; AVX512-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm17 ^ (ymm10 & (ymm3 ^ ymm17))
3811; AVX512-NEXT:    vextracti128 $1, %ymm10, %xmm11
3812; AVX512-NEXT:    vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14,u,u,u,u,u]
3813; AVX512-NEXT:    vpshufb {{.*#+}} xmm14 = xmm10[2,8,14],zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[u,u,u,u,u]
3814; AVX512-NEXT:    vpor %xmm12, %xmm14, %xmm12
3815; AVX512-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm1 ^ (ymm9 & (ymm5 ^ ymm1))
3816; AVX512-NEXT:    vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u]
3817; AVX512-NEXT:    vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255]
3818; AVX512-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm12 & ymm16)
3819; AVX512-NEXT:    vmovdqa %ymm0, %ymm12
3820; AVX512-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm13 ^ (ymm12 & (ymm6 ^ ymm13))
3821; AVX512-NEXT:    vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,0,6,12],zero,zero,zero,xmm12[4,10],zero,zero,zero
3822; AVX512-NEXT:    vextracti128 $1, %ymm12, %xmm2
3823; AVX512-NEXT:    vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[0,6,12]
3824; AVX512-NEXT:    vpor %xmm4, %xmm15, %xmm4
3825; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
3826; AVX512-NEXT:    vpmovsxwd {{.*#+}} ymm15 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0]
3827; AVX512-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm15 & (ymm4 ^ ymm14))
3828; AVX512-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15,u,u,u,u,u]
3829; AVX512-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[3,9,15],zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[u,u,u,u,u]
3830; AVX512-NEXT:    vpor %xmm11, %xmm10, %xmm10
3831; AVX512-NEXT:    vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u]
3832; AVX512-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm10 & ymm16)
3833; AVX512-NEXT:    vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,1,7,13],zero,zero,zero,xmm12[5,11],zero,zero,zero
3834; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[3,9,15],zero,zero,xmm2[1,7,13]
3835; AVX512-NEXT:    vpor %xmm2, %xmm10, %xmm2
3836; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
3837; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm15 & (ymm2 ^ ymm9))
3838; AVX512-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm6 ^ (ymm8 & (ymm13 ^ ymm6))
3839; AVX512-NEXT:    vextracti128 $1, %ymm8, %xmm6
3840; AVX512-NEXT:    vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14]
3841; AVX512-NEXT:    vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,2,8,14],zero,zero,xmm8[0,6,12],zero,zero,zero
3842; AVX512-NEXT:    vpor %xmm9, %xmm10, %xmm9
3843; AVX512-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
3844; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm17 ^ (ymm0 & (ymm3 ^ ymm17))
3845; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm3
3846; AVX512-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10,u,u,u,u,u,u]
3847; AVX512-NEXT:    vpshufb {{.*#+}} xmm11 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14],zero,zero,xmm0[u,u,u,u,u,u]
3848; AVX512-NEXT:    vpor %xmm10, %xmm11, %xmm10
3849; AVX512-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm5))
3850; AVX512-NEXT:    vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u]
3851; AVX512-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm5[5,6,7]
3852; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7]
3853; AVX512-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm9 ^ (ymm15 & (ymm5 ^ ymm9))
3854; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[5,11],zero,zero,zero,xmm6[3,9,15]
3855; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,3,9,15],zero,zero,xmm8[1,7,13],zero,zero,zero
3856; AVX512-NEXT:    vpor %xmm6, %xmm8, %xmm6
3857; AVX512-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
3858; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11,u,u,u,u,u,u]
3859; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15],zero,zero,xmm0[u,u,u,u,u,u]
3860; AVX512-NEXT:    vpor %xmm3, %xmm0, %xmm0
3861; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u]
3862; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
3863; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3864; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm6 ^ (ymm15 & (ymm0 ^ ymm6))
3865; AVX512-NEXT:    vmovdqa64 %ymm18, (%rsi)
3866; AVX512-NEXT:    vmovdqa %ymm7, (%rdx)
3867; AVX512-NEXT:    vmovdqa %ymm4, (%rcx)
3868; AVX512-NEXT:    vmovdqa %ymm2, (%r8)
3869; AVX512-NEXT:    vmovdqa %ymm5, (%r9)
3870; AVX512-NEXT:    vmovdqa %ymm0, (%rax)
3871; AVX512-NEXT:    vzeroupper
3872; AVX512-NEXT:    retq
3873;
3874; AVX512-FCP-LABEL: load_i8_stride6_vf32:
3875; AVX512-FCP:       # %bb.0:
3876; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3877; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
3878; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %ymm17
3879; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm3
3880; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm1
3881; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %ymm6
3882; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm7
3883; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm3 ^ (ymm7 & (ymm17 ^ ymm3))
3884; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm7[0,6,12],zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[u,u,u,u,u]
3885; AVX512-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm8
3886; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm8[2,8,14],zero,zero,xmm8[0,6,12,u,u,u,u,u]
3887; AVX512-FCP-NEXT:    vpor %xmm4, %xmm5, %xmm4
3888; AVX512-FCP-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm1[2,3],mem[2,3]
3889; AVX512-FCP-NEXT:    vinserti128 $1, 96(%rdi), %ymm1, %ymm1
3890; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0]
3891; AVX512-FCP-NEXT:    vmovdqa %ymm9, %ymm10
3892; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm5 ^ (ymm10 & (ymm1 ^ ymm5))
3893; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3894; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255]
3895; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm4 & ymm16)
3896; AVX512-FCP-NEXT:    vmovdqa 160(%rdi), %ymm13
3897; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm14
3898; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm6 ^ (ymm14 & (ymm13 ^ ymm6))
3899; AVX512-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm15
3900; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[0,6,12],zero,zero,zero,xmm15[4,10]
3901; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm14[u,u,u,u,u,u,4,10],zero,zero,zero,xmm14[2,8,14],zero,zero
3902; AVX512-FCP-NEXT:    vpor %xmm4, %xmm12, %xmm4
3903; AVX512-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
3904; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm11[0,1,2],ymm4[3,4,5,6,7],ymm11[8,9,10],ymm4[11,12,13,14,15]
3905; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm4[4,5,6,7]
3906; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm18
3907; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[1,7,13],zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[u,u,u,u,u]
3908; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[3,9,15],zero,zero,xmm8[1,7,13,u,u,u,u,u]
3909; AVX512-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm7
3910; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3911; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm7 & ymm16)
3912; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[1,7,13],zero,zero,zero,xmm15[5,11]
3913; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm14[u,u,u,u,u,u,5,11],zero,zero,zero,xmm14[3,9,15],zero,zero
3914; AVX512-FCP-NEXT:    vpor %xmm7, %xmm10, %xmm7
3915; AVX512-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
3916; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7],ymm8[8,9,10],ymm7[11,12,13,14,15]
3917; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
3918; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
3919; AVX512-FCP-NEXT:    vmovdqa %ymm8, %ymm10
3920; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm17 ^ (ymm10 & (ymm3 ^ ymm17))
3921; AVX512-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm11
3922; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14,u,u,u,u,u]
3923; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm10[2,8,14],zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[u,u,u,u,u]
3924; AVX512-FCP-NEXT:    vpor %xmm12, %xmm14, %xmm12
3925; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm1 ^ (ymm9 & (ymm5 ^ ymm1))
3926; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u]
3927; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255]
3928; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm12 & ymm16)
3929; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm12
3930; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm13 ^ (ymm12 & (ymm6 ^ ymm13))
3931; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,0,6,12],zero,zero,zero,xmm12[4,10],zero,zero,zero
3932; AVX512-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm2
3933; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[0,6,12]
3934; AVX512-FCP-NEXT:    vpor %xmm4, %xmm15, %xmm4
3935; AVX512-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
3936; AVX512-FCP-NEXT:    vpmovsxwd {{.*#+}} ymm15 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0]
3937; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm15 & (ymm4 ^ ymm14))
3938; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15,u,u,u,u,u]
3939; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[3,9,15],zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[u,u,u,u,u]
3940; AVX512-FCP-NEXT:    vpor %xmm11, %xmm10, %xmm10
3941; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u]
3942; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm10 & ymm16)
3943; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,1,7,13],zero,zero,zero,xmm12[5,11],zero,zero,zero
3944; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[3,9,15],zero,zero,xmm2[1,7,13]
3945; AVX512-FCP-NEXT:    vpor %xmm2, %xmm10, %xmm2
3946; AVX512-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
3947; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm15 & (ymm2 ^ ymm9))
3948; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm6 ^ (ymm8 & (ymm13 ^ ymm6))
3949; AVX512-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm6
3950; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14]
3951; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,2,8,14],zero,zero,xmm8[0,6,12],zero,zero,zero
3952; AVX512-FCP-NEXT:    vpor %xmm9, %xmm10, %xmm9
3953; AVX512-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
3954; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm17 ^ (ymm0 & (ymm3 ^ ymm17))
3955; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm3
3956; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10,u,u,u,u,u,u]
3957; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14],zero,zero,xmm0[u,u,u,u,u,u]
3958; AVX512-FCP-NEXT:    vpor %xmm10, %xmm11, %xmm10
3959; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm5))
3960; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u]
3961; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm5[5,6,7]
3962; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7]
3963; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm9 ^ (ymm15 & (ymm5 ^ ymm9))
3964; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[5,11],zero,zero,zero,xmm6[3,9,15]
3965; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,3,9,15],zero,zero,xmm8[1,7,13],zero,zero,zero
3966; AVX512-FCP-NEXT:    vpor %xmm6, %xmm8, %xmm6
3967; AVX512-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
3968; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11,u,u,u,u,u,u]
3969; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15],zero,zero,xmm0[u,u,u,u,u,u]
3970; AVX512-FCP-NEXT:    vpor %xmm3, %xmm0, %xmm0
3971; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u]
3972; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
3973; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3974; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm6 ^ (ymm15 & (ymm0 ^ ymm6))
3975; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, (%rsi)
3976; AVX512-FCP-NEXT:    vmovdqa %ymm7, (%rdx)
3977; AVX512-FCP-NEXT:    vmovdqa %ymm4, (%rcx)
3978; AVX512-FCP-NEXT:    vmovdqa %ymm2, (%r8)
3979; AVX512-FCP-NEXT:    vmovdqa %ymm5, (%r9)
3980; AVX512-FCP-NEXT:    vmovdqa %ymm0, (%rax)
3981; AVX512-FCP-NEXT:    vzeroupper
3982; AVX512-FCP-NEXT:    retq
3983;
3984; AVX512DQ-LABEL: load_i8_stride6_vf32:
3985; AVX512DQ:       # %bb.0:
3986; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3987; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
3988; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %ymm17
3989; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm3
3990; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %ymm1
3991; AVX512DQ-NEXT:    vmovdqa 128(%rdi), %ymm6
3992; AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm7
3993; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm3 ^ (ymm7 & (ymm17 ^ ymm3))
3994; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm4 = xmm7[0,6,12],zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[u,u,u,u,u]
3995; AVX512DQ-NEXT:    vextracti128 $1, %ymm7, %xmm8
3996; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm8[2,8,14],zero,zero,xmm8[0,6,12,u,u,u,u,u]
3997; AVX512DQ-NEXT:    vpor %xmm4, %xmm5, %xmm4
3998; AVX512DQ-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm1[2,3],mem[2,3]
3999; AVX512DQ-NEXT:    vinserti128 $1, 96(%rdi), %ymm1, %ymm1
4000; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0]
4001; AVX512DQ-NEXT:    vmovdqa %ymm9, %ymm10
4002; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm5 ^ (ymm10 & (ymm1 ^ ymm5))
4003; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4004; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255]
4005; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm4 & ymm16)
4006; AVX512DQ-NEXT:    vmovdqa 160(%rdi), %ymm13
4007; AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm14
4008; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm6 ^ (ymm14 & (ymm13 ^ ymm6))
4009; AVX512DQ-NEXT:    vextracti128 $1, %ymm14, %xmm15
4010; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm4 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[0,6,12],zero,zero,zero,xmm15[4,10]
4011; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm12 = xmm14[u,u,u,u,u,u,4,10],zero,zero,zero,xmm14[2,8,14],zero,zero
4012; AVX512DQ-NEXT:    vpor %xmm4, %xmm12, %xmm4
4013; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
4014; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm4 = ymm11[0,1,2],ymm4[3,4,5,6,7],ymm11[8,9,10],ymm4[11,12,13,14,15]
4015; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm4[4,5,6,7]
4016; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm18
4017; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[1,7,13],zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[u,u,u,u,u]
4018; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[3,9,15],zero,zero,xmm8[1,7,13,u,u,u,u,u]
4019; AVX512DQ-NEXT:    vpor %xmm7, %xmm8, %xmm7
4020; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4021; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm7 & ymm16)
4022; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[1,7,13],zero,zero,zero,xmm15[5,11]
4023; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm10 = xmm14[u,u,u,u,u,u,5,11],zero,zero,zero,xmm14[3,9,15],zero,zero
4024; AVX512DQ-NEXT:    vpor %xmm7, %xmm10, %xmm7
4025; AVX512DQ-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
4026; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7],ymm8[8,9,10],ymm7[11,12,13,14,15]
4027; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
4028; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm8 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
4029; AVX512DQ-NEXT:    vmovdqa %ymm8, %ymm10
4030; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm17 ^ (ymm10 & (ymm3 ^ ymm17))
4031; AVX512DQ-NEXT:    vextracti128 $1, %ymm10, %xmm11
4032; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14,u,u,u,u,u]
4033; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm14 = xmm10[2,8,14],zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[u,u,u,u,u]
4034; AVX512DQ-NEXT:    vpor %xmm12, %xmm14, %xmm12
4035; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm1 ^ (ymm9 & (ymm5 ^ ymm1))
4036; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u]
4037; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255]
4038; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm12 & ymm16)
4039; AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm12
4040; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm13 ^ (ymm12 & (ymm6 ^ ymm13))
4041; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,0,6,12],zero,zero,zero,xmm12[4,10],zero,zero,zero
4042; AVX512DQ-NEXT:    vextracti128 $1, %ymm12, %xmm2
4043; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[0,6,12]
4044; AVX512DQ-NEXT:    vpor %xmm4, %xmm15, %xmm4
4045; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
4046; AVX512DQ-NEXT:    vpmovsxwd {{.*#+}} ymm15 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0]
4047; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm15 & (ymm4 ^ ymm14))
4048; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15,u,u,u,u,u]
4049; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[3,9,15],zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[u,u,u,u,u]
4050; AVX512DQ-NEXT:    vpor %xmm11, %xmm10, %xmm10
4051; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u]
4052; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm10 & ymm16)
4053; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,1,7,13],zero,zero,zero,xmm12[5,11],zero,zero,zero
4054; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[3,9,15],zero,zero,xmm2[1,7,13]
4055; AVX512DQ-NEXT:    vpor %xmm2, %xmm10, %xmm2
4056; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
4057; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm15 & (ymm2 ^ ymm9))
4058; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm6 ^ (ymm8 & (ymm13 ^ ymm6))
4059; AVX512DQ-NEXT:    vextracti128 $1, %ymm8, %xmm6
4060; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14]
4061; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,2,8,14],zero,zero,xmm8[0,6,12],zero,zero,zero
4062; AVX512DQ-NEXT:    vpor %xmm9, %xmm10, %xmm9
4063; AVX512DQ-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
4064; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm17 ^ (ymm0 & (ymm3 ^ ymm17))
4065; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm3
4066; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10,u,u,u,u,u,u]
4067; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm11 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14],zero,zero,xmm0[u,u,u,u,u,u]
4068; AVX512DQ-NEXT:    vpor %xmm10, %xmm11, %xmm10
4069; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm5))
4070; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u]
4071; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm5[5,6,7]
4072; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7]
4073; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm9 ^ (ymm15 & (ymm5 ^ ymm9))
4074; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[5,11],zero,zero,zero,xmm6[3,9,15]
4075; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,3,9,15],zero,zero,xmm8[1,7,13],zero,zero,zero
4076; AVX512DQ-NEXT:    vpor %xmm6, %xmm8, %xmm6
4077; AVX512DQ-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
4078; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11,u,u,u,u,u,u]
4079; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15],zero,zero,xmm0[u,u,u,u,u,u]
4080; AVX512DQ-NEXT:    vpor %xmm3, %xmm0, %xmm0
4081; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u]
4082; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
4083; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4084; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm6 ^ (ymm15 & (ymm0 ^ ymm6))
4085; AVX512DQ-NEXT:    vmovdqa64 %ymm18, (%rsi)
4086; AVX512DQ-NEXT:    vmovdqa %ymm7, (%rdx)
4087; AVX512DQ-NEXT:    vmovdqa %ymm4, (%rcx)
4088; AVX512DQ-NEXT:    vmovdqa %ymm2, (%r8)
4089; AVX512DQ-NEXT:    vmovdqa %ymm5, (%r9)
4090; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rax)
4091; AVX512DQ-NEXT:    vzeroupper
4092; AVX512DQ-NEXT:    retq
4093;
4094; AVX512DQ-FCP-LABEL: load_i8_stride6_vf32:
4095; AVX512DQ-FCP:       # %bb.0:
4096; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
4097; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
4098; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %ymm17
4099; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm3
4100; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm1
4101; AVX512DQ-FCP-NEXT:    vmovdqa 128(%rdi), %ymm6
4102; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm7
4103; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm3 ^ (ymm7 & (ymm17 ^ ymm3))
4104; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm7[0,6,12],zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[u,u,u,u,u]
4105; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm8
4106; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm8[2,8,14],zero,zero,xmm8[0,6,12,u,u,u,u,u]
4107; AVX512DQ-FCP-NEXT:    vpor %xmm4, %xmm5, %xmm4
4108; AVX512DQ-FCP-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm1[2,3],mem[2,3]
4109; AVX512DQ-FCP-NEXT:    vinserti128 $1, 96(%rdi), %ymm1, %ymm1
4110; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0]
4111; AVX512DQ-FCP-NEXT:    vmovdqa %ymm9, %ymm10
4112; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm5 ^ (ymm10 & (ymm1 ^ ymm5))
4113; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4114; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255]
4115; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm4 & ymm16)
4116; AVX512DQ-FCP-NEXT:    vmovdqa 160(%rdi), %ymm13
4117; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm14
4118; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm6 ^ (ymm14 & (ymm13 ^ ymm6))
4119; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm15
4120; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[0,6,12],zero,zero,zero,xmm15[4,10]
4121; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm14[u,u,u,u,u,u,4,10],zero,zero,zero,xmm14[2,8,14],zero,zero
4122; AVX512DQ-FCP-NEXT:    vpor %xmm4, %xmm12, %xmm4
4123; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
4124; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm11[0,1,2],ymm4[3,4,5,6,7],ymm11[8,9,10],ymm4[11,12,13,14,15]
4125; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm4[4,5,6,7]
4126; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm18
4127; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[1,7,13],zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[u,u,u,u,u]
4128; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[3,9,15],zero,zero,xmm8[1,7,13,u,u,u,u,u]
4129; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm7
4130; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4131; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm7 & ymm16)
4132; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[1,7,13],zero,zero,zero,xmm15[5,11]
4133; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm14[u,u,u,u,u,u,5,11],zero,zero,zero,xmm14[3,9,15],zero,zero
4134; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm10, %xmm7
4135; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
4136; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7],ymm8[8,9,10],ymm7[11,12,13,14,15]
4137; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
4138; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
4139; AVX512DQ-FCP-NEXT:    vmovdqa %ymm8, %ymm10
4140; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm17 ^ (ymm10 & (ymm3 ^ ymm17))
4141; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm11
4142; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14,u,u,u,u,u]
4143; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm10[2,8,14],zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[u,u,u,u,u]
4144; AVX512DQ-FCP-NEXT:    vpor %xmm12, %xmm14, %xmm12
4145; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm1 ^ (ymm9 & (ymm5 ^ ymm1))
4146; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u]
4147; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255]
4148; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm12 & ymm16)
4149; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm12
4150; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm13 ^ (ymm12 & (ymm6 ^ ymm13))
4151; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,0,6,12],zero,zero,zero,xmm12[4,10],zero,zero,zero
4152; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm2
4153; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[0,6,12]
4154; AVX512DQ-FCP-NEXT:    vpor %xmm4, %xmm15, %xmm4
4155; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
4156; AVX512DQ-FCP-NEXT:    vpmovsxwd {{.*#+}} ymm15 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0]
4157; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm15 & (ymm4 ^ ymm14))
4158; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15,u,u,u,u,u]
4159; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[3,9,15],zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[u,u,u,u,u]
4160; AVX512DQ-FCP-NEXT:    vpor %xmm11, %xmm10, %xmm10
4161; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u]
4162; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm10 & ymm16)
4163; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,1,7,13],zero,zero,zero,xmm12[5,11],zero,zero,zero
4164; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[3,9,15],zero,zero,xmm2[1,7,13]
4165; AVX512DQ-FCP-NEXT:    vpor %xmm2, %xmm10, %xmm2
4166; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
4167; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm15 & (ymm2 ^ ymm9))
4168; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm6 ^ (ymm8 & (ymm13 ^ ymm6))
4169; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm6
4170; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14]
4171; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,2,8,14],zero,zero,xmm8[0,6,12],zero,zero,zero
4172; AVX512DQ-FCP-NEXT:    vpor %xmm9, %xmm10, %xmm9
4173; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
4174; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm17 ^ (ymm0 & (ymm3 ^ ymm17))
4175; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm3
4176; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10,u,u,u,u,u,u]
4177; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14],zero,zero,xmm0[u,u,u,u,u,u]
4178; AVX512DQ-FCP-NEXT:    vpor %xmm10, %xmm11, %xmm10
4179; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm5))
4180; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u]
4181; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm5[5,6,7]
4182; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7]
4183; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm9 ^ (ymm15 & (ymm5 ^ ymm9))
4184; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[5,11],zero,zero,zero,xmm6[3,9,15]
4185; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,3,9,15],zero,zero,xmm8[1,7,13],zero,zero,zero
4186; AVX512DQ-FCP-NEXT:    vpor %xmm6, %xmm8, %xmm6
4187; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
4188; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11,u,u,u,u,u,u]
4189; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15],zero,zero,xmm0[u,u,u,u,u,u]
4190; AVX512DQ-FCP-NEXT:    vpor %xmm3, %xmm0, %xmm0
4191; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u]
4192; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
4193; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4194; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm6 ^ (ymm15 & (ymm0 ^ ymm6))
4195; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, (%rsi)
4196; AVX512DQ-FCP-NEXT:    vmovdqa %ymm7, (%rdx)
4197; AVX512DQ-FCP-NEXT:    vmovdqa %ymm4, (%rcx)
4198; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, (%r8)
4199; AVX512DQ-FCP-NEXT:    vmovdqa %ymm5, (%r9)
4200; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, (%rax)
4201; AVX512DQ-FCP-NEXT:    vzeroupper
4202; AVX512DQ-FCP-NEXT:    retq
4203;
4204; AVX512BW-LABEL: load_i8_stride6_vf32:
4205; AVX512BW:       # %bb.0:
4206; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
4207; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm4
4208; AVX512BW-NEXT:    vmovdqa 32(%rdi), %ymm0
4209; AVX512BW-NEXT:    vmovdqa 64(%rdi), %ymm3
4210; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm2
4211; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],mem[2,3]
4212; AVX512BW-NEXT:    vinserti128 $1, 96(%rdi), %ymm3, %ymm8
4213; AVX512BW-NEXT:    movw $-28124, %r10w # imm = 0x9224
4214; AVX512BW-NEXT:    kmovd %r10d, %k2
4215; AVX512BW-NEXT:    vpblendmw %ymm1, %ymm8, %ymm6 {%k2}
4216; AVX512BW-NEXT:    movw $18724, %r10w # imm = 0x4924
4217; AVX512BW-NEXT:    kmovd %r10d, %k1
4218; AVX512BW-NEXT:    vpblendmw %ymm0, %ymm4, %ymm7 {%k1}
4219; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm7[0,6,12],zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[u,u,u,u,u]
4220; AVX512BW-NEXT:    vextracti128 $1, %ymm7, %xmm9
4221; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm9[2,8,14],zero,zero,xmm9[0,6,12,u,u,u,u,u]
4222; AVX512BW-NEXT:    vpor %xmm3, %xmm5, %xmm5
4223; AVX512BW-NEXT:    movl $4192256, %r10d # imm = 0x3FF800
4224; AVX512BW-NEXT:    kmovd %r10d, %k3
4225; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm5 {%k3} = ymm6[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u]
4226; AVX512BW-NEXT:    vmovdqa 160(%rdi), %ymm3
4227; AVX512BW-NEXT:    vpblendmw %ymm2, %ymm3, %ymm10 {%k1}
4228; AVX512BW-NEXT:    vextracti128 $1, %ymm10, %xmm11
4229; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,u,u,u],zero,zero,xmm11[0,6,12],zero,zero,zero,xmm11[4,10]
4230; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm10[u,u,u,u,u,u,4,10],zero,zero,zero,xmm10[2,8,14],zero,zero
4231; AVX512BW-NEXT:    vpor %xmm12, %xmm13, %xmm12
4232; AVX512BW-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
4233; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm12 = ymm5[0,1,2],ymm12[3,4,5,6,7],ymm5[8,9,10],ymm12[11,12,13,14,15]
4234; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5,6,7]
4235; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[1,7,13],zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[u,u,u,u,u]
4236; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[3,9,15],zero,zero,xmm9[1,7,13,u,u,u,u,u]
4237; AVX512BW-NEXT:    vpor %xmm7, %xmm9, %xmm7
4238; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm7 {%k3} = ymm6[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u]
4239; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm6 = xmm11[u,u,u,u,u,u],zero,zero,xmm11[1,7,13],zero,zero,zero,xmm11[5,11]
4240; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,u,u,u,5,11],zero,zero,zero,xmm10[3,9,15],zero,zero
4241; AVX512BW-NEXT:    vpor %xmm6, %xmm9, %xmm6
4242; AVX512BW-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
4243; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15]
4244; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
4245; AVX512BW-NEXT:    vpblendmw %ymm8, %ymm1, %ymm9 {%k2}
4246; AVX512BW-NEXT:    movw $9362, %di # imm = 0x2492
4247; AVX512BW-NEXT:    kmovd %edi, %k3
4248; AVX512BW-NEXT:    vpblendmw %ymm4, %ymm0, %ymm10 {%k3}
4249; AVX512BW-NEXT:    vextracti128 $1, %ymm10, %xmm11
4250; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14,u,u,u,u,u]
4251; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm12 = xmm10[2,8,14],zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[u,u,u,u,u]
4252; AVX512BW-NEXT:    vpor %xmm7, %xmm12, %xmm7
4253; AVX512BW-NEXT:    movl $2095104, %edi # imm = 0x1FF800
4254; AVX512BW-NEXT:    kmovd %edi, %k4
4255; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm7 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,u,u,4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u]
4256; AVX512BW-NEXT:    vpblendmw %ymm3, %ymm2, %ymm12 {%k1}
4257; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,u,u,0,6,12],zero,zero,zero,xmm12[4,10],zero,zero,zero
4258; AVX512BW-NEXT:    vextracti128 $1, %ymm12, %xmm14
4259; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u],zero,zero,zero,xmm14[2,8,14],zero,zero,xmm14[0,6,12]
4260; AVX512BW-NEXT:    vpor %xmm13, %xmm15, %xmm13
4261; AVX512BW-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
4262; AVX512BW-NEXT:    movl $-2097152, %edi # imm = 0xFFE00000
4263; AVX512BW-NEXT:    kmovd %edi, %k2
4264; AVX512BW-NEXT:    vmovdqu8 %ymm13, %ymm7 {%k2}
4265; AVX512BW-NEXT:    movw $9289, %di # imm = 0x2449
4266; AVX512BW-NEXT:    kmovd %edi, %k5
4267; AVX512BW-NEXT:    vmovdqu16 %ymm8, %ymm1 {%k5}
4268; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15,u,u,u,u,u]
4269; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[3,9,15],zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[u,u,u,u,u]
4270; AVX512BW-NEXT:    vpor %xmm8, %xmm10, %xmm8
4271; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm8 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,u,u,5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u]
4272; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm9 = xmm12[u,u,u,u,u,1,7,13],zero,zero,zero,xmm12[5,11],zero,zero,zero
4273; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm10 = xmm14[u,u,u,u,u],zero,zero,zero,xmm14[3,9,15],zero,zero,xmm14[1,7,13]
4274; AVX512BW-NEXT:    vpor %xmm9, %xmm10, %xmm9
4275; AVX512BW-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
4276; AVX512BW-NEXT:    vmovdqu8 %ymm9, %ymm8 {%k2}
4277; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u]
4278; AVX512BW-NEXT:    vmovdqu16 %ymm4, %ymm0 {%k1}
4279; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm4
4280; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[4,10,u,u,u,u,u,u]
4281; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm11 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14],zero,zero,xmm0[u,u,u,u,u,u]
4282; AVX512BW-NEXT:    vpor %xmm10, %xmm11, %xmm10
4283; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm9[5,6,7]
4284; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
4285; AVX512BW-NEXT:    vmovdqu16 %ymm2, %ymm3 {%k3}
4286; AVX512BW-NEXT:    vextracti128 $1, %ymm3, %xmm2
4287; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[4,10],zero,zero,zero,xmm2[2,8,14]
4288; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm11 = xmm3[u,u,u,u,u,2,8,14],zero,zero,xmm3[0,6,12],zero,zero,zero
4289; AVX512BW-NEXT:    vpor %xmm10, %xmm11, %xmm10
4290; AVX512BW-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
4291; AVX512BW-NEXT:    vmovdqu8 %ymm10, %ymm9 {%k2}
4292; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u]
4293; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,7,13],zero,zero,zero,xmm4[5,11,u,u,u,u,u,u]
4294; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15],zero,zero,xmm0[u,u,u,u,u,u]
4295; AVX512BW-NEXT:    vpor %xmm4, %xmm0, %xmm0
4296; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
4297; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4298; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[5,11],zero,zero,zero,xmm2[3,9,15]
4299; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,3,9,15],zero,zero,xmm3[1,7,13],zero,zero,zero
4300; AVX512BW-NEXT:    vpor %xmm1, %xmm2, %xmm1
4301; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
4302; AVX512BW-NEXT:    vmovdqu8 %ymm1, %ymm0 {%k2}
4303; AVX512BW-NEXT:    vmovdqa %ymm5, (%rsi)
4304; AVX512BW-NEXT:    vmovdqa %ymm6, (%rdx)
4305; AVX512BW-NEXT:    vmovdqa %ymm7, (%rcx)
4306; AVX512BW-NEXT:    vmovdqa %ymm8, (%r8)
4307; AVX512BW-NEXT:    vmovdqa %ymm9, (%r9)
4308; AVX512BW-NEXT:    vmovdqa %ymm0, (%rax)
4309; AVX512BW-NEXT:    vzeroupper
4310; AVX512BW-NEXT:    retq
4311;
4312; AVX512BW-FCP-LABEL: load_i8_stride6_vf32:
4313; AVX512BW-FCP:       # %bb.0:
4314; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
4315; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %ymm4
4316; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm0
4317; AVX512BW-FCP-NEXT:    vmovdqa 64(%rdi), %ymm3
4318; AVX512BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm2
4319; AVX512BW-FCP-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],mem[2,3]
4320; AVX512BW-FCP-NEXT:    vinserti128 $1, 96(%rdi), %ymm3, %ymm8
4321; AVX512BW-FCP-NEXT:    movw $-28124, %r10w # imm = 0x9224
4322; AVX512BW-FCP-NEXT:    kmovd %r10d, %k2
4323; AVX512BW-FCP-NEXT:    vpblendmw %ymm1, %ymm8, %ymm6 {%k2}
4324; AVX512BW-FCP-NEXT:    movw $18724, %r10w # imm = 0x4924
4325; AVX512BW-FCP-NEXT:    kmovd %r10d, %k1
4326; AVX512BW-FCP-NEXT:    vpblendmw %ymm0, %ymm4, %ymm7 {%k1}
4327; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm7[0,6,12],zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[u,u,u,u,u]
4328; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm9
4329; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm9[2,8,14],zero,zero,xmm9[0,6,12,u,u,u,u,u]
4330; AVX512BW-FCP-NEXT:    vpor %xmm3, %xmm5, %xmm5
4331; AVX512BW-FCP-NEXT:    movl $4192256, %r10d # imm = 0x3FF800
4332; AVX512BW-FCP-NEXT:    kmovd %r10d, %k3
4333; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm5 {%k3} = ymm6[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u]
4334; AVX512BW-FCP-NEXT:    vmovdqa 160(%rdi), %ymm3
4335; AVX512BW-FCP-NEXT:    vpblendmw %ymm2, %ymm3, %ymm10 {%k1}
4336; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm11
4337; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,u,u,u],zero,zero,xmm11[0,6,12],zero,zero,zero,xmm11[4,10]
4338; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm10[u,u,u,u,u,u,4,10],zero,zero,zero,xmm10[2,8,14],zero,zero
4339; AVX512BW-FCP-NEXT:    vpor %xmm12, %xmm13, %xmm12
4340; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
4341; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm5[0,1,2],ymm12[3,4,5,6,7],ymm5[8,9,10],ymm12[11,12,13,14,15]
4342; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5,6,7]
4343; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[1,7,13],zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[u,u,u,u,u]
4344; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[3,9,15],zero,zero,xmm9[1,7,13,u,u,u,u,u]
4345; AVX512BW-FCP-NEXT:    vpor %xmm7, %xmm9, %xmm7
4346; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm7 {%k3} = ymm6[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u]
4347; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm11[u,u,u,u,u,u],zero,zero,xmm11[1,7,13],zero,zero,zero,xmm11[5,11]
4348; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,u,u,u,5,11],zero,zero,zero,xmm10[3,9,15],zero,zero
4349; AVX512BW-FCP-NEXT:    vpor %xmm6, %xmm9, %xmm6
4350; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
4351; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15]
4352; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
4353; AVX512BW-FCP-NEXT:    vpblendmw %ymm8, %ymm1, %ymm9 {%k2}
4354; AVX512BW-FCP-NEXT:    movw $9362, %di # imm = 0x2492
4355; AVX512BW-FCP-NEXT:    kmovd %edi, %k3
4356; AVX512BW-FCP-NEXT:    vpblendmw %ymm4, %ymm0, %ymm10 {%k3}
4357; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm11
4358; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14,u,u,u,u,u]
4359; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm10[2,8,14],zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[u,u,u,u,u]
4360; AVX512BW-FCP-NEXT:    vpor %xmm7, %xmm12, %xmm7
4361; AVX512BW-FCP-NEXT:    movl $2095104, %edi # imm = 0x1FF800
4362; AVX512BW-FCP-NEXT:    kmovd %edi, %k4
4363; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm7 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,u,u,4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u]
4364; AVX512BW-FCP-NEXT:    vpblendmw %ymm3, %ymm2, %ymm12 {%k1}
4365; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,u,u,0,6,12],zero,zero,zero,xmm12[4,10],zero,zero,zero
4366; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm14
4367; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u],zero,zero,zero,xmm14[2,8,14],zero,zero,xmm14[0,6,12]
4368; AVX512BW-FCP-NEXT:    vpor %xmm13, %xmm15, %xmm13
4369; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
4370; AVX512BW-FCP-NEXT:    movl $-2097152, %edi # imm = 0xFFE00000
4371; AVX512BW-FCP-NEXT:    kmovd %edi, %k2
4372; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm13, %ymm7 {%k2}
4373; AVX512BW-FCP-NEXT:    movw $9289, %di # imm = 0x2449
4374; AVX512BW-FCP-NEXT:    kmovd %edi, %k5
4375; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm8, %ymm1 {%k5}
4376; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15,u,u,u,u,u]
4377; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[3,9,15],zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[u,u,u,u,u]
4378; AVX512BW-FCP-NEXT:    vpor %xmm8, %xmm10, %xmm8
4379; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm8 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,u,u,5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u]
4380; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm12[u,u,u,u,u,1,7,13],zero,zero,zero,xmm12[5,11],zero,zero,zero
4381; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm14[u,u,u,u,u],zero,zero,zero,xmm14[3,9,15],zero,zero,xmm14[1,7,13]
4382; AVX512BW-FCP-NEXT:    vpor %xmm9, %xmm10, %xmm9
4383; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
4384; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm9, %ymm8 {%k2}
4385; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u]
4386; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm4, %ymm0 {%k1}
4387; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm4
4388; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[4,10,u,u,u,u,u,u]
4389; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14],zero,zero,xmm0[u,u,u,u,u,u]
4390; AVX512BW-FCP-NEXT:    vpor %xmm10, %xmm11, %xmm10
4391; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm9[5,6,7]
4392; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
4393; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm2, %ymm3 {%k3}
4394; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm2
4395; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[4,10],zero,zero,zero,xmm2[2,8,14]
4396; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm3[u,u,u,u,u,2,8,14],zero,zero,xmm3[0,6,12],zero,zero,zero
4397; AVX512BW-FCP-NEXT:    vpor %xmm10, %xmm11, %xmm10
4398; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
4399; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm10, %ymm9 {%k2}
4400; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u]
4401; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,7,13],zero,zero,zero,xmm4[5,11,u,u,u,u,u,u]
4402; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15],zero,zero,xmm0[u,u,u,u,u,u]
4403; AVX512BW-FCP-NEXT:    vpor %xmm4, %xmm0, %xmm0
4404; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
4405; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4406; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[5,11],zero,zero,zero,xmm2[3,9,15]
4407; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,3,9,15],zero,zero,xmm3[1,7,13],zero,zero,zero
4408; AVX512BW-FCP-NEXT:    vpor %xmm1, %xmm2, %xmm1
4409; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
4410; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm1, %ymm0 {%k2}
4411; AVX512BW-FCP-NEXT:    vmovdqa %ymm5, (%rsi)
4412; AVX512BW-FCP-NEXT:    vmovdqa %ymm6, (%rdx)
4413; AVX512BW-FCP-NEXT:    vmovdqa %ymm7, (%rcx)
4414; AVX512BW-FCP-NEXT:    vmovdqa %ymm8, (%r8)
4415; AVX512BW-FCP-NEXT:    vmovdqa %ymm9, (%r9)
4416; AVX512BW-FCP-NEXT:    vmovdqa %ymm0, (%rax)
4417; AVX512BW-FCP-NEXT:    vzeroupper
4418; AVX512BW-FCP-NEXT:    retq
4419;
4420; AVX512DQ-BW-LABEL: load_i8_stride6_vf32:
4421; AVX512DQ-BW:       # %bb.0:
4422; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
4423; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %ymm4
4424; AVX512DQ-BW-NEXT:    vmovdqa 32(%rdi), %ymm0
4425; AVX512DQ-BW-NEXT:    vmovdqa 64(%rdi), %ymm3
4426; AVX512DQ-BW-NEXT:    vmovdqa 128(%rdi), %ymm2
4427; AVX512DQ-BW-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],mem[2,3]
4428; AVX512DQ-BW-NEXT:    vinserti128 $1, 96(%rdi), %ymm3, %ymm8
4429; AVX512DQ-BW-NEXT:    movw $-28124, %r10w # imm = 0x9224
4430; AVX512DQ-BW-NEXT:    kmovd %r10d, %k2
4431; AVX512DQ-BW-NEXT:    vpblendmw %ymm1, %ymm8, %ymm6 {%k2}
4432; AVX512DQ-BW-NEXT:    movw $18724, %r10w # imm = 0x4924
4433; AVX512DQ-BW-NEXT:    kmovd %r10d, %k1
4434; AVX512DQ-BW-NEXT:    vpblendmw %ymm0, %ymm4, %ymm7 {%k1}
4435; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm7[0,6,12],zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[u,u,u,u,u]
4436; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm7, %xmm9
4437; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm9[2,8,14],zero,zero,xmm9[0,6,12,u,u,u,u,u]
4438; AVX512DQ-BW-NEXT:    vpor %xmm3, %xmm5, %xmm5
4439; AVX512DQ-BW-NEXT:    movl $4192256, %r10d # imm = 0x3FF800
4440; AVX512DQ-BW-NEXT:    kmovd %r10d, %k3
4441; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm5 {%k3} = ymm6[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u]
4442; AVX512DQ-BW-NEXT:    vmovdqa 160(%rdi), %ymm3
4443; AVX512DQ-BW-NEXT:    vpblendmw %ymm2, %ymm3, %ymm10 {%k1}
4444; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm10, %xmm11
4445; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,u,u,u],zero,zero,xmm11[0,6,12],zero,zero,zero,xmm11[4,10]
4446; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm10[u,u,u,u,u,u,4,10],zero,zero,zero,xmm10[2,8,14],zero,zero
4447; AVX512DQ-BW-NEXT:    vpor %xmm12, %xmm13, %xmm12
4448; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
4449; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm12 = ymm5[0,1,2],ymm12[3,4,5,6,7],ymm5[8,9,10],ymm12[11,12,13,14,15]
4450; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5,6,7]
4451; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[1,7,13],zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[u,u,u,u,u]
4452; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[3,9,15],zero,zero,xmm9[1,7,13,u,u,u,u,u]
4453; AVX512DQ-BW-NEXT:    vpor %xmm7, %xmm9, %xmm7
4454; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm7 {%k3} = ymm6[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u]
4455; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm6 = xmm11[u,u,u,u,u,u],zero,zero,xmm11[1,7,13],zero,zero,zero,xmm11[5,11]
4456; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,u,u,u,5,11],zero,zero,zero,xmm10[3,9,15],zero,zero
4457; AVX512DQ-BW-NEXT:    vpor %xmm6, %xmm9, %xmm6
4458; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
4459; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15]
4460; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
4461; AVX512DQ-BW-NEXT:    vpblendmw %ymm8, %ymm1, %ymm9 {%k2}
4462; AVX512DQ-BW-NEXT:    movw $9362, %di # imm = 0x2492
4463; AVX512DQ-BW-NEXT:    kmovd %edi, %k3
4464; AVX512DQ-BW-NEXT:    vpblendmw %ymm4, %ymm0, %ymm10 {%k3}
4465; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm10, %xmm11
4466; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14,u,u,u,u,u]
4467; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm12 = xmm10[2,8,14],zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[u,u,u,u,u]
4468; AVX512DQ-BW-NEXT:    vpor %xmm7, %xmm12, %xmm7
4469; AVX512DQ-BW-NEXT:    movl $2095104, %edi # imm = 0x1FF800
4470; AVX512DQ-BW-NEXT:    kmovd %edi, %k4
4471; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm7 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,u,u,4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u]
4472; AVX512DQ-BW-NEXT:    vpblendmw %ymm3, %ymm2, %ymm12 {%k1}
4473; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,u,u,0,6,12],zero,zero,zero,xmm12[4,10],zero,zero,zero
4474; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm12, %xmm14
4475; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u],zero,zero,zero,xmm14[2,8,14],zero,zero,xmm14[0,6,12]
4476; AVX512DQ-BW-NEXT:    vpor %xmm13, %xmm15, %xmm13
4477; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
4478; AVX512DQ-BW-NEXT:    movl $-2097152, %edi # imm = 0xFFE00000
4479; AVX512DQ-BW-NEXT:    kmovd %edi, %k2
4480; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm13, %ymm7 {%k2}
4481; AVX512DQ-BW-NEXT:    movw $9289, %di # imm = 0x2449
4482; AVX512DQ-BW-NEXT:    kmovd %edi, %k5
4483; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm8, %ymm1 {%k5}
4484; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15,u,u,u,u,u]
4485; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[3,9,15],zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[u,u,u,u,u]
4486; AVX512DQ-BW-NEXT:    vpor %xmm8, %xmm10, %xmm8
4487; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm8 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,u,u,5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u]
4488; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm9 = xmm12[u,u,u,u,u,1,7,13],zero,zero,zero,xmm12[5,11],zero,zero,zero
4489; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm10 = xmm14[u,u,u,u,u],zero,zero,zero,xmm14[3,9,15],zero,zero,xmm14[1,7,13]
4490; AVX512DQ-BW-NEXT:    vpor %xmm9, %xmm10, %xmm9
4491; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
4492; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm9, %ymm8 {%k2}
4493; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u]
4494; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm4, %ymm0 {%k1}
4495; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm0, %xmm4
4496; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[4,10,u,u,u,u,u,u]
4497; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm11 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14],zero,zero,xmm0[u,u,u,u,u,u]
4498; AVX512DQ-BW-NEXT:    vpor %xmm10, %xmm11, %xmm10
4499; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm9[5,6,7]
4500; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
4501; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm2, %ymm3 {%k3}
4502; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm3, %xmm2
4503; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[4,10],zero,zero,zero,xmm2[2,8,14]
4504; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm11 = xmm3[u,u,u,u,u,2,8,14],zero,zero,xmm3[0,6,12],zero,zero,zero
4505; AVX512DQ-BW-NEXT:    vpor %xmm10, %xmm11, %xmm10
4506; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
4507; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm10, %ymm9 {%k2}
4508; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u]
4509; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,7,13],zero,zero,zero,xmm4[5,11,u,u,u,u,u,u]
4510; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15],zero,zero,xmm0[u,u,u,u,u,u]
4511; AVX512DQ-BW-NEXT:    vpor %xmm4, %xmm0, %xmm0
4512; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
4513; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4514; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[5,11],zero,zero,zero,xmm2[3,9,15]
4515; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,3,9,15],zero,zero,xmm3[1,7,13],zero,zero,zero
4516; AVX512DQ-BW-NEXT:    vpor %xmm1, %xmm2, %xmm1
4517; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
4518; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm1, %ymm0 {%k2}
4519; AVX512DQ-BW-NEXT:    vmovdqa %ymm5, (%rsi)
4520; AVX512DQ-BW-NEXT:    vmovdqa %ymm6, (%rdx)
4521; AVX512DQ-BW-NEXT:    vmovdqa %ymm7, (%rcx)
4522; AVX512DQ-BW-NEXT:    vmovdqa %ymm8, (%r8)
4523; AVX512DQ-BW-NEXT:    vmovdqa %ymm9, (%r9)
4524; AVX512DQ-BW-NEXT:    vmovdqa %ymm0, (%rax)
4525; AVX512DQ-BW-NEXT:    vzeroupper
4526; AVX512DQ-BW-NEXT:    retq
4527;
4528; AVX512DQ-BW-FCP-LABEL: load_i8_stride6_vf32:
4529; AVX512DQ-BW-FCP:       # %bb.0:
4530; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
4531; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %ymm4
4532; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm0
4533; AVX512DQ-BW-FCP-NEXT:    vmovdqa 64(%rdi), %ymm3
4534; AVX512DQ-BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm2
4535; AVX512DQ-BW-FCP-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],mem[2,3]
4536; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, 96(%rdi), %ymm3, %ymm8
4537; AVX512DQ-BW-FCP-NEXT:    movw $-28124, %r10w # imm = 0x9224
4538; AVX512DQ-BW-FCP-NEXT:    kmovd %r10d, %k2
4539; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm1, %ymm8, %ymm6 {%k2}
4540; AVX512DQ-BW-FCP-NEXT:    movw $18724, %r10w # imm = 0x4924
4541; AVX512DQ-BW-FCP-NEXT:    kmovd %r10d, %k1
4542; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm0, %ymm4, %ymm7 {%k1}
4543; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm7[0,6,12],zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[u,u,u,u,u]
4544; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm9
4545; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm9[2,8,14],zero,zero,xmm9[0,6,12,u,u,u,u,u]
4546; AVX512DQ-BW-FCP-NEXT:    vpor %xmm3, %xmm5, %xmm5
4547; AVX512DQ-BW-FCP-NEXT:    movl $4192256, %r10d # imm = 0x3FF800
4548; AVX512DQ-BW-FCP-NEXT:    kmovd %r10d, %k3
4549; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm5 {%k3} = ymm6[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u]
4550; AVX512DQ-BW-FCP-NEXT:    vmovdqa 160(%rdi), %ymm3
4551; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm2, %ymm3, %ymm10 {%k1}
4552; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm11
4553; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,u,u,u],zero,zero,xmm11[0,6,12],zero,zero,zero,xmm11[4,10]
4554; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm10[u,u,u,u,u,u,4,10],zero,zero,zero,xmm10[2,8,14],zero,zero
4555; AVX512DQ-BW-FCP-NEXT:    vpor %xmm12, %xmm13, %xmm12
4556; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
4557; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} ymm12 = ymm5[0,1,2],ymm12[3,4,5,6,7],ymm5[8,9,10],ymm12[11,12,13,14,15]
4558; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5,6,7]
4559; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[1,7,13],zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[u,u,u,u,u]
4560; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[3,9,15],zero,zero,xmm9[1,7,13,u,u,u,u,u]
4561; AVX512DQ-BW-FCP-NEXT:    vpor %xmm7, %xmm9, %xmm7
4562; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm7 {%k3} = ymm6[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u]
4563; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm11[u,u,u,u,u,u],zero,zero,xmm11[1,7,13],zero,zero,zero,xmm11[5,11]
4564; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,u,u,u,5,11],zero,zero,zero,xmm10[3,9,15],zero,zero
4565; AVX512DQ-BW-FCP-NEXT:    vpor %xmm6, %xmm9, %xmm6
4566; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
4567; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15]
4568; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
4569; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm8, %ymm1, %ymm9 {%k2}
4570; AVX512DQ-BW-FCP-NEXT:    movw $9362, %di # imm = 0x2492
4571; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k3
4572; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm4, %ymm0, %ymm10 {%k3}
4573; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm11
4574; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14,u,u,u,u,u]
4575; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm10[2,8,14],zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[u,u,u,u,u]
4576; AVX512DQ-BW-FCP-NEXT:    vpor %xmm7, %xmm12, %xmm7
4577; AVX512DQ-BW-FCP-NEXT:    movl $2095104, %edi # imm = 0x1FF800
4578; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k4
4579; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm7 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,u,u,4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u]
4580; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm3, %ymm2, %ymm12 {%k1}
4581; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,u,u,0,6,12],zero,zero,zero,xmm12[4,10],zero,zero,zero
4582; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm14
4583; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u],zero,zero,zero,xmm14[2,8,14],zero,zero,xmm14[0,6,12]
4584; AVX512DQ-BW-FCP-NEXT:    vpor %xmm13, %xmm15, %xmm13
4585; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
4586; AVX512DQ-BW-FCP-NEXT:    movl $-2097152, %edi # imm = 0xFFE00000
4587; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k2
4588; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm13, %ymm7 {%k2}
4589; AVX512DQ-BW-FCP-NEXT:    movw $9289, %di # imm = 0x2449
4590; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k5
4591; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm8, %ymm1 {%k5}
4592; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15,u,u,u,u,u]
4593; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[3,9,15],zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[u,u,u,u,u]
4594; AVX512DQ-BW-FCP-NEXT:    vpor %xmm8, %xmm10, %xmm8
4595; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm8 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,u,u,5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u]
4596; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm12[u,u,u,u,u,1,7,13],zero,zero,zero,xmm12[5,11],zero,zero,zero
4597; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm14[u,u,u,u,u],zero,zero,zero,xmm14[3,9,15],zero,zero,xmm14[1,7,13]
4598; AVX512DQ-BW-FCP-NEXT:    vpor %xmm9, %xmm10, %xmm9
4599; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
4600; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm9, %ymm8 {%k2}
4601; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u]
4602; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm4, %ymm0 {%k1}
4603; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm4
4604; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[4,10,u,u,u,u,u,u]
4605; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14],zero,zero,xmm0[u,u,u,u,u,u]
4606; AVX512DQ-BW-FCP-NEXT:    vpor %xmm10, %xmm11, %xmm10
4607; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm9[5,6,7]
4608; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
4609; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm2, %ymm3 {%k3}
4610; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm2
4611; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[4,10],zero,zero,zero,xmm2[2,8,14]
4612; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm3[u,u,u,u,u,2,8,14],zero,zero,xmm3[0,6,12],zero,zero,zero
4613; AVX512DQ-BW-FCP-NEXT:    vpor %xmm10, %xmm11, %xmm10
4614; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
4615; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm10, %ymm9 {%k2}
4616; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u]
4617; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,7,13],zero,zero,zero,xmm4[5,11,u,u,u,u,u,u]
4618; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15],zero,zero,xmm0[u,u,u,u,u,u]
4619; AVX512DQ-BW-FCP-NEXT:    vpor %xmm4, %xmm0, %xmm0
4620; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
4621; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4622; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[5,11],zero,zero,zero,xmm2[3,9,15]
4623; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,3,9,15],zero,zero,xmm3[1,7,13],zero,zero,zero
4624; AVX512DQ-BW-FCP-NEXT:    vpor %xmm1, %xmm2, %xmm1
4625; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
4626; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm1, %ymm0 {%k2}
4627; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm5, (%rsi)
4628; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm6, (%rdx)
4629; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm7, (%rcx)
4630; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm8, (%r8)
4631; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm9, (%r9)
4632; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm0, (%rax)
4633; AVX512DQ-BW-FCP-NEXT:    vzeroupper
4634; AVX512DQ-BW-FCP-NEXT:    retq
4635  %wide.vec = load <192 x i8>, ptr %in.vec, align 64
4636  %strided.vec0 = shufflevector <192 x i8> %wide.vec, <192 x i8> poison, <32 x i32> <i32 0, i32 6, i32 12, i32 18, i32 24, i32 30, i32 36, i32 42, i32 48, i32 54, i32 60, i32 66, i32 72, i32 78, i32 84, i32 90, i32 96, i32 102, i32 108, i32 114, i32 120, i32 126, i32 132, i32 138, i32 144, i32 150, i32 156, i32 162, i32 168, i32 174, i32 180, i32 186>
4637  %strided.vec1 = shufflevector <192 x i8> %wide.vec, <192 x i8> poison, <32 x i32> <i32 1, i32 7, i32 13, i32 19, i32 25, i32 31, i32 37, i32 43, i32 49, i32 55, i32 61, i32 67, i32 73, i32 79, i32 85, i32 91, i32 97, i32 103, i32 109, i32 115, i32 121, i32 127, i32 133, i32 139, i32 145, i32 151, i32 157, i32 163, i32 169, i32 175, i32 181, i32 187>
4638  %strided.vec2 = shufflevector <192 x i8> %wide.vec, <192 x i8> poison, <32 x i32> <i32 2, i32 8, i32 14, i32 20, i32 26, i32 32, i32 38, i32 44, i32 50, i32 56, i32 62, i32 68, i32 74, i32 80, i32 86, i32 92, i32 98, i32 104, i32 110, i32 116, i32 122, i32 128, i32 134, i32 140, i32 146, i32 152, i32 158, i32 164, i32 170, i32 176, i32 182, i32 188>
4639  %strided.vec3 = shufflevector <192 x i8> %wide.vec, <192 x i8> poison, <32 x i32> <i32 3, i32 9, i32 15, i32 21, i32 27, i32 33, i32 39, i32 45, i32 51, i32 57, i32 63, i32 69, i32 75, i32 81, i32 87, i32 93, i32 99, i32 105, i32 111, i32 117, i32 123, i32 129, i32 135, i32 141, i32 147, i32 153, i32 159, i32 165, i32 171, i32 177, i32 183, i32 189>
4640  %strided.vec4 = shufflevector <192 x i8> %wide.vec, <192 x i8> poison, <32 x i32> <i32 4, i32 10, i32 16, i32 22, i32 28, i32 34, i32 40, i32 46, i32 52, i32 58, i32 64, i32 70, i32 76, i32 82, i32 88, i32 94, i32 100, i32 106, i32 112, i32 118, i32 124, i32 130, i32 136, i32 142, i32 148, i32 154, i32 160, i32 166, i32 172, i32 178, i32 184, i32 190>
4641  %strided.vec5 = shufflevector <192 x i8> %wide.vec, <192 x i8> poison, <32 x i32> <i32 5, i32 11, i32 17, i32 23, i32 29, i32 35, i32 41, i32 47, i32 53, i32 59, i32 65, i32 71, i32 77, i32 83, i32 89, i32 95, i32 101, i32 107, i32 113, i32 119, i32 125, i32 131, i32 137, i32 143, i32 149, i32 155, i32 161, i32 167, i32 173, i32 179, i32 185, i32 191>
4642  store <32 x i8> %strided.vec0, ptr %out.vec0, align 64
4643  store <32 x i8> %strided.vec1, ptr %out.vec1, align 64
4644  store <32 x i8> %strided.vec2, ptr %out.vec2, align 64
4645  store <32 x i8> %strided.vec3, ptr %out.vec3, align 64
4646  store <32 x i8> %strided.vec4, ptr %out.vec4, align 64
4647  store <32 x i8> %strided.vec5, ptr %out.vec5, align 64
4648  ret void
4649}
4650
4651define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind {
4652; SSE-LABEL: load_i8_stride6_vf64:
4653; SSE:       # %bb.0:
4654; SSE-NEXT:    subq $792, %rsp # imm = 0x318
4655; SSE-NEXT:    movdqa 64(%rdi), %xmm4
4656; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4657; SSE-NEXT:    movdqa 80(%rdi), %xmm5
4658; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4659; SSE-NEXT:    movdqa (%rdi), %xmm7
4660; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4661; SSE-NEXT:    movdqa 16(%rdi), %xmm6
4662; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4663; SSE-NEXT:    movdqa 32(%rdi), %xmm2
4664; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4665; SSE-NEXT:    movdqa 48(%rdi), %xmm0
4666; SSE-NEXT:    movdqa {{.*#+}} xmm13 = [65535,65535,0,65535,65535,0,65535,65535]
4667; SSE-NEXT:    movdqa %xmm13, %xmm1
4668; SSE-NEXT:    pandn %xmm2, %xmm1
4669; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,0,65535,65535,0]
4670; SSE-NEXT:    movdqa %xmm3, %xmm2
4671; SSE-NEXT:    pandn %xmm0, %xmm2
4672; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4673; SSE-NEXT:    movdqa %xmm13, %xmm2
4674; SSE-NEXT:    pandn %xmm0, %xmm2
4675; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4676; SSE-NEXT:    pand %xmm13, %xmm0
4677; SSE-NEXT:    por %xmm1, %xmm0
4678; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4679; SSE-NEXT:    movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255]
4680; SSE-NEXT:    pand %xmm10, %xmm0
4681; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7]
4682; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
4683; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
4684; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5]
4685; SSE-NEXT:    packuswb %xmm1, %xmm0
4686; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,0,0,65535,65535]
4687; SSE-NEXT:    movdqa %xmm3, %xmm1
4688; SSE-NEXT:    pandn %xmm6, %xmm1
4689; SSE-NEXT:    movdqa %xmm7, %xmm2
4690; SSE-NEXT:    pand %xmm3, %xmm2
4691; SSE-NEXT:    por %xmm1, %xmm2
4692; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4693; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,1,3]
4694; SSE-NEXT:    pand %xmm10, %xmm1
4695; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
4696; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
4697; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
4698; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
4699; SSE-NEXT:    packuswb %xmm1, %xmm1
4700; SSE-NEXT:    pand %xmm8, %xmm1
4701; SSE-NEXT:    movdqa %xmm8, %xmm2
4702; SSE-NEXT:    pandn %xmm0, %xmm2
4703; SSE-NEXT:    por %xmm2, %xmm1
4704; SSE-NEXT:    movdqa %xmm13, %xmm0
4705; SSE-NEXT:    pandn %xmm5, %xmm0
4706; SSE-NEXT:    pand %xmm13, %xmm4
4707; SSE-NEXT:    por %xmm0, %xmm4
4708; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4709; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[3,1,2,0]
4710; SSE-NEXT:    pand %xmm10, %xmm0
4711; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
4712; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0]
4713; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
4714; SSE-NEXT:    packuswb %xmm0, %xmm0
4715; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
4716; SSE-NEXT:    movdqa %xmm4, %xmm2
4717; SSE-NEXT:    pandn %xmm0, %xmm2
4718; SSE-NEXT:    pand %xmm4, %xmm1
4719; SSE-NEXT:    por %xmm1, %xmm2
4720; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4721; SSE-NEXT:    movdqa 320(%rdi), %xmm1
4722; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4723; SSE-NEXT:    movdqa %xmm13, %xmm0
4724; SSE-NEXT:    pandn %xmm1, %xmm0
4725; SSE-NEXT:    movdqa 336(%rdi), %xmm12
4726; SSE-NEXT:    movdqa %xmm3, %xmm1
4727; SSE-NEXT:    pandn %xmm12, %xmm1
4728; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4729; SSE-NEXT:    movdqa %xmm13, %xmm1
4730; SSE-NEXT:    pandn %xmm12, %xmm1
4731; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4732; SSE-NEXT:    pand %xmm13, %xmm12
4733; SSE-NEXT:    por %xmm0, %xmm12
4734; SSE-NEXT:    movdqa %xmm12, %xmm0
4735; SSE-NEXT:    pand %xmm10, %xmm0
4736; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7]
4737; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
4738; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
4739; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5]
4740; SSE-NEXT:    packuswb %xmm1, %xmm0
4741; SSE-NEXT:    movdqa %xmm8, %xmm1
4742; SSE-NEXT:    pandn %xmm0, %xmm1
4743; SSE-NEXT:    movdqa 304(%rdi), %xmm2
4744; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4745; SSE-NEXT:    movdqa %xmm3, %xmm7
4746; SSE-NEXT:    movdqa %xmm3, %xmm0
4747; SSE-NEXT:    pandn %xmm2, %xmm0
4748; SSE-NEXT:    movdqa 288(%rdi), %xmm6
4749; SSE-NEXT:    movdqa %xmm6, %xmm2
4750; SSE-NEXT:    pand %xmm3, %xmm2
4751; SSE-NEXT:    por %xmm0, %xmm2
4752; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4753; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,1,3]
4754; SSE-NEXT:    pand %xmm10, %xmm0
4755; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
4756; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
4757; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
4758; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
4759; SSE-NEXT:    packuswb %xmm0, %xmm0
4760; SSE-NEXT:    pand %xmm8, %xmm0
4761; SSE-NEXT:    por %xmm1, %xmm0
4762; SSE-NEXT:    movdqa 368(%rdi), %xmm1
4763; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4764; SSE-NEXT:    movdqa %xmm13, %xmm2
4765; SSE-NEXT:    pandn %xmm1, %xmm2
4766; SSE-NEXT:    movdqa 352(%rdi), %xmm3
4767; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4768; SSE-NEXT:    pand %xmm13, %xmm3
4769; SSE-NEXT:    por %xmm2, %xmm3
4770; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4771; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[3,1,2,0]
4772; SSE-NEXT:    pand %xmm10, %xmm2
4773; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
4774; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,3,2,0]
4775; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5]
4776; SSE-NEXT:    packuswb %xmm2, %xmm2
4777; SSE-NEXT:    movdqa %xmm4, %xmm3
4778; SSE-NEXT:    pandn %xmm2, %xmm3
4779; SSE-NEXT:    pand %xmm4, %xmm0
4780; SSE-NEXT:    movdqa %xmm4, %xmm9
4781; SSE-NEXT:    por %xmm0, %xmm3
4782; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4783; SSE-NEXT:    movdqa 224(%rdi), %xmm1
4784; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4785; SSE-NEXT:    movdqa %xmm13, %xmm0
4786; SSE-NEXT:    pandn %xmm1, %xmm0
4787; SSE-NEXT:    movdqa 240(%rdi), %xmm11
4788; SSE-NEXT:    movdqa %xmm7, %xmm2
4789; SSE-NEXT:    pandn %xmm11, %xmm2
4790; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4791; SSE-NEXT:    movdqa %xmm13, %xmm2
4792; SSE-NEXT:    pandn %xmm11, %xmm2
4793; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4794; SSE-NEXT:    pand %xmm13, %xmm11
4795; SSE-NEXT:    por %xmm0, %xmm11
4796; SSE-NEXT:    movdqa %xmm11, %xmm0
4797; SSE-NEXT:    pand %xmm10, %xmm0
4798; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm0[0,3,2,3,4,5,6,7]
4799; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
4800; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
4801; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5]
4802; SSE-NEXT:    packuswb %xmm2, %xmm0
4803; SSE-NEXT:    movdqa %xmm8, %xmm2
4804; SSE-NEXT:    pandn %xmm0, %xmm2
4805; SSE-NEXT:    movdqa 208(%rdi), %xmm1
4806; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4807; SSE-NEXT:    movdqa %xmm7, %xmm0
4808; SSE-NEXT:    pandn %xmm1, %xmm0
4809; SSE-NEXT:    movdqa 192(%rdi), %xmm3
4810; SSE-NEXT:    movdqa %xmm3, %xmm1
4811; SSE-NEXT:    pand %xmm7, %xmm1
4812; SSE-NEXT:    por %xmm0, %xmm1
4813; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4814; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,1,3]
4815; SSE-NEXT:    movdqa %xmm10, %xmm1
4816; SSE-NEXT:    pand %xmm10, %xmm0
4817; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
4818; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
4819; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
4820; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
4821; SSE-NEXT:    packuswb %xmm0, %xmm0
4822; SSE-NEXT:    pand %xmm8, %xmm0
4823; SSE-NEXT:    movdqa %xmm8, %xmm10
4824; SSE-NEXT:    por %xmm2, %xmm0
4825; SSE-NEXT:    movdqa 272(%rdi), %xmm14
4826; SSE-NEXT:    movdqa %xmm13, %xmm2
4827; SSE-NEXT:    pandn %xmm14, %xmm2
4828; SSE-NEXT:    movdqa 256(%rdi), %xmm15
4829; SSE-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4830; SSE-NEXT:    pand %xmm13, %xmm15
4831; SSE-NEXT:    por %xmm2, %xmm15
4832; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm15[3,1,2,0]
4833; SSE-NEXT:    pand %xmm1, %xmm2
4834; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
4835; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,3,2,0]
4836; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5]
4837; SSE-NEXT:    packuswb %xmm2, %xmm2
4838; SSE-NEXT:    pandn %xmm2, %xmm4
4839; SSE-NEXT:    pand %xmm9, %xmm0
4840; SSE-NEXT:    por %xmm0, %xmm4
4841; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4842; SSE-NEXT:    movdqa 128(%rdi), %xmm2
4843; SSE-NEXT:    movdqa %xmm2, (%rsp) # 16-byte Spill
4844; SSE-NEXT:    movdqa %xmm13, %xmm0
4845; SSE-NEXT:    pandn %xmm2, %xmm0
4846; SSE-NEXT:    movdqa 144(%rdi), %xmm9
4847; SSE-NEXT:    movdqa %xmm7, %xmm4
4848; SSE-NEXT:    pandn %xmm9, %xmm4
4849; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4850; SSE-NEXT:    movdqa %xmm13, %xmm4
4851; SSE-NEXT:    pandn %xmm9, %xmm4
4852; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4853; SSE-NEXT:    pand %xmm13, %xmm9
4854; SSE-NEXT:    por %xmm0, %xmm9
4855; SSE-NEXT:    movdqa %xmm9, %xmm0
4856; SSE-NEXT:    pand %xmm1, %xmm0
4857; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm0[0,3,2,3,4,5,6,7]
4858; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3]
4859; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
4860; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5]
4861; SSE-NEXT:    packuswb %xmm5, %xmm0
4862; SSE-NEXT:    pandn %xmm0, %xmm10
4863; SSE-NEXT:    movdqa %xmm13, %xmm0
4864; SSE-NEXT:    movdqa %xmm13, %xmm2
4865; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4866; SSE-NEXT:    pandn %xmm1, %xmm2
4867; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4868; SSE-NEXT:    movdqa %xmm13, %xmm2
4869; SSE-NEXT:    pandn %xmm6, %xmm2
4870; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4871; SSE-NEXT:    movdqa %xmm6, %xmm5
4872; SSE-NEXT:    movdqa %xmm13, %xmm2
4873; SSE-NEXT:    pandn %xmm3, %xmm2
4874; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4875; SSE-NEXT:    movdqa %xmm3, %xmm4
4876; SSE-NEXT:    movdqa 112(%rdi), %xmm6
4877; SSE-NEXT:    movdqa %xmm7, %xmm2
4878; SSE-NEXT:    movdqa %xmm7, %xmm8
4879; SSE-NEXT:    pandn %xmm6, %xmm8
4880; SSE-NEXT:    movdqa 160(%rdi), %xmm7
4881; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4882; SSE-NEXT:    pand %xmm13, %xmm7
4883; SSE-NEXT:    movdqa %xmm13, %xmm3
4884; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
4885; SSE-NEXT:    pandn %xmm13, %xmm3
4886; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4887; SSE-NEXT:    pand %xmm0, %xmm1
4888; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4889; SSE-NEXT:    movdqa %xmm2, %xmm3
4890; SSE-NEXT:    movdqa %xmm2, %xmm1
4891; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4892; SSE-NEXT:    pandn %xmm2, %xmm3
4893; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4894; SSE-NEXT:    pand %xmm0, %xmm2
4895; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4896; SSE-NEXT:    movdqa %xmm0, %xmm2
4897; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4898; SSE-NEXT:    pandn %xmm3, %xmm2
4899; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4900; SSE-NEXT:    pand %xmm0, %xmm5
4901; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4902; SSE-NEXT:    movdqa %xmm1, %xmm5
4903; SSE-NEXT:    movdqa %xmm1, %xmm2
4904; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4905; SSE-NEXT:    pandn %xmm1, %xmm2
4906; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4907; SSE-NEXT:    pand %xmm0, %xmm1
4908; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4909; SSE-NEXT:    movdqa %xmm0, %xmm2
4910; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4911; SSE-NEXT:    pandn %xmm1, %xmm2
4912; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4913; SSE-NEXT:    pand %xmm0, %xmm4
4914; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4915; SSE-NEXT:    pandn %xmm14, %xmm5
4916; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4917; SSE-NEXT:    pand %xmm0, %xmm14
4918; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4919; SSE-NEXT:    movdqa %xmm0, %xmm2
4920; SSE-NEXT:    pandn %xmm6, %xmm2
4921; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4922; SSE-NEXT:    movdqa 96(%rdi), %xmm4
4923; SSE-NEXT:    movdqa %xmm4, %xmm2
4924; SSE-NEXT:    pand %xmm0, %xmm2
4925; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4926; SSE-NEXT:    movdqa 176(%rdi), %xmm14
4927; SSE-NEXT:    movdqa %xmm14, %xmm2
4928; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4929; SSE-NEXT:    pand %xmm0, %xmm2
4930; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4931; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4932; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4933; SSE-NEXT:    pand %xmm0, %xmm2
4934; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4935; SSE-NEXT:    pand %xmm0, %xmm13
4936; SSE-NEXT:    movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4937; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4938; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4939; SSE-NEXT:    pand %xmm0, %xmm2
4940; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4941; SSE-NEXT:    pand %xmm0, %xmm3
4942; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4943; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4944; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4945; SSE-NEXT:    pand %xmm0, %xmm2
4946; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4947; SSE-NEXT:    pand %xmm0, %xmm1
4948; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4949; SSE-NEXT:    movdqa (%rsp), %xmm1 # 16-byte Reload
4950; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4951; SSE-NEXT:    pand %xmm0, %xmm1
4952; SSE-NEXT:    movdqa %xmm1, (%rsp) # 16-byte Spill
4953; SSE-NEXT:    movdqa %xmm0, %xmm1
4954; SSE-NEXT:    pand %xmm0, %xmm6
4955; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4956; SSE-NEXT:    movdqa %xmm0, %xmm13
4957; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4958; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4959; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4960; SSE-NEXT:    pandn %xmm4, %xmm1
4961; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4962; SSE-NEXT:    movdqa %xmm4, %xmm3
4963; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
4964; SSE-NEXT:    por %xmm8, %xmm3
4965; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,2,1,3]
4966; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
4967; SSE-NEXT:    pand %xmm1, %xmm5
4968; SSE-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7]
4969; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,1,3]
4970; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,1,4,5,6,7]
4971; SSE-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7]
4972; SSE-NEXT:    packuswb %xmm5, %xmm5
4973; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,0,0,65535,65535]
4974; SSE-NEXT:    pand %xmm8, %xmm5
4975; SSE-NEXT:    por %xmm10, %xmm5
4976; SSE-NEXT:    pandn %xmm14, %xmm0
4977; SSE-NEXT:    por %xmm0, %xmm7
4978; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[3,1,2,0]
4979; SSE-NEXT:    pand %xmm1, %xmm0
4980; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
4981; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0]
4982; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
4983; SSE-NEXT:    packuswb %xmm0, %xmm0
4984; SSE-NEXT:    movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
4985; SSE-NEXT:    movdqa %xmm10, %xmm1
4986; SSE-NEXT:    pandn %xmm0, %xmm1
4987; SSE-NEXT:    pand %xmm10, %xmm5
4988; SSE-NEXT:    por %xmm5, %xmm1
4989; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4990; SSE-NEXT:    pxor %xmm5, %xmm5
4991; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4992; SSE-NEXT:    movdqa %xmm1, %xmm0
4993; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15]
4994; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
4995; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,2,3,3]
4996; SSE-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
4997; SSE-NEXT:    psrld $16, %xmm0
4998; SSE-NEXT:    pshufd {{.*#+}} xmm14 = xmm1[0,1,0,3]
4999; SSE-NEXT:    pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,7,6,7]
5000; SSE-NEXT:    punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm0[2],xmm14[3],xmm0[3]
5001; SSE-NEXT:    packuswb %xmm14, %xmm4
5002; SSE-NEXT:    movdqa %xmm8, %xmm1
5003; SSE-NEXT:    pandn %xmm4, %xmm1
5004; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
5005; SSE-NEXT:    movdqa %xmm2, %xmm4
5006; SSE-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
5007; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,1,0,3]
5008; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7]
5009; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7]
5010; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,0,0,65535,65535]
5011; SSE-NEXT:    movdqa %xmm0, %xmm14
5012; SSE-NEXT:    pandn %xmm4, %xmm14
5013; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
5014; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[3,1,2,3,4,5,6,7]
5015; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
5016; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[1,3,2,0,4,5,6,7]
5017; SSE-NEXT:    pand %xmm0, %xmm4
5018; SSE-NEXT:    por %xmm14, %xmm4
5019; SSE-NEXT:    packuswb %xmm4, %xmm4
5020; SSE-NEXT:    pand %xmm8, %xmm4
5021; SSE-NEXT:    por %xmm1, %xmm4
5022; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
5023; SSE-NEXT:    movdqa %xmm6, %xmm1
5024; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
5025; SSE-NEXT:    pshufhw {{.*#+}} xmm14 = xmm1[0,1,2,3,5,5,5,5]
5026; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,0,65535]
5027; SSE-NEXT:    movdqa %xmm2, %xmm1
5028; SSE-NEXT:    pandn %xmm14, %xmm1
5029; SSE-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
5030; SSE-NEXT:    pshuflw {{.*#+}} xmm14 = xmm6[3,1,2,3,4,5,6,7]
5031; SSE-NEXT:    pshufd {{.*#+}} xmm14 = xmm14[0,1,0,3]
5032; SSE-NEXT:    pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,7,6,4]
5033; SSE-NEXT:    pand %xmm2, %xmm14
5034; SSE-NEXT:    por %xmm1, %xmm14
5035; SSE-NEXT:    packuswb %xmm14, %xmm1
5036; SSE-NEXT:    movdqa %xmm10, %xmm14
5037; SSE-NEXT:    pandn %xmm1, %xmm14
5038; SSE-NEXT:    pand %xmm10, %xmm4
5039; SSE-NEXT:    por %xmm4, %xmm14
5040; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5041; SSE-NEXT:    movdqa %xmm12, %xmm1
5042; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15]
5043; SSE-NEXT:    punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3],xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7]
5044; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm12[2,2,3,3]
5045; SSE-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
5046; SSE-NEXT:    psrld $16, %xmm1
5047; SSE-NEXT:    pshufd {{.*#+}} xmm12 = xmm12[0,1,0,3]
5048; SSE-NEXT:    pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,7,6,7]
5049; SSE-NEXT:    punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3]
5050; SSE-NEXT:    packuswb %xmm12, %xmm4
5051; SSE-NEXT:    movdqa %xmm8, %xmm14
5052; SSE-NEXT:    movdqa %xmm8, %xmm1
5053; SSE-NEXT:    pandn %xmm4, %xmm1
5054; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
5055; SSE-NEXT:    movdqa %xmm6, %xmm4
5056; SSE-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
5057; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,1,0,3]
5058; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7]
5059; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7]
5060; SSE-NEXT:    movdqa %xmm0, %xmm12
5061; SSE-NEXT:    pandn %xmm4, %xmm12
5062; SSE-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
5063; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm6[3,1,2,3,4,5,6,7]
5064; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
5065; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[1,3,2,0,4,5,6,7]
5066; SSE-NEXT:    pand %xmm0, %xmm4
5067; SSE-NEXT:    por %xmm12, %xmm4
5068; SSE-NEXT:    packuswb %xmm4, %xmm4
5069; SSE-NEXT:    pand %xmm8, %xmm4
5070; SSE-NEXT:    por %xmm1, %xmm4
5071; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
5072; SSE-NEXT:    movdqa %xmm6, %xmm1
5073; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
5074; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
5075; SSE-NEXT:    movdqa %xmm2, %xmm12
5076; SSE-NEXT:    pandn %xmm1, %xmm12
5077; SSE-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
5078; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm6[3,1,2,3,4,5,6,7]
5079; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
5080; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,4]
5081; SSE-NEXT:    pand %xmm2, %xmm1
5082; SSE-NEXT:    por %xmm12, %xmm1
5083; SSE-NEXT:    packuswb %xmm1, %xmm1
5084; SSE-NEXT:    movdqa %xmm10, %xmm12
5085; SSE-NEXT:    pandn %xmm1, %xmm12
5086; SSE-NEXT:    pand %xmm10, %xmm4
5087; SSE-NEXT:    por %xmm4, %xmm12
5088; SSE-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5089; SSE-NEXT:    movdqa %xmm11, %xmm1
5090; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15]
5091; SSE-NEXT:    punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3],xmm11[4],xmm5[4],xmm11[5],xmm5[5],xmm11[6],xmm5[6],xmm11[7],xmm5[7]
5092; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm11[2,2,3,3]
5093; SSE-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
5094; SSE-NEXT:    psrld $16, %xmm1
5095; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm11[0,1,0,3]
5096; SSE-NEXT:    pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7]
5097; SSE-NEXT:    punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm1[2],xmm8[3],xmm1[3]
5098; SSE-NEXT:    packuswb %xmm8, %xmm4
5099; SSE-NEXT:    movdqa %xmm14, %xmm1
5100; SSE-NEXT:    pandn %xmm4, %xmm1
5101; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
5102; SSE-NEXT:    movdqa %xmm6, %xmm4
5103; SSE-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
5104; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,1,0,3]
5105; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7]
5106; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7]
5107; SSE-NEXT:    movdqa %xmm0, %xmm8
5108; SSE-NEXT:    pandn %xmm4, %xmm8
5109; SSE-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
5110; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm6[3,1,2,3,4,5,6,7]
5111; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
5112; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[1,3,2,0,4,5,6,7]
5113; SSE-NEXT:    pand %xmm0, %xmm4
5114; SSE-NEXT:    por %xmm8, %xmm4
5115; SSE-NEXT:    packuswb %xmm4, %xmm4
5116; SSE-NEXT:    pand %xmm14, %xmm4
5117; SSE-NEXT:    por %xmm1, %xmm4
5118; SSE-NEXT:    movdqa %xmm15, %xmm1
5119; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
5120; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
5121; SSE-NEXT:    movdqa %xmm2, %xmm8
5122; SSE-NEXT:    pandn %xmm1, %xmm8
5123; SSE-NEXT:    punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm5[8],xmm15[9],xmm5[9],xmm15[10],xmm5[10],xmm15[11],xmm5[11],xmm15[12],xmm5[12],xmm15[13],xmm5[13],xmm15[14],xmm5[14],xmm15[15],xmm5[15]
5124; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm15[3,1,2,3,4,5,6,7]
5125; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
5126; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,4]
5127; SSE-NEXT:    pand %xmm2, %xmm1
5128; SSE-NEXT:    por %xmm8, %xmm1
5129; SSE-NEXT:    packuswb %xmm1, %xmm1
5130; SSE-NEXT:    movdqa %xmm10, %xmm8
5131; SSE-NEXT:    pandn %xmm1, %xmm8
5132; SSE-NEXT:    pand %xmm10, %xmm4
5133; SSE-NEXT:    por %xmm4, %xmm8
5134; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5135; SSE-NEXT:    movdqa %xmm9, %xmm1
5136; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15]
5137; SSE-NEXT:    punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7]
5138; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm9[2,2,3,3]
5139; SSE-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
5140; SSE-NEXT:    psrld $16, %xmm1
5141; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm9[0,1,0,3]
5142; SSE-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7]
5143; SSE-NEXT:    punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3]
5144; SSE-NEXT:    packuswb %xmm6, %xmm4
5145; SSE-NEXT:    movdqa %xmm3, %xmm1
5146; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15]
5147; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3]
5148; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7]
5149; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7]
5150; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
5151; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
5152; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
5153; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[1,3,2,0,4,5,6,7]
5154; SSE-NEXT:    pand %xmm0, %xmm3
5155; SSE-NEXT:    pandn %xmm1, %xmm0
5156; SSE-NEXT:    por %xmm3, %xmm0
5157; SSE-NEXT:    packuswb %xmm0, %xmm0
5158; SSE-NEXT:    movdqa %xmm14, %xmm1
5159; SSE-NEXT:    pand %xmm14, %xmm0
5160; SSE-NEXT:    pandn %xmm4, %xmm1
5161; SSE-NEXT:    por %xmm1, %xmm0
5162; SSE-NEXT:    movdqa %xmm7, %xmm1
5163; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
5164; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
5165; SSE-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15]
5166; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm7[3,1,2,3,4,5,6,7]
5167; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
5168; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,4]
5169; SSE-NEXT:    pand %xmm2, %xmm3
5170; SSE-NEXT:    pandn %xmm1, %xmm2
5171; SSE-NEXT:    por %xmm3, %xmm2
5172; SSE-NEXT:    packuswb %xmm2, %xmm1
5173; SSE-NEXT:    movdqa %xmm10, %xmm2
5174; SSE-NEXT:    pandn %xmm1, %xmm2
5175; SSE-NEXT:    pand %xmm10, %xmm0
5176; SSE-NEXT:    por %xmm0, %xmm2
5177; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5178; SSE-NEXT:    movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,0,65535,65535,0]
5179; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5180; SSE-NEXT:    pand %xmm7, %xmm4
5181; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
5182; SSE-NEXT:    movdqa %xmm4, %xmm0
5183; SSE-NEXT:    movdqa {{.*#+}} xmm12 = [255,255,255,255,255,255,255,255]
5184; SSE-NEXT:    pand %xmm12, %xmm0
5185; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,7,6,7]
5186; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
5187; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7]
5188; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
5189; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
5190; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6]
5191; SSE-NEXT:    packuswb %xmm1, %xmm0
5192; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
5193; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
5194; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm6[2,1,2,3,4,5,6,7]
5195; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
5196; SSE-NEXT:    pand %xmm12, %xmm1
5197; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
5198; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7]
5199; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,5,5,5,5]
5200; SSE-NEXT:    packuswb %xmm2, %xmm2
5201; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255]
5202; SSE-NEXT:    movdqa %xmm5, %xmm3
5203; SSE-NEXT:    pandn %xmm2, %xmm3
5204; SSE-NEXT:    pand %xmm5, %xmm0
5205; SSE-NEXT:    por %xmm0, %xmm3
5206; SSE-NEXT:    movdqa %xmm13, %xmm0
5207; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
5208; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5209; SSE-NEXT:    por %xmm0, %xmm1
5210; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5211; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,3,2,3,4,5,6,7]
5212; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
5213; SSE-NEXT:    pand %xmm12, %xmm0
5214; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
5215; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
5216; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
5217; SSE-NEXT:    packuswb %xmm0, %xmm0
5218; SSE-NEXT:    movdqa %xmm10, %xmm2
5219; SSE-NEXT:    pandn %xmm0, %xmm2
5220; SSE-NEXT:    pand %xmm10, %xmm3
5221; SSE-NEXT:    por %xmm3, %xmm2
5222; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5223; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
5224; SSE-NEXT:    pand %xmm7, %xmm13
5225; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
5226; SSE-NEXT:    movdqa %xmm13, %xmm0
5227; SSE-NEXT:    pand %xmm12, %xmm0
5228; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,7,6,7]
5229; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
5230; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,3,4,5,6,7]
5231; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
5232; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
5233; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6]
5234; SSE-NEXT:    packuswb %xmm2, %xmm0
5235; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
5236; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
5237; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm14[2,1,2,3,4,5,6,7]
5238; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
5239; SSE-NEXT:    pand %xmm12, %xmm2
5240; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
5241; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7]
5242; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
5243; SSE-NEXT:    packuswb %xmm2, %xmm2
5244; SSE-NEXT:    movdqa %xmm5, %xmm3
5245; SSE-NEXT:    pandn %xmm2, %xmm3
5246; SSE-NEXT:    pand %xmm5, %xmm0
5247; SSE-NEXT:    por %xmm0, %xmm3
5248; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5249; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
5250; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
5251; SSE-NEXT:    por %xmm0, %xmm11
5252; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm11[0,3,2,3,4,5,6,7]
5253; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
5254; SSE-NEXT:    pand %xmm12, %xmm0
5255; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
5256; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
5257; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
5258; SSE-NEXT:    packuswb %xmm0, %xmm0
5259; SSE-NEXT:    movdqa %xmm10, %xmm2
5260; SSE-NEXT:    pandn %xmm0, %xmm2
5261; SSE-NEXT:    pand %xmm10, %xmm3
5262; SSE-NEXT:    movdqa %xmm10, %xmm9
5263; SSE-NEXT:    por %xmm3, %xmm2
5264; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5265; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
5266; SSE-NEXT:    pand %xmm7, %xmm10
5267; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
5268; SSE-NEXT:    movdqa %xmm10, %xmm0
5269; SSE-NEXT:    pand %xmm12, %xmm0
5270; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,7,6,7]
5271; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
5272; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,3,4,5,6,7]
5273; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
5274; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
5275; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6]
5276; SSE-NEXT:    packuswb %xmm2, %xmm0
5277; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5278; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
5279; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5280; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,1,2,3,4,5,6,7]
5281; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
5282; SSE-NEXT:    pand %xmm12, %xmm2
5283; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
5284; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7]
5285; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
5286; SSE-NEXT:    packuswb %xmm2, %xmm2
5287; SSE-NEXT:    movdqa %xmm5, %xmm3
5288; SSE-NEXT:    pandn %xmm2, %xmm3
5289; SSE-NEXT:    pand %xmm5, %xmm0
5290; SSE-NEXT:    por %xmm0, %xmm3
5291; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5292; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
5293; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
5294; SSE-NEXT:    por %xmm0, %xmm8
5295; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm8[0,3,2,3,4,5,6,7]
5296; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
5297; SSE-NEXT:    pand %xmm12, %xmm0
5298; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
5299; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
5300; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
5301; SSE-NEXT:    packuswb %xmm0, %xmm0
5302; SSE-NEXT:    movdqa %xmm9, %xmm2
5303; SSE-NEXT:    pandn %xmm0, %xmm2
5304; SSE-NEXT:    pand %xmm9, %xmm3
5305; SSE-NEXT:    movdqa %xmm9, %xmm1
5306; SSE-NEXT:    por %xmm3, %xmm2
5307; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5308; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5309; SSE-NEXT:    pand %xmm7, %xmm0
5310; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
5311; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5312; SSE-NEXT:    pand %xmm12, %xmm0
5313; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,7,6,7]
5314; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
5315; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,3,4,5,6,7]
5316; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
5317; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
5318; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6]
5319; SSE-NEXT:    packuswb %xmm2, %xmm0
5320; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
5321; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
5322; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm15[2,1,2,3,4,5,6,7]
5323; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
5324; SSE-NEXT:    pand %xmm12, %xmm2
5325; SSE-NEXT:    movdqa %xmm12, %xmm9
5326; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
5327; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7]
5328; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
5329; SSE-NEXT:    packuswb %xmm2, %xmm2
5330; SSE-NEXT:    movdqa %xmm5, %xmm3
5331; SSE-NEXT:    pandn %xmm2, %xmm3
5332; SSE-NEXT:    pand %xmm5, %xmm0
5333; SSE-NEXT:    por %xmm0, %xmm3
5334; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5335; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
5336; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
5337; SSE-NEXT:    por %xmm0, %xmm12
5338; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm12[0,3,2,3,4,5,6,7]
5339; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
5340; SSE-NEXT:    pand %xmm9, %xmm0
5341; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
5342; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
5343; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
5344; SSE-NEXT:    packuswb %xmm0, %xmm0
5345; SSE-NEXT:    movdqa %xmm1, %xmm9
5346; SSE-NEXT:    movdqa %xmm1, %xmm2
5347; SSE-NEXT:    pandn %xmm0, %xmm2
5348; SSE-NEXT:    pand %xmm1, %xmm3
5349; SSE-NEXT:    por %xmm3, %xmm2
5350; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5351; SSE-NEXT:    movdqa %xmm4, %xmm0
5352; SSE-NEXT:    pxor %xmm1, %xmm1
5353; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
5354; SSE-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
5355; SSE-NEXT:    pxor %xmm7, %xmm7
5356; SSE-NEXT:    movdqa %xmm4, %xmm2
5357; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[3,0]
5358; SSE-NEXT:    movaps %xmm0, %xmm3
5359; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[0,2]
5360; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm4[0,0]
5361; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[2,3]
5362; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,7,5,6,7]
5363; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2]
5364; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
5365; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
5366; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7]
5367; SSE-NEXT:    packuswb %xmm0, %xmm2
5368; SSE-NEXT:    movdqa %xmm6, %xmm0
5369; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
5370; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
5371; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7]
5372; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [0,65535,65535,0,65535,65535,65535,65535]
5373; SSE-NEXT:    movdqa %xmm3, %xmm4
5374; SSE-NEXT:    pandn %xmm0, %xmm4
5375; SSE-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15]
5376; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,3,2,1]
5377; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7]
5378; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
5379; SSE-NEXT:    pand %xmm3, %xmm0
5380; SSE-NEXT:    por %xmm4, %xmm0
5381; SSE-NEXT:    packuswb %xmm0, %xmm0
5382; SSE-NEXT:    movdqa %xmm5, %xmm6
5383; SSE-NEXT:    pandn %xmm0, %xmm6
5384; SSE-NEXT:    pand %xmm5, %xmm2
5385; SSE-NEXT:    por %xmm2, %xmm6
5386; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5387; SSE-NEXT:    movdqa %xmm1, %xmm0
5388; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15]
5389; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
5390; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,5]
5391; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,0,65535,65535,0]
5392; SSE-NEXT:    movdqa %xmm4, %xmm2
5393; SSE-NEXT:    pandn %xmm0, %xmm2
5394; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
5395; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,0,3]
5396; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
5397; SSE-NEXT:    pand %xmm4, %xmm0
5398; SSE-NEXT:    por %xmm2, %xmm0
5399; SSE-NEXT:    packuswb %xmm0, %xmm0
5400; SSE-NEXT:    movdqa %xmm9, %xmm2
5401; SSE-NEXT:    pandn %xmm0, %xmm2
5402; SSE-NEXT:    pand %xmm9, %xmm6
5403; SSE-NEXT:    por %xmm6, %xmm2
5404; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5405; SSE-NEXT:    movdqa %xmm13, %xmm0
5406; SSE-NEXT:    pxor %xmm1, %xmm1
5407; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
5408; SSE-NEXT:    punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm1[8],xmm13[9],xmm1[9],xmm13[10],xmm1[10],xmm13[11],xmm1[11],xmm13[12],xmm1[12],xmm13[13],xmm1[13],xmm13[14],xmm1[14],xmm13[15],xmm1[15]
5409; SSE-NEXT:    movdqa %xmm13, %xmm2
5410; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[3,0]
5411; SSE-NEXT:    movaps %xmm0, %xmm6
5412; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[0,1],xmm2[0,2]
5413; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm13[0,0]
5414; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm13[2,3]
5415; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,7,5,6,7]
5416; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2]
5417; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
5418; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
5419; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7]
5420; SSE-NEXT:    packuswb %xmm0, %xmm2
5421; SSE-NEXT:    movdqa %xmm14, %xmm0
5422; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
5423; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
5424; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7]
5425; SSE-NEXT:    movdqa %xmm3, %xmm6
5426; SSE-NEXT:    pandn %xmm0, %xmm6
5427; SSE-NEXT:    punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm1[8],xmm14[9],xmm1[9],xmm14[10],xmm1[10],xmm14[11],xmm1[11],xmm14[12],xmm1[12],xmm14[13],xmm1[13],xmm14[14],xmm1[14],xmm14[15],xmm1[15]
5428; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm14[0,3,2,1]
5429; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7]
5430; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
5431; SSE-NEXT:    pand %xmm3, %xmm0
5432; SSE-NEXT:    por %xmm6, %xmm0
5433; SSE-NEXT:    packuswb %xmm0, %xmm0
5434; SSE-NEXT:    movdqa %xmm5, %xmm6
5435; SSE-NEXT:    pandn %xmm0, %xmm6
5436; SSE-NEXT:    pand %xmm5, %xmm2
5437; SSE-NEXT:    por %xmm2, %xmm6
5438; SSE-NEXT:    movdqa %xmm11, %xmm0
5439; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
5440; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
5441; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,5]
5442; SSE-NEXT:    movdqa %xmm4, %xmm2
5443; SSE-NEXT:    pandn %xmm0, %xmm2
5444; SSE-NEXT:    punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3],xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7]
5445; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm11[0,2,0,3]
5446; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
5447; SSE-NEXT:    pand %xmm4, %xmm0
5448; SSE-NEXT:    por %xmm2, %xmm0
5449; SSE-NEXT:    packuswb %xmm0, %xmm0
5450; SSE-NEXT:    movdqa %xmm9, %xmm2
5451; SSE-NEXT:    pandn %xmm0, %xmm2
5452; SSE-NEXT:    pand %xmm9, %xmm6
5453; SSE-NEXT:    por %xmm6, %xmm2
5454; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5455; SSE-NEXT:    movdqa %xmm10, %xmm0
5456; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
5457; SSE-NEXT:    punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm1[8],xmm10[9],xmm1[9],xmm10[10],xmm1[10],xmm10[11],xmm1[11],xmm10[12],xmm1[12],xmm10[13],xmm1[13],xmm10[14],xmm1[14],xmm10[15],xmm1[15]
5458; SSE-NEXT:    movdqa %xmm10, %xmm2
5459; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[3,0]
5460; SSE-NEXT:    movaps %xmm0, %xmm6
5461; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[0,1],xmm2[0,2]
5462; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm10[0,0]
5463; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[2,3]
5464; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,7,5,6,7]
5465; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2]
5466; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
5467; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
5468; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7]
5469; SSE-NEXT:    packuswb %xmm0, %xmm2
5470; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
5471; SSE-NEXT:    movdqa %xmm7, %xmm0
5472; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
5473; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
5474; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7]
5475; SSE-NEXT:    movdqa %xmm3, %xmm6
5476; SSE-NEXT:    pandn %xmm0, %xmm6
5477; SSE-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15]
5478; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[0,3,2,1]
5479; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7]
5480; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
5481; SSE-NEXT:    pand %xmm3, %xmm0
5482; SSE-NEXT:    por %xmm6, %xmm0
5483; SSE-NEXT:    packuswb %xmm0, %xmm0
5484; SSE-NEXT:    movdqa %xmm5, %xmm6
5485; SSE-NEXT:    pandn %xmm0, %xmm6
5486; SSE-NEXT:    pand %xmm5, %xmm2
5487; SSE-NEXT:    por %xmm2, %xmm6
5488; SSE-NEXT:    movdqa %xmm8, %xmm0
5489; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
5490; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
5491; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,5]
5492; SSE-NEXT:    movdqa %xmm4, %xmm2
5493; SSE-NEXT:    pandn %xmm0, %xmm2
5494; SSE-NEXT:    punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7]
5495; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm8[0,2,0,3]
5496; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
5497; SSE-NEXT:    pand %xmm4, %xmm0
5498; SSE-NEXT:    por %xmm2, %xmm0
5499; SSE-NEXT:    packuswb %xmm0, %xmm2
5500; SSE-NEXT:    movdqa %xmm9, %xmm0
5501; SSE-NEXT:    pandn %xmm2, %xmm0
5502; SSE-NEXT:    pand %xmm9, %xmm6
5503; SSE-NEXT:    por %xmm6, %xmm0
5504; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5505; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5506; SSE-NEXT:    movdqa %xmm0, %xmm2
5507; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
5508; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
5509; SSE-NEXT:    movdqa %xmm0, %xmm6
5510; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[3,0]
5511; SSE-NEXT:    movaps %xmm2, %xmm7
5512; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,1],xmm6[0,2]
5513; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0]
5514; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3]
5515; SSE-NEXT:    pshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,7,5,6,7]
5516; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2]
5517; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
5518; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
5519; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7]
5520; SSE-NEXT:    packuswb %xmm2, %xmm6
5521; SSE-NEXT:    movdqa %xmm15, %xmm2
5522; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
5523; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
5524; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,1,4,5,6,7]
5525; SSE-NEXT:    punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm1[8],xmm15[9],xmm1[9],xmm15[10],xmm1[10],xmm15[11],xmm1[11],xmm15[12],xmm1[12],xmm15[13],xmm1[13],xmm15[14],xmm1[14],xmm15[15],xmm1[15]
5526; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm15[0,3,2,1]
5527; SSE-NEXT:    pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,3,4,5,6,7]
5528; SSE-NEXT:    pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,7,7,7]
5529; SSE-NEXT:    pand %xmm3, %xmm7
5530; SSE-NEXT:    pandn %xmm2, %xmm3
5531; SSE-NEXT:    por %xmm7, %xmm3
5532; SSE-NEXT:    pand %xmm5, %xmm6
5533; SSE-NEXT:    packuswb %xmm3, %xmm3
5534; SSE-NEXT:    pandn %xmm3, %xmm5
5535; SSE-NEXT:    por %xmm6, %xmm5
5536; SSE-NEXT:    movdqa %xmm12, %xmm2
5537; SSE-NEXT:    pxor %xmm0, %xmm0
5538; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
5539; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
5540; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5]
5541; SSE-NEXT:    punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3],xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
5542; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm12[0,2,0,3]
5543; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7]
5544; SSE-NEXT:    pand %xmm4, %xmm3
5545; SSE-NEXT:    pandn %xmm2, %xmm4
5546; SSE-NEXT:    por %xmm3, %xmm4
5547; SSE-NEXT:    pand %xmm9, %xmm5
5548; SSE-NEXT:    packuswb %xmm4, %xmm2
5549; SSE-NEXT:    pandn %xmm2, %xmm9
5550; SSE-NEXT:    por %xmm5, %xmm9
5551; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5552; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5553; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
5554; SSE-NEXT:    movdqa %xmm0, %xmm1
5555; SSE-NEXT:    movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255]
5556; SSE-NEXT:    pand %xmm10, %xmm1
5557; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,1,2,3]
5558; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
5559; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
5560; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
5561; SSE-NEXT:    packuswb %xmm2, %xmm1
5562; SSE-NEXT:    movdqa {{.*#+}} xmm15 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255]
5563; SSE-NEXT:    movdqa %xmm15, %xmm2
5564; SSE-NEXT:    pandn %xmm1, %xmm2
5565; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
5566; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
5567; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm12[3,1,2,0]
5568; SSE-NEXT:    pand %xmm10, %xmm1
5569; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
5570; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3]
5571; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm1[2,1,0,3,4,5,6,7]
5572; SSE-NEXT:    packuswb %xmm6, %xmm6
5573; SSE-NEXT:    pand %xmm15, %xmm6
5574; SSE-NEXT:    por %xmm2, %xmm6
5575; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
5576; SSE-NEXT:    movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,0,65535,65535,0]
5577; SSE-NEXT:    pand %xmm11, %xmm13
5578; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
5579; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm13[0,2,1,3]
5580; SSE-NEXT:    pand %xmm10, %xmm1
5581; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7]
5582; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
5583; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
5584; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
5585; SSE-NEXT:    packuswb %xmm1, %xmm2
5586; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,0,0]
5587; SSE-NEXT:    movdqa %xmm1, %xmm3
5588; SSE-NEXT:    pandn %xmm2, %xmm3
5589; SSE-NEXT:    pand %xmm1, %xmm6
5590; SSE-NEXT:    por %xmm6, %xmm3
5591; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5592; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
5593; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
5594; SSE-NEXT:    movdqa %xmm14, %xmm2
5595; SSE-NEXT:    pand %xmm10, %xmm2
5596; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[2,1,2,3]
5597; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7]
5598; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
5599; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2]
5600; SSE-NEXT:    packuswb %xmm6, %xmm2
5601; SSE-NEXT:    movdqa %xmm15, %xmm6
5602; SSE-NEXT:    pandn %xmm2, %xmm6
5603; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5604; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
5605; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[3,1,2,0]
5606; SSE-NEXT:    pand %xmm10, %xmm2
5607; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
5608; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3]
5609; SSE-NEXT:    pshuflw {{.*#+}} xmm7 = xmm2[2,1,0,3,4,5,6,7]
5610; SSE-NEXT:    packuswb %xmm7, %xmm7
5611; SSE-NEXT:    pand %xmm15, %xmm7
5612; SSE-NEXT:    por %xmm6, %xmm7
5613; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5614; SSE-NEXT:    pand %xmm11, %xmm3
5615; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
5616; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,1,3]
5617; SSE-NEXT:    pand %xmm10, %xmm2
5618; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7]
5619; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3]
5620; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7]
5621; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7]
5622; SSE-NEXT:    packuswb %xmm2, %xmm6
5623; SSE-NEXT:    movdqa %xmm1, %xmm2
5624; SSE-NEXT:    pandn %xmm6, %xmm2
5625; SSE-NEXT:    pand %xmm1, %xmm7
5626; SSE-NEXT:    por %xmm7, %xmm2
5627; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5628; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
5629; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
5630; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5631; SSE-NEXT:    pand %xmm10, %xmm6
5632; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[2,1,2,3]
5633; SSE-NEXT:    pshuflw {{.*#+}} xmm7 = xmm7[2,1,2,3,4,5,6,7]
5634; SSE-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,7]
5635; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2]
5636; SSE-NEXT:    packuswb %xmm7, %xmm6
5637; SSE-NEXT:    movdqa %xmm15, %xmm7
5638; SSE-NEXT:    pandn %xmm6, %xmm7
5639; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
5640; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
5641; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5642; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[3,1,2,0]
5643; SSE-NEXT:    pand %xmm10, %xmm6
5644; SSE-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,6,7]
5645; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[2,1,0,3]
5646; SSE-NEXT:    pshuflw {{.*#+}} xmm8 = xmm6[2,1,0,3,4,5,6,7]
5647; SSE-NEXT:    packuswb %xmm8, %xmm8
5648; SSE-NEXT:    pand %xmm15, %xmm8
5649; SSE-NEXT:    por %xmm7, %xmm8
5650; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
5651; SSE-NEXT:    pand %xmm11, %xmm2
5652; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
5653; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5654; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[0,2,1,3]
5655; SSE-NEXT:    pand %xmm10, %xmm6
5656; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[0,1,2,1,4,5,6,7]
5657; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,2,1,3]
5658; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7]
5659; SSE-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,4,7]
5660; SSE-NEXT:    packuswb %xmm6, %xmm7
5661; SSE-NEXT:    movdqa %xmm1, %xmm2
5662; SSE-NEXT:    pandn %xmm7, %xmm2
5663; SSE-NEXT:    pand %xmm1, %xmm8
5664; SSE-NEXT:    por %xmm8, %xmm2
5665; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5666; SSE-NEXT:    movdqa (%rsp), %xmm7 # 16-byte Reload
5667; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
5668; SSE-NEXT:    movdqa %xmm7, (%rsp) # 16-byte Spill
5669; SSE-NEXT:    pand %xmm10, %xmm7
5670; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[2,1,2,3]
5671; SSE-NEXT:    pshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7]
5672; SSE-NEXT:    pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7]
5673; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[0,1,0,2]
5674; SSE-NEXT:    packuswb %xmm8, %xmm7
5675; SSE-NEXT:    movdqa %xmm15, %xmm8
5676; SSE-NEXT:    pandn %xmm7, %xmm8
5677; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
5678; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
5679; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[3,1,2,0]
5680; SSE-NEXT:    pand %xmm10, %xmm7
5681; SSE-NEXT:    pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,6,7]
5682; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[2,1,0,3]
5683; SSE-NEXT:    pshuflw {{.*#+}} xmm9 = xmm7[2,1,0,3,4,5,6,7]
5684; SSE-NEXT:    packuswb %xmm9, %xmm9
5685; SSE-NEXT:    pand %xmm15, %xmm9
5686; SSE-NEXT:    por %xmm8, %xmm9
5687; SSE-NEXT:    movdqa %xmm11, %xmm2
5688; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
5689; SSE-NEXT:    pand %xmm11, %xmm7
5690; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
5691; SSE-NEXT:    por %xmm7, %xmm2
5692; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[0,2,1,3]
5693; SSE-NEXT:    pand %xmm10, %xmm7
5694; SSE-NEXT:    pshuflw {{.*#+}} xmm7 = xmm7[0,1,2,1,4,5,6,7]
5695; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[0,2,1,3]
5696; SSE-NEXT:    pshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7]
5697; SSE-NEXT:    pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7]
5698; SSE-NEXT:    packuswb %xmm7, %xmm8
5699; SSE-NEXT:    movdqa %xmm1, %xmm7
5700; SSE-NEXT:    pandn %xmm8, %xmm7
5701; SSE-NEXT:    pand %xmm1, %xmm9
5702; SSE-NEXT:    por %xmm9, %xmm7
5703; SSE-NEXT:    movdqa %xmm0, %xmm8
5704; SSE-NEXT:    pxor %xmm5, %xmm5
5705; SSE-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15]
5706; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
5707; SSE-NEXT:    movdqa %xmm0, %xmm9
5708; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm8[0,0]
5709; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm8[2,3]
5710; SSE-NEXT:    psrlq $48, %xmm8
5711; SSE-NEXT:    psrldq {{.*#+}} xmm9 = xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
5712; SSE-NEXT:    punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
5713; SSE-NEXT:    pshuflw {{.*#+}} xmm8 = xmm0[3,1,2,3,4,5,6,7]
5714; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3]
5715; SSE-NEXT:    pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,5,7]
5716; SSE-NEXT:    packuswb %xmm9, %xmm8
5717; SSE-NEXT:    movdqa %xmm15, %xmm10
5718; SSE-NEXT:    pandn %xmm8, %xmm10
5719; SSE-NEXT:    movdqa %xmm12, %xmm8
5720; SSE-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15]
5721; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[1,1,2,3]
5722; SSE-NEXT:    pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5]
5723; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,0,65535,65535,65535]
5724; SSE-NEXT:    movdqa %xmm0, %xmm11
5725; SSE-NEXT:    pandn %xmm8, %xmm11
5726; SSE-NEXT:    punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3],xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7]
5727; SSE-NEXT:    pshufhw {{.*#+}} xmm8 = xmm12[0,1,2,3,7,5,6,7]
5728; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
5729; SSE-NEXT:    pshuflw {{.*#+}} xmm12 = xmm8[3,1,1,2,4,5,6,7]
5730; SSE-NEXT:    pand %xmm0, %xmm12
5731; SSE-NEXT:    por %xmm11, %xmm12
5732; SSE-NEXT:    packuswb %xmm12, %xmm12
5733; SSE-NEXT:    pand %xmm15, %xmm12
5734; SSE-NEXT:    por %xmm10, %xmm12
5735; SSE-NEXT:    movdqa %xmm13, %xmm8
5736; SSE-NEXT:    punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm5[8],xmm13[9],xmm5[9],xmm13[10],xmm5[10],xmm13[11],xmm5[11],xmm13[12],xmm5[12],xmm13[13],xmm5[13],xmm13[14],xmm5[14],xmm13[15],xmm5[15]
5737; SSE-NEXT:    pshufhw {{.*#+}} xmm10 = xmm13[0,1,2,3,7,5,6,7]
5738; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[0,1,2,0]
5739; SSE-NEXT:    pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,7,4]
5740; SSE-NEXT:    movdqa {{.*#+}} xmm11 = [65535,65535,65535,65535,0,65535,0,0]
5741; SSE-NEXT:    movdqa %xmm11, %xmm13
5742; SSE-NEXT:    pandn %xmm10, %xmm13
5743; SSE-NEXT:    punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7]
5744; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,3,1,1]
5745; SSE-NEXT:    pshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7]
5746; SSE-NEXT:    pand %xmm11, %xmm8
5747; SSE-NEXT:    por %xmm8, %xmm13
5748; SSE-NEXT:    packuswb %xmm13, %xmm10
5749; SSE-NEXT:    movdqa %xmm1, %xmm8
5750; SSE-NEXT:    pandn %xmm10, %xmm8
5751; SSE-NEXT:    pand %xmm1, %xmm12
5752; SSE-NEXT:    por %xmm12, %xmm8
5753; SSE-NEXT:    movdqa %xmm14, %xmm9
5754; SSE-NEXT:    movdqa %xmm14, %xmm10
5755; SSE-NEXT:    punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm5[8],xmm10[9],xmm5[9],xmm10[10],xmm5[10],xmm10[11],xmm5[11],xmm10[12],xmm5[12],xmm10[13],xmm5[13],xmm10[14],xmm5[14],xmm10[15],xmm5[15]
5756; SSE-NEXT:    punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7]
5757; SSE-NEXT:    movdqa %xmm9, %xmm12
5758; SSE-NEXT:    shufps {{.*#+}} xmm9 = xmm9[1,0],xmm10[0,0]
5759; SSE-NEXT:    shufps {{.*#+}} xmm9 = xmm9[2,0],xmm10[2,3]
5760; SSE-NEXT:    psrlq $48, %xmm10
5761; SSE-NEXT:    psrldq {{.*#+}} xmm12 = xmm12[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
5762; SSE-NEXT:    punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
5763; SSE-NEXT:    pshuflw {{.*#+}} xmm10 = xmm9[3,1,2,3,4,5,6,7]
5764; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[0,1,0,3]
5765; SSE-NEXT:    pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,5,7]
5766; SSE-NEXT:    packuswb %xmm12, %xmm10
5767; SSE-NEXT:    movdqa %xmm15, %xmm12
5768; SSE-NEXT:    pandn %xmm10, %xmm12
5769; SSE-NEXT:    movdqa %xmm4, %xmm10
5770; SSE-NEXT:    punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm5[8],xmm10[9],xmm5[9],xmm10[10],xmm5[10],xmm10[11],xmm5[11],xmm10[12],xmm5[12],xmm10[13],xmm5[13],xmm10[14],xmm5[14],xmm10[15],xmm5[15]
5771; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[1,1,2,3]
5772; SSE-NEXT:    pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5]
5773; SSE-NEXT:    movdqa %xmm0, %xmm14
5774; SSE-NEXT:    pandn %xmm10, %xmm14
5775; SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
5776; SSE-NEXT:    pshufhw {{.*#+}} xmm10 = xmm4[0,1,2,3,7,5,6,7]
5777; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3]
5778; SSE-NEXT:    pshuflw {{.*#+}} xmm13 = xmm10[3,1,1,2,4,5,6,7]
5779; SSE-NEXT:    pand %xmm0, %xmm13
5780; SSE-NEXT:    por %xmm14, %xmm13
5781; SSE-NEXT:    packuswb %xmm13, %xmm13
5782; SSE-NEXT:    pand %xmm15, %xmm13
5783; SSE-NEXT:    por %xmm12, %xmm13
5784; SSE-NEXT:    movdqa %xmm3, %xmm10
5785; SSE-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
5786; SSE-NEXT:    pshufhw {{.*#+}} xmm12 = xmm3[0,1,2,3,7,5,6,7]
5787; SSE-NEXT:    pshufd {{.*#+}} xmm12 = xmm12[0,1,2,0]
5788; SSE-NEXT:    pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,5,7,4]
5789; SSE-NEXT:    movdqa %xmm11, %xmm14
5790; SSE-NEXT:    pandn %xmm12, %xmm14
5791; SSE-NEXT:    punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3],xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7]
5792; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[0,3,1,1]
5793; SSE-NEXT:    pshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7]
5794; SSE-NEXT:    pand %xmm11, %xmm10
5795; SSE-NEXT:    por %xmm10, %xmm14
5796; SSE-NEXT:    packuswb %xmm14, %xmm10
5797; SSE-NEXT:    movdqa %xmm1, %xmm12
5798; SSE-NEXT:    pandn %xmm10, %xmm12
5799; SSE-NEXT:    pand %xmm1, %xmm13
5800; SSE-NEXT:    por %xmm13, %xmm12
5801; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
5802; SSE-NEXT:    movdqa %xmm9, %xmm10
5803; SSE-NEXT:    pxor %xmm3, %xmm3
5804; SSE-NEXT:    punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm3[8],xmm10[9],xmm3[9],xmm10[10],xmm3[10],xmm10[11],xmm3[11],xmm10[12],xmm3[12],xmm10[13],xmm3[13],xmm10[14],xmm3[14],xmm10[15],xmm3[15]
5805; SSE-NEXT:    punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3],xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7]
5806; SSE-NEXT:    pxor %xmm4, %xmm4
5807; SSE-NEXT:    movdqa %xmm9, %xmm13
5808; SSE-NEXT:    shufps {{.*#+}} xmm9 = xmm9[1,0],xmm10[0,0]
5809; SSE-NEXT:    shufps {{.*#+}} xmm9 = xmm9[2,0],xmm10[2,3]
5810; SSE-NEXT:    psrlq $48, %xmm10
5811; SSE-NEXT:    psrldq {{.*#+}} xmm13 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
5812; SSE-NEXT:    punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3]
5813; SSE-NEXT:    pshuflw {{.*#+}} xmm10 = xmm9[3,1,2,3,4,5,6,7]
5814; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[0,1,0,3]
5815; SSE-NEXT:    pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,5,7]
5816; SSE-NEXT:    packuswb %xmm13, %xmm10
5817; SSE-NEXT:    movdqa %xmm15, %xmm13
5818; SSE-NEXT:    pandn %xmm10, %xmm13
5819; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5820; SSE-NEXT:    movdqa %xmm3, %xmm10
5821; SSE-NEXT:    punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15]
5822; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[1,1,2,3]
5823; SSE-NEXT:    pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5]
5824; SSE-NEXT:    movdqa %xmm0, %xmm9
5825; SSE-NEXT:    pandn %xmm10, %xmm9
5826; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
5827; SSE-NEXT:    pshufhw {{.*#+}} xmm10 = xmm3[0,1,2,3,7,5,6,7]
5828; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3]
5829; SSE-NEXT:    pshuflw {{.*#+}} xmm14 = xmm10[3,1,1,2,4,5,6,7]
5830; SSE-NEXT:    pand %xmm0, %xmm14
5831; SSE-NEXT:    por %xmm9, %xmm14
5832; SSE-NEXT:    packuswb %xmm14, %xmm14
5833; SSE-NEXT:    pand %xmm15, %xmm14
5834; SSE-NEXT:    por %xmm13, %xmm14
5835; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5836; SSE-NEXT:    movdqa %xmm3, %xmm9
5837; SSE-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
5838; SSE-NEXT:    pshufhw {{.*#+}} xmm10 = xmm3[0,1,2,3,7,5,6,7]
5839; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[0,1,2,0]
5840; SSE-NEXT:    pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,7,4]
5841; SSE-NEXT:    movdqa %xmm11, %xmm13
5842; SSE-NEXT:    pandn %xmm10, %xmm13
5843; SSE-NEXT:    punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3],xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
5844; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,3,1,1]
5845; SSE-NEXT:    pshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7]
5846; SSE-NEXT:    pand %xmm11, %xmm9
5847; SSE-NEXT:    por %xmm9, %xmm13
5848; SSE-NEXT:    packuswb %xmm13, %xmm9
5849; SSE-NEXT:    movdqa %xmm1, %xmm13
5850; SSE-NEXT:    pandn %xmm9, %xmm13
5851; SSE-NEXT:    pand %xmm1, %xmm14
5852; SSE-NEXT:    por %xmm14, %xmm13
5853; SSE-NEXT:    movdqa (%rsp), %xmm3 # 16-byte Reload
5854; SSE-NEXT:    movdqa %xmm3, %xmm9
5855; SSE-NEXT:    punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm4[8],xmm9[9],xmm4[9],xmm9[10],xmm4[10],xmm9[11],xmm4[11],xmm9[12],xmm4[12],xmm9[13],xmm4[13],xmm9[14],xmm4[14],xmm9[15],xmm4[15]
5856; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
5857; SSE-NEXT:    movdqa %xmm3, %xmm10
5858; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm9[0,0]
5859; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm9[2,3]
5860; SSE-NEXT:    psrlq $48, %xmm9
5861; SSE-NEXT:    psrldq {{.*#+}} xmm10 = xmm10[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
5862; SSE-NEXT:    punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
5863; SSE-NEXT:    pshuflw {{.*#+}} xmm9 = xmm3[3,1,2,3,4,5,6,7]
5864; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,1,0,3]
5865; SSE-NEXT:    pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,5,7]
5866; SSE-NEXT:    packuswb %xmm10, %xmm9
5867; SSE-NEXT:    movdqa %xmm6, %xmm10
5868; SSE-NEXT:    punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm5[8],xmm10[9],xmm5[9],xmm10[10],xmm5[10],xmm10[11],xmm5[11],xmm10[12],xmm5[12],xmm10[13],xmm5[13],xmm10[14],xmm5[14],xmm10[15],xmm5[15]
5869; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[1,1,2,3]
5870; SSE-NEXT:    pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5]
5871; SSE-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
5872; SSE-NEXT:    pshufhw {{.*#+}} xmm14 = xmm6[0,1,2,3,7,5,6,7]
5873; SSE-NEXT:    pshufd {{.*#+}} xmm14 = xmm14[0,2,2,3]
5874; SSE-NEXT:    pshuflw {{.*#+}} xmm14 = xmm14[3,1,1,2,4,5,6,7]
5875; SSE-NEXT:    pand %xmm0, %xmm14
5876; SSE-NEXT:    pandn %xmm10, %xmm0
5877; SSE-NEXT:    por %xmm14, %xmm0
5878; SSE-NEXT:    packuswb %xmm0, %xmm0
5879; SSE-NEXT:    pand %xmm15, %xmm0
5880; SSE-NEXT:    pandn %xmm9, %xmm15
5881; SSE-NEXT:    por %xmm15, %xmm0
5882; SSE-NEXT:    movdqa %xmm2, %xmm4
5883; SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
5884; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
5885; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,3,1,1]
5886; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7]
5887; SSE-NEXT:    pand %xmm11, %xmm4
5888; SSE-NEXT:    pshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,7,5,6,7]
5889; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0]
5890; SSE-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,7,4]
5891; SSE-NEXT:    pandn %xmm5, %xmm11
5892; SSE-NEXT:    por %xmm4, %xmm11
5893; SSE-NEXT:    pand %xmm1, %xmm0
5894; SSE-NEXT:    packuswb %xmm11, %xmm4
5895; SSE-NEXT:    pandn %xmm4, %xmm1
5896; SSE-NEXT:    por %xmm0, %xmm1
5897; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5898; SSE-NEXT:    movaps %xmm0, 16(%rsi)
5899; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5900; SSE-NEXT:    movaps %xmm0, 32(%rsi)
5901; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5902; SSE-NEXT:    movaps %xmm0, 48(%rsi)
5903; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5904; SSE-NEXT:    movaps %xmm0, (%rsi)
5905; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5906; SSE-NEXT:    movaps %xmm0, 16(%rdx)
5907; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5908; SSE-NEXT:    movaps %xmm0, 32(%rdx)
5909; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5910; SSE-NEXT:    movaps %xmm0, 48(%rdx)
5911; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5912; SSE-NEXT:    movaps %xmm0, (%rdx)
5913; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5914; SSE-NEXT:    movaps %xmm0, 16(%rcx)
5915; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5916; SSE-NEXT:    movaps %xmm0, 32(%rcx)
5917; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5918; SSE-NEXT:    movaps %xmm0, 48(%rcx)
5919; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5920; SSE-NEXT:    movaps %xmm0, (%rcx)
5921; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5922; SSE-NEXT:    movaps %xmm0, 16(%r8)
5923; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5924; SSE-NEXT:    movaps %xmm0, 32(%r8)
5925; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5926; SSE-NEXT:    movaps %xmm0, 48(%r8)
5927; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5928; SSE-NEXT:    movaps %xmm0, (%r8)
5929; SSE-NEXT:    movdqa %xmm7, 16(%r9)
5930; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5931; SSE-NEXT:    movaps %xmm0, 32(%r9)
5932; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5933; SSE-NEXT:    movaps %xmm0, 48(%r9)
5934; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5935; SSE-NEXT:    movaps %xmm0, (%r9)
5936; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
5937; SSE-NEXT:    movdqa %xmm1, 16(%rax)
5938; SSE-NEXT:    movdqa %xmm13, 32(%rax)
5939; SSE-NEXT:    movdqa %xmm12, 48(%rax)
5940; SSE-NEXT:    movdqa %xmm8, (%rax)
5941; SSE-NEXT:    addq $792, %rsp # imm = 0x318
5942; SSE-NEXT:    retq
5943;
5944; AVX-LABEL: load_i8_stride6_vf64:
5945; AVX:       # %bb.0:
5946; AVX-NEXT:    subq $616, %rsp # imm = 0x268
5947; AVX-NEXT:    vmovdqa (%rdi), %xmm2
5948; AVX-NEXT:    vmovdqa 16(%rdi), %xmm7
5949; AVX-NEXT:    vmovdqa 32(%rdi), %xmm15
5950; AVX-NEXT:    vmovdqa 48(%rdi), %xmm6
5951; AVX-NEXT:    vmovdqa 224(%rdi), %xmm8
5952; AVX-NEXT:    vmovdqa 240(%rdi), %xmm9
5953; AVX-NEXT:    vmovdqa 208(%rdi), %xmm10
5954; AVX-NEXT:    vmovdqa 192(%rdi), %xmm11
5955; AVX-NEXT:    vmovq {{.*#+}} xmm3 = [128,128,128,4,10,0,0,0,0,0,0,0,0,0,0,0]
5956; AVX-NEXT:    vmovq {{.*#+}} xmm12 = [2,8,14,128,128,0,0,0,0,0,0,0,0,0,0,0]
5957; AVX-NEXT:    vmovd {{.*#+}} xmm13 = [2,8,14,0,0,0,0,0,0,0,0,0,0,0,0,0]
5958; AVX-NEXT:    vpshufb %xmm13, %xmm6, %xmm0
5959; AVX-NEXT:    vbroadcastss {{.*#+}} xmm14 = [0,0,6,12,0,0,6,12,0,0,6,12,0,0,6,12]
5960; AVX-NEXT:    vpshufb %xmm14, %xmm15, %xmm1
5961; AVX-NEXT:    vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5962; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
5963; AVX-NEXT:    vpshufb %xmm3, %xmm7, %xmm0
5964; AVX-NEXT:    vmovdqa %xmm2, %xmm4
5965; AVX-NEXT:    vpshufb %xmm12, %xmm2, %xmm2
5966; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm2
5967; AVX-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,0,0,0,0,255,255,255,255,255,255,u,u,u,u,u]
5968; AVX-NEXT:    vpblendvb %xmm5, %xmm1, %xmm2, %xmm1
5969; AVX-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5970; AVX-NEXT:    vpshufb %xmm13, %xmm9, %xmm1
5971; AVX-NEXT:    vpshufb %xmm14, %xmm8, %xmm2
5972; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
5973; AVX-NEXT:    vpshufb %xmm3, %xmm10, %xmm2
5974; AVX-NEXT:    vpshufb %xmm12, %xmm11, %xmm3
5975; AVX-NEXT:    vpor %xmm2, %xmm3, %xmm2
5976; AVX-NEXT:    vpblendvb %xmm5, %xmm1, %xmm2, %xmm1
5977; AVX-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5978; AVX-NEXT:    vmovd {{.*#+}} xmm13 = [3,9,15,0,0,0,0,0,0,0,0,0,0,0,0,0]
5979; AVX-NEXT:    vpshufb %xmm13, %xmm6, %xmm1
5980; AVX-NEXT:    vbroadcastss {{.*#+}} xmm0 = [0,1,7,13,0,1,7,13,0,1,7,13,0,1,7,13]
5981; AVX-NEXT:    vpshufb %xmm0, %xmm15, %xmm2
5982; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
5983; AVX-NEXT:    vmovq {{.*#+}} xmm15 = [128,128,128,5,11,0,0,0,0,0,0,0,0,0,0,0]
5984; AVX-NEXT:    vmovq {{.*#+}} xmm12 = [3,9,15,128,128,0,0,0,0,0,0,0,0,0,0,0]
5985; AVX-NEXT:    vpshufb %xmm15, %xmm7, %xmm2
5986; AVX-NEXT:    vmovdqa %xmm7, %xmm14
5987; AVX-NEXT:    vpshufb %xmm12, %xmm4, %xmm3
5988; AVX-NEXT:    vmovdqa %xmm4, %xmm7
5989; AVX-NEXT:    vpor %xmm2, %xmm3, %xmm2
5990; AVX-NEXT:    vpblendvb %xmm5, %xmm1, %xmm2, %xmm1
5991; AVX-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5992; AVX-NEXT:    vpshufb %xmm13, %xmm9, %xmm1
5993; AVX-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5994; AVX-NEXT:    vpshufb %xmm0, %xmm8, %xmm2
5995; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
5996; AVX-NEXT:    vpshufb %xmm15, %xmm10, %xmm2
5997; AVX-NEXT:    vpshufb %xmm12, %xmm11, %xmm3
5998; AVX-NEXT:    vpor %xmm2, %xmm3, %xmm2
5999; AVX-NEXT:    vpblendvb %xmm5, %xmm1, %xmm2, %xmm0
6000; AVX-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6001; AVX-NEXT:    vmovq {{.*#+}} xmm12 = [4,10,128,128,128,0,0,0,0,0,0,0,0,0,0,0]
6002; AVX-NEXT:    vpshufb %xmm12, %xmm4, %xmm0
6003; AVX-NEXT:    vmovq {{.*#+}} xmm13 = [128,128,0,6,12,0,0,0,0,0,0,0,0,0,0,0]
6004; AVX-NEXT:    vpshufb %xmm13, %xmm14, %xmm1
6005; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm1
6006; AVX-NEXT:    vbroadcastss {{.*#+}} xmm4 = [4,10,0,0,4,10,0,0,4,10,0,0,4,10,0,0]
6007; AVX-NEXT:    vpshufb %xmm4, %xmm6, %xmm0
6008; AVX-NEXT:    vbroadcastss {{.*#+}} xmm3 = [0,2,8,14,0,2,8,14,0,2,8,14,0,2,8,14]
6009; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
6010; AVX-NEXT:    vpshufb %xmm3, %xmm5, %xmm2
6011; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
6012; AVX-NEXT:    vmovq {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
6013; AVX-NEXT:    vpblendvb %xmm0, %xmm1, %xmm2, %xmm1
6014; AVX-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6015; AVX-NEXT:    vpshufb %xmm12, %xmm11, %xmm1
6016; AVX-NEXT:    vpshufb %xmm13, %xmm10, %xmm2
6017; AVX-NEXT:    vmovdqa %xmm10, %xmm12
6018; AVX-NEXT:    vpor %xmm1, %xmm2, %xmm1
6019; AVX-NEXT:    vpshufb %xmm4, %xmm9, %xmm2
6020; AVX-NEXT:    vmovdqa %xmm9, %xmm15
6021; AVX-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6022; AVX-NEXT:    vpshufb %xmm3, %xmm8, %xmm3
6023; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
6024; AVX-NEXT:    vpblendvb %xmm0, %xmm1, %xmm2, %xmm1
6025; AVX-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6026; AVX-NEXT:    vmovq {{.*#+}} xmm8 = [5,11,128,128,128,0,0,0,0,0,0,0,0,0,0,0]
6027; AVX-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6028; AVX-NEXT:    vpshufb %xmm8, %xmm7, %xmm1
6029; AVX-NEXT:    vmovq {{.*#+}} xmm3 = [128,128,1,7,13,0,0,0,0,0,0,0,0,0,0,0]
6030; AVX-NEXT:    vmovdqa %xmm14, %xmm13
6031; AVX-NEXT:    vpshufb %xmm3, %xmm14, %xmm2
6032; AVX-NEXT:    vmovdqa %xmm3, %xmm14
6033; AVX-NEXT:    vpor %xmm1, %xmm2, %xmm1
6034; AVX-NEXT:    vbroadcastss {{.*#+}} xmm9 = [5,11,0,0,5,11,0,0,5,11,0,0,5,11,0,0]
6035; AVX-NEXT:    vpshufb %xmm9, %xmm6, %xmm2
6036; AVX-NEXT:    vmovdqa %xmm6, %xmm10
6037; AVX-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6038; AVX-NEXT:    vbroadcastss {{.*#+}} xmm4 = [0,3,9,15,0,3,9,15,0,3,9,15,0,3,9,15]
6039; AVX-NEXT:    vpshufb %xmm4, %xmm5, %xmm3
6040; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
6041; AVX-NEXT:    vpblendvb %xmm0, %xmm1, %xmm2, %xmm1
6042; AVX-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6043; AVX-NEXT:    vpshufb %xmm8, %xmm11, %xmm1
6044; AVX-NEXT:    vmovdqa %xmm11, %xmm8
6045; AVX-NEXT:    vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6046; AVX-NEXT:    vpshufb %xmm14, %xmm12, %xmm2
6047; AVX-NEXT:    vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6048; AVX-NEXT:    vpor %xmm1, %xmm2, %xmm1
6049; AVX-NEXT:    vpshufb %xmm9, %xmm15, %xmm2
6050; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
6051; AVX-NEXT:    vpshufb %xmm4, %xmm6, %xmm3
6052; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
6053; AVX-NEXT:    vpblendvb %xmm0, %xmm1, %xmm2, %xmm0
6054; AVX-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6055; AVX-NEXT:    vmovq {{.*#+}} xmm3 = [128,128,128,2,8,14,0,0,0,0,0,0,0,0,0,0]
6056; AVX-NEXT:    vmovdqa 112(%rdi), %xmm0
6057; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6058; AVX-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
6059; AVX-NEXT:    vmovq {{.*#+}} xmm15 = [0,6,12,128,128,128,0,0,0,0,0,0,0,0,0,0]
6060; AVX-NEXT:    vmovdqa 96(%rdi), %xmm1
6061; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6062; AVX-NEXT:    vpshufb %xmm15, %xmm1, %xmm1
6063; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
6064; AVX-NEXT:    vmovddup {{.*#+}} xmm11 = [0,0,0,128,128,128,4,10,0,0,0,128,128,128,4,10]
6065; AVX-NEXT:    # xmm11 = mem[0,0]
6066; AVX-NEXT:    vmovdqa 80(%rdi), %xmm1
6067; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6068; AVX-NEXT:    vpshufb %xmm11, %xmm1, %xmm1
6069; AVX-NEXT:    vmovdqa 64(%rdi), %xmm2
6070; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6071; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero
6072; AVX-NEXT:    vpor %xmm1, %xmm2, %xmm1
6073; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm2
6074; AVX-NEXT:    vmovd {{.*#+}} xmm14 = [0,0,4,10,0,0,0,0,0,0,0,0,0,0,0,0]
6075; AVX-NEXT:    vpshufb %xmm14, %xmm5, %xmm4
6076; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [0,6,12,0,0,6,12,0,0,6,12,0,0,6,12,0]
6077; AVX-NEXT:    vpshufb %xmm1, %xmm10, %xmm5
6078; AVX-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
6079; AVX-NEXT:    vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6080; AVX-NEXT:    vpshufb %xmm3, %xmm13, %xmm5
6081; AVX-NEXT:    vpshufb %xmm15, %xmm7, %xmm9
6082; AVX-NEXT:    vpor %xmm5, %xmm9, %xmm5
6083; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4,5],xmm5[6,7]
6084; AVX-NEXT:    vmovaps {{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255]
6085; AVX-NEXT:    vandnps %ymm2, %ymm13, %ymm2
6086; AVX-NEXT:    vandps %ymm4, %ymm13, %ymm4
6087; AVX-NEXT:    vorps %ymm2, %ymm4, %ymm9
6088; AVX-NEXT:    vmovdqa 128(%rdi), %xmm2
6089; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6090; AVX-NEXT:    vpshufb %xmm14, %xmm2, %xmm2
6091; AVX-NEXT:    vmovdqa 144(%rdi), %xmm4
6092; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6093; AVX-NEXT:    vpshufb %xmm1, %xmm4, %xmm4
6094; AVX-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
6095; AVX-NEXT:    vmovdqa 176(%rdi), %xmm2
6096; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6097; AVX-NEXT:    vpshufb %xmm11, %xmm2, %xmm2
6098; AVX-NEXT:    vmovdqa 160(%rdi), %xmm5
6099; AVX-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6100; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = [0,0,0,2,8,14,128,128,0,0,0,2,8,14,128,128]
6101; AVX-NEXT:    # xmm0 = mem[0,0]
6102; AVX-NEXT:    vpshufb %xmm0, %xmm5, %xmm10
6103; AVX-NEXT:    vpor %xmm2, %xmm10, %xmm10
6104; AVX-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255]
6105; AVX-NEXT:    vpblendvb %xmm2, %xmm4, %xmm10, %xmm10
6106; AVX-NEXT:    vmovdqa %ymm2, %ymm5
6107; AVX-NEXT:    vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0]
6108; AVX-NEXT:    vandps %ymm7, %ymm9, %ymm9
6109; AVX-NEXT:    vinsertf128 $1, %xmm10, %ymm0, %ymm10
6110; AVX-NEXT:    vandnps %ymm10, %ymm7, %ymm10
6111; AVX-NEXT:    vorps %ymm10, %ymm9, %ymm4
6112; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6113; AVX-NEXT:    vmovdqa 304(%rdi), %xmm2
6114; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6115; AVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm9
6116; AVX-NEXT:    vmovdqa 288(%rdi), %xmm2
6117; AVX-NEXT:    vmovdqa %xmm2, (%rsp) # 16-byte Spill
6118; AVX-NEXT:    vpshufb %xmm15, %xmm2, %xmm10
6119; AVX-NEXT:    vpor %xmm9, %xmm10, %xmm9
6120; AVX-NEXT:    vmovdqa 272(%rdi), %xmm2
6121; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6122; AVX-NEXT:    vpshufb %xmm11, %xmm2, %xmm10
6123; AVX-NEXT:    vmovdqa 256(%rdi), %xmm2
6124; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6125; AVX-NEXT:    vpshufb %xmm0, %xmm2, %xmm11
6126; AVX-NEXT:    vpor %xmm10, %xmm11, %xmm10
6127; AVX-NEXT:    vinsertf128 $1, %xmm9, %ymm10, %ymm9
6128; AVX-NEXT:    vpshufb %xmm14, %xmm6, %xmm10
6129; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
6130; AVX-NEXT:    vpshufb %xmm1, %xmm2, %xmm11
6131; AVX-NEXT:    vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
6132; AVX-NEXT:    vpshufb %xmm3, %xmm12, %xmm3
6133; AVX-NEXT:    vpshufb %xmm15, %xmm8, %xmm6
6134; AVX-NEXT:    vpor %xmm3, %xmm6, %xmm3
6135; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm10[3,4,5],xmm3[6,7]
6136; AVX-NEXT:    vandnps %ymm9, %ymm13, %ymm6
6137; AVX-NEXT:    vandps %ymm3, %ymm13, %ymm3
6138; AVX-NEXT:    vmovaps %ymm13, %ymm11
6139; AVX-NEXT:    vorps %ymm6, %ymm3, %ymm3
6140; AVX-NEXT:    vmovdqa 320(%rdi), %xmm4
6141; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6142; AVX-NEXT:    vpshufb %xmm14, %xmm4, %xmm0
6143; AVX-NEXT:    vmovdqa 336(%rdi), %xmm10
6144; AVX-NEXT:    vpshufb %xmm1, %xmm10, %xmm1
6145; AVX-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6146; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
6147; AVX-NEXT:    vmovdqa 368(%rdi), %xmm1
6148; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6149; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[4,10]
6150; AVX-NEXT:    vmovdqa 352(%rdi), %xmm4
6151; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6152; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero
6153; AVX-NEXT:    vpor %xmm1, %xmm6, %xmm1
6154; AVX-NEXT:    vpblendvb %xmm5, %xmm0, %xmm1, %xmm0
6155; AVX-NEXT:    vandps %ymm7, %ymm3, %ymm1
6156; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
6157; AVX-NEXT:    vandnps %ymm0, %ymm7, %ymm0
6158; AVX-NEXT:    vorps %ymm0, %ymm1, %ymm0
6159; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6160; AVX-NEXT:    vmovq {{.*#+}} xmm8 = [128,128,128,3,9,15,0,0,0,0,0,0,0,0,0,0]
6161; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6162; AVX-NEXT:    vpshufb %xmm8, %xmm0, %xmm0
6163; AVX-NEXT:    vmovq {{.*#+}} xmm4 = [1,7,13,128,128,128,0,0,0,0,0,0,0,0,0,0]
6164; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6165; AVX-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
6166; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
6167; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6168; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[5,11]
6169; AVX-NEXT:    vmovddup {{.*#+}} xmm9 = [0,0,0,3,9,15,128,128,0,0,0,3,9,15,128,128]
6170; AVX-NEXT:    # xmm9 = mem[0,0]
6171; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
6172; AVX-NEXT:    vpshufb %xmm9, %xmm5, %xmm14
6173; AVX-NEXT:    vpor %xmm1, %xmm14, %xmm1
6174; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm14
6175; AVX-NEXT:    vmovd {{.*#+}} xmm3 = [0,0,5,11,0,0,0,0,0,0,0,0,0,0,0,0]
6176; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6177; AVX-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
6178; AVX-NEXT:    vbroadcastss {{.*#+}} xmm7 = [1,7,13,0,1,7,13,0,1,7,13,0,1,7,13,0]
6179; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
6180; AVX-NEXT:    vpshufb %xmm7, %xmm6, %xmm15
6181; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
6182; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
6183; AVX-NEXT:    vpshufb %xmm8, %xmm6, %xmm15
6184; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
6185; AVX-NEXT:    vpshufb %xmm4, %xmm6, %xmm13
6186; AVX-NEXT:    vpor %xmm15, %xmm13, %xmm13
6187; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3,4,5],xmm13[6,7]
6188; AVX-NEXT:    vandnps %ymm14, %ymm11, %ymm13
6189; AVX-NEXT:    vandps %ymm0, %ymm11, %ymm0
6190; AVX-NEXT:    vorps %ymm0, %ymm13, %ymm0
6191; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
6192; AVX-NEXT:    vpshufb %xmm3, %xmm12, %xmm13
6193; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
6194; AVX-NEXT:    vpshufb %xmm7, %xmm11, %xmm14
6195; AVX-NEXT:    vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
6196; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
6197; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = [0,0,0,128,128,128,5,11,0,0,0,128,128,128,5,11]
6198; AVX-NEXT:    # xmm1 = mem[0,0]
6199; AVX-NEXT:    vpshufb %xmm1, %xmm6, %xmm14
6200; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
6201; AVX-NEXT:    vpshufb %xmm9, %xmm15, %xmm15
6202; AVX-NEXT:    vpor %xmm14, %xmm15, %xmm14
6203; AVX-NEXT:    vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255]
6204; AVX-NEXT:    vpblendvb %xmm15, %xmm13, %xmm14, %xmm13
6205; AVX-NEXT:    vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0]
6206; AVX-NEXT:    vandps %ymm0, %ymm15, %ymm0
6207; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm0, %ymm13
6208; AVX-NEXT:    vandnps %ymm13, %ymm15, %ymm13
6209; AVX-NEXT:    vorps %ymm0, %ymm13, %ymm0
6210; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6211; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6212; AVX-NEXT:    vpshufb %xmm8, %xmm0, %xmm0
6213; AVX-NEXT:    vmovdqa (%rsp), %xmm13 # 16-byte Reload
6214; AVX-NEXT:    vpshufb %xmm4, %xmm13, %xmm13
6215; AVX-NEXT:    vpor %xmm0, %xmm13, %xmm0
6216; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
6217; AVX-NEXT:    vpshufb %xmm1, %xmm13, %xmm13
6218; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
6219; AVX-NEXT:    vpshufb %xmm9, %xmm14, %xmm14
6220; AVX-NEXT:    vpor %xmm13, %xmm14, %xmm13
6221; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm13, %ymm0
6222; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
6223; AVX-NEXT:    vpshufb %xmm3, %xmm13, %xmm13
6224; AVX-NEXT:    vpshufb %xmm7, %xmm2, %xmm14
6225; AVX-NEXT:    vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
6226; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
6227; AVX-NEXT:    vpshufb %xmm8, %xmm2, %xmm8
6228; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
6229; AVX-NEXT:    vpshufb %xmm4, %xmm2, %xmm4
6230; AVX-NEXT:    vpor %xmm4, %xmm8, %xmm4
6231; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm13[3,4,5],xmm4[6,7]
6232; AVX-NEXT:    vmovaps {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255]
6233; AVX-NEXT:    vandnps %ymm0, %ymm1, %ymm0
6234; AVX-NEXT:    vandps %ymm1, %ymm4, %ymm4
6235; AVX-NEXT:    vorps %ymm0, %ymm4, %ymm0
6236; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
6237; AVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm3
6238; AVX-NEXT:    vpshufb %xmm7, %xmm10, %xmm1
6239; AVX-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
6240; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
6241; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm14[5,11]
6242; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
6243; AVX-NEXT:    vpshufb %xmm9, %xmm4, %xmm4
6244; AVX-NEXT:    vpor %xmm3, %xmm4, %xmm3
6245; AVX-NEXT:    vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255]
6246; AVX-NEXT:    vpblendvb %xmm8, %xmm1, %xmm3, %xmm1
6247; AVX-NEXT:    vandps %ymm0, %ymm15, %ymm0
6248; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
6249; AVX-NEXT:    vandnps %ymm1, %ymm15, %ymm1
6250; AVX-NEXT:    vorps %ymm1, %ymm0, %ymm0
6251; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6252; AVX-NEXT:    vmovq {{.*#+}} xmm9 = [128,128,128,4,10,0,0,0,0,0,0,0,0,0,0,0]
6253; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
6254; AVX-NEXT:    vpshufb %xmm9, %xmm10, %xmm0
6255; AVX-NEXT:    vmovq {{.*#+}} xmm13 = [2,8,14,128,128,0,0,0,0,0,0,0,0,0,0,0]
6256; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6257; AVX-NEXT:    vpshufb %xmm13, %xmm1, %xmm1
6258; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm1
6259; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = [0,0,0,4,10,128,128,128,0,0,0,4,10,128,128,128]
6260; AVX-NEXT:    # xmm0 = mem[0,0]
6261; AVX-NEXT:    vpshufb %xmm0, %xmm5, %xmm4
6262; AVX-NEXT:    vmovddup {{.*#+}} xmm3 = [0,0,0,128,128,0,6,12,0,0,0,128,128,0,6,12]
6263; AVX-NEXT:    # xmm3 = mem[0,0]
6264; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
6265; AVX-NEXT:    vpshufb %xmm3, %xmm5, %xmm5
6266; AVX-NEXT:    vpor %xmm4, %xmm5, %xmm4
6267; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm4, %ymm1
6268; AVX-NEXT:    vmovdqa %ymm8, %ymm9
6269; AVX-NEXT:    vandnps %ymm1, %ymm8, %ymm1
6270; AVX-NEXT:    vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm4 # 32-byte Folded Reload
6271; AVX-NEXT:    vorps %ymm1, %ymm4, %ymm4
6272; AVX-NEXT:    vmovd {{.*#+}} xmm13 = [2,8,14,0,0,0,0,0,0,0,0,0,0,0,0,0]
6273; AVX-NEXT:    vpshufb %xmm13, %xmm11, %xmm1
6274; AVX-NEXT:    vbroadcastss {{.*#+}} xmm8 = [0,0,6,12,0,0,6,12,0,0,6,12,0,0,6,12]
6275; AVX-NEXT:    vpshufb %xmm8, %xmm12, %xmm5
6276; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0]
6277; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
6278; AVX-NEXT:    vpshufb %xmm0, %xmm7, %xmm5
6279; AVX-NEXT:    vmovdqa %xmm6, %xmm15
6280; AVX-NEXT:    vpshufb %xmm3, %xmm6, %xmm6
6281; AVX-NEXT:    vpor %xmm5, %xmm6, %xmm5
6282; AVX-NEXT:    vpblendvb %xmm9, %xmm1, %xmm5, %xmm5
6283; AVX-NEXT:    vmovaps {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
6284; AVX-NEXT:    vandps %ymm1, %ymm4, %ymm4
6285; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
6286; AVX-NEXT:    vandnps %ymm5, %ymm1, %ymm5
6287; AVX-NEXT:    vorps %ymm5, %ymm4, %ymm1
6288; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6289; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
6290; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm12[4,10,u,u,u,u,u,u,u,u,u,u,u]
6291; AVX-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
6292; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm1[2,8,14],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
6293; AVX-NEXT:    vpor %xmm4, %xmm5, %xmm4
6294; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
6295; AVX-NEXT:    vpshufb %xmm0, %xmm11, %xmm5
6296; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
6297; AVX-NEXT:    vpshufb %xmm3, %xmm6, %xmm6
6298; AVX-NEXT:    vpor %xmm5, %xmm6, %xmm5
6299; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm5, %ymm4
6300; AVX-NEXT:    vandnps %ymm4, %ymm9, %ymm4
6301; AVX-NEXT:    vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm5 # 32-byte Folded Reload
6302; AVX-NEXT:    vorps %ymm4, %ymm5, %ymm4
6303; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6304; AVX-NEXT:    vpshufb %xmm13, %xmm1, %xmm5
6305; AVX-NEXT:    vpshufb %xmm8, %xmm2, %xmm6
6306; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm6[0],xmm5[0]
6307; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
6308; AVX-NEXT:    vpshufb %xmm0, %xmm2, %xmm0
6309; AVX-NEXT:    vpshufb %xmm3, %xmm14, %xmm3
6310; AVX-NEXT:    vpor %xmm0, %xmm3, %xmm0
6311; AVX-NEXT:    vpblendvb %xmm9, %xmm5, %xmm0, %xmm0
6312; AVX-NEXT:    vmovaps {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
6313; AVX-NEXT:    vandps %ymm5, %ymm4, %ymm3
6314; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
6315; AVX-NEXT:    vandnps %ymm0, %ymm5, %ymm0
6316; AVX-NEXT:    vorps %ymm0, %ymm3, %ymm0
6317; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6318; AVX-NEXT:    vmovq {{.*#+}} xmm14 = [128,128,128,5,11,0,0,0,0,0,0,0,0,0,0,0]
6319; AVX-NEXT:    vpshufb %xmm14, %xmm10, %xmm0
6320; AVX-NEXT:    vmovq {{.*#+}} xmm13 = [3,9,15,128,128,0,0,0,0,0,0,0,0,0,0,0]
6321; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
6322; AVX-NEXT:    vpshufb %xmm13, %xmm14, %xmm3
6323; AVX-NEXT:    vpor %xmm0, %xmm3, %xmm0
6324; AVX-NEXT:    vmovddup {{.*#+}} xmm3 = [0,0,0,5,11,128,128,128,0,0,0,5,11,128,128,128]
6325; AVX-NEXT:    # xmm3 = mem[0,0]
6326; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
6327; AVX-NEXT:    vpshufb %xmm3, %xmm4, %xmm4
6328; AVX-NEXT:    vmovddup {{.*#+}} xmm5 = [0,0,0,128,128,1,7,13,0,0,0,128,128,1,7,13]
6329; AVX-NEXT:    # xmm5 = mem[0,0]
6330; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
6331; AVX-NEXT:    vpshufb %xmm5, %xmm10, %xmm6
6332; AVX-NEXT:    vpor %xmm4, %xmm6, %xmm4
6333; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm4, %ymm0
6334; AVX-NEXT:    vmovdqa %ymm9, %ymm13
6335; AVX-NEXT:    vandnps %ymm0, %ymm9, %ymm0
6336; AVX-NEXT:    vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload
6337; AVX-NEXT:    vorps %ymm0, %ymm4, %ymm0
6338; AVX-NEXT:    vmovd {{.*#+}} xmm8 = [3,9,15,0,0,0,0,0,0,0,0,0,0,0,0,0]
6339; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
6340; AVX-NEXT:    vpshufb %xmm8, %xmm4, %xmm4
6341; AVX-NEXT:    vbroadcastss {{.*#+}} xmm9 = [0,1,7,13,0,1,7,13,0,1,7,13,0,1,7,13]
6342; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
6343; AVX-NEXT:    vpshufb %xmm9, %xmm6, %xmm6
6344; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0]
6345; AVX-NEXT:    vpshufb %xmm3, %xmm7, %xmm6
6346; AVX-NEXT:    vpshufb %xmm5, %xmm15, %xmm7
6347; AVX-NEXT:    vmovdqa %xmm15, %xmm8
6348; AVX-NEXT:    vpor %xmm6, %xmm7, %xmm6
6349; AVX-NEXT:    vpblendvb %xmm13, %xmm4, %xmm6, %xmm4
6350; AVX-NEXT:    vmovaps {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
6351; AVX-NEXT:    vandps %ymm7, %ymm0, %ymm0
6352; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
6353; AVX-NEXT:    vandnps %ymm4, %ymm7, %ymm4
6354; AVX-NEXT:    vorps %ymm4, %ymm0, %ymm0
6355; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6356; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm12[5,11,u,u,u,u,u,u,u,u,u,u,u]
6357; AVX-NEXT:    vmovdqa (%rsp), %xmm12 # 16-byte Reload
6358; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm12[3,9,15],zero,zero,xmm12[u,u,u,u,u,u,u,u,u,u,u]
6359; AVX-NEXT:    vpor %xmm0, %xmm4, %xmm0
6360; AVX-NEXT:    vpshufb %xmm3, %xmm11, %xmm4
6361; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
6362; AVX-NEXT:    vpshufb %xmm5, %xmm15, %xmm6
6363; AVX-NEXT:    vpor %xmm4, %xmm6, %xmm4
6364; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm4, %ymm0
6365; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm1[3,9,15,u,u,u,u,u,u,u,u,u,u,u,u,u]
6366; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6367; AVX-NEXT:    vpshufb %xmm9, %xmm1, %xmm6
6368; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0]
6369; AVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm3
6370; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
6371; AVX-NEXT:    vpshufb %xmm5, %xmm11, %xmm5
6372; AVX-NEXT:    vpor %xmm3, %xmm5, %xmm3
6373; AVX-NEXT:    vpblendvb %xmm13, %xmm4, %xmm3, %xmm3
6374; AVX-NEXT:    vandnps %ymm0, %ymm13, %ymm0
6375; AVX-NEXT:    vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload
6376; AVX-NEXT:    vorps %ymm0, %ymm2, %ymm0
6377; AVX-NEXT:    vandps %ymm7, %ymm0, %ymm0
6378; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm2
6379; AVX-NEXT:    vandnps %ymm2, %ymm7, %ymm2
6380; AVX-NEXT:    vorps %ymm2, %ymm0, %ymm0
6381; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6382; AVX-NEXT:    vmovq {{.*#+}} xmm1 = [4,10,128,128,128,0,0,0,0,0,0,0,0,0,0,0]
6383; AVX-NEXT:    vpshufb %xmm1, %xmm14, %xmm0
6384; AVX-NEXT:    vmovq {{.*#+}} xmm14 = [128,128,0,6,12,0,0,0,0,0,0,0,0,0,0,0]
6385; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
6386; AVX-NEXT:    vpshufb %xmm14, %xmm13, %xmm2
6387; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
6388; AVX-NEXT:    vmovddup {{.*#+}} xmm3 = [0,0,128,128,128,2,8,14,0,0,128,128,128,2,8,14]
6389; AVX-NEXT:    # xmm3 = mem[0,0]
6390; AVX-NEXT:    vpshufb %xmm3, %xmm10, %xmm2
6391; AVX-NEXT:    vmovddup {{.*#+}} xmm4 = [0,0,0,6,12,128,128,128,0,0,0,6,12,128,128,128]
6392; AVX-NEXT:    # xmm4 = mem[0,0]
6393; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
6394; AVX-NEXT:    vpshufb %xmm4, %xmm5, %xmm5
6395; AVX-NEXT:    vpor %xmm2, %xmm5, %xmm2
6396; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm2
6397; AVX-NEXT:    vmovaps {{.*#+}} ymm10 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
6398; AVX-NEXT:    vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm5 # 32-byte Folded Reload
6399; AVX-NEXT:    vandps %ymm2, %ymm10, %ymm2
6400; AVX-NEXT:    vorps %ymm5, %ymm2, %ymm2
6401; AVX-NEXT:    vpshufb %xmm3, %xmm8, %xmm5
6402; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6403; AVX-NEXT:    vpshufb %xmm4, %xmm0, %xmm6
6404; AVX-NEXT:    vpor %xmm5, %xmm6, %xmm5
6405; AVX-NEXT:    vbroadcastss {{.*#+}} xmm8 = [4,10,0,0,4,10,0,0,4,10,0,0,4,10,0,0]
6406; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
6407; AVX-NEXT:    vpshufb %xmm8, %xmm14, %xmm6
6408; AVX-NEXT:    vbroadcastss {{.*#+}} xmm9 = [0,2,8,14,0,2,8,14,0,2,8,14,0,2,8,14]
6409; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6410; AVX-NEXT:    vpshufb %xmm9, %xmm0, %xmm7
6411; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm6 = xmm7[1],xmm6[1]
6412; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7]
6413; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
6414; AVX-NEXT:    vandps %ymm0, %ymm2, %ymm2
6415; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
6416; AVX-NEXT:    vandnps %ymm5, %ymm0, %ymm5
6417; AVX-NEXT:    vorps %ymm5, %ymm2, %ymm2
6418; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6419; AVX-NEXT:    vpshufb %xmm1, %xmm12, %xmm5
6420; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6421; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,xmm1[0,6,12,u,u,u,u,u,u,u,u,u,u,u]
6422; AVX-NEXT:    vpor %xmm5, %xmm6, %xmm5
6423; AVX-NEXT:    vpshufb %xmm3, %xmm15, %xmm6
6424; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6425; AVX-NEXT:    vpshufb %xmm4, %xmm1, %xmm7
6426; AVX-NEXT:    vpor %xmm6, %xmm7, %xmm6
6427; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm6, %ymm5
6428; AVX-NEXT:    vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm6 # 32-byte Folded Reload
6429; AVX-NEXT:    vandps %ymm5, %ymm10, %ymm5
6430; AVX-NEXT:    vorps %ymm6, %ymm5, %ymm5
6431; AVX-NEXT:    vpshufb %xmm3, %xmm11, %xmm3
6432; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
6433; AVX-NEXT:    vpshufb %xmm4, %xmm11, %xmm4
6434; AVX-NEXT:    vpor %xmm3, %xmm4, %xmm3
6435; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
6436; AVX-NEXT:    vpshufb %xmm8, %xmm12, %xmm4
6437; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6438; AVX-NEXT:    vpshufb %xmm9, %xmm1, %xmm6
6439; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm6[1],xmm4[1]
6440; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3,4],xmm3[5,6,7]
6441; AVX-NEXT:    vandps %ymm0, %ymm5, %ymm4
6442; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
6443; AVX-NEXT:    vandnps %ymm3, %ymm0, %ymm3
6444; AVX-NEXT:    vorps %ymm3, %ymm4, %ymm2
6445; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6446; AVX-NEXT:    vmovq {{.*#+}} xmm15 = [5,11,128,128,128,0,0,0,0,0,0,0,0,0,0,0]
6447; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
6448; AVX-NEXT:    vpshufb %xmm15, %xmm2, %xmm3
6449; AVX-NEXT:    vmovq {{.*#+}} xmm4 = [128,128,1,7,13,0,0,0,0,0,0,0,0,0,0,0]
6450; AVX-NEXT:    vpshufb %xmm4, %xmm13, %xmm5
6451; AVX-NEXT:    vpor %xmm3, %xmm5, %xmm3
6452; AVX-NEXT:    vmovddup {{.*#+}} xmm5 = [0,0,128,128,128,3,9,15,0,0,128,128,128,3,9,15]
6453; AVX-NEXT:    # xmm5 = mem[0,0]
6454; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
6455; AVX-NEXT:    vpshufb %xmm5, %xmm2, %xmm7
6456; AVX-NEXT:    vmovddup {{.*#+}} xmm6 = [0,0,1,7,13,128,128,128,0,0,1,7,13,128,128,128]
6457; AVX-NEXT:    # xmm6 = mem[0,0]
6458; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
6459; AVX-NEXT:    vpshufb %xmm6, %xmm2, %xmm8
6460; AVX-NEXT:    vpor %xmm7, %xmm8, %xmm7
6461; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm7, %ymm3
6462; AVX-NEXT:    vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm7 # 32-byte Folded Reload
6463; AVX-NEXT:    vandps %ymm3, %ymm10, %ymm3
6464; AVX-NEXT:    vorps %ymm7, %ymm3, %ymm3
6465; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
6466; AVX-NEXT:    vpshufb %xmm5, %xmm2, %xmm7
6467; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6468; AVX-NEXT:    vpshufb %xmm6, %xmm0, %xmm8
6469; AVX-NEXT:    vpor %xmm7, %xmm8, %xmm7
6470; AVX-NEXT:    vbroadcastss {{.*#+}} xmm13 = [5,11,0,0,5,11,0,0,5,11,0,0,5,11,0,0]
6471; AVX-NEXT:    vpshufb %xmm13, %xmm14, %xmm8
6472; AVX-NEXT:    vbroadcastss {{.*#+}} xmm14 = [0,3,9,15,0,3,9,15,0,3,9,15,0,3,9,15]
6473; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6474; AVX-NEXT:    vpshufb %xmm14, %xmm0, %xmm9
6475; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm8 = xmm9[1],xmm8[1]
6476; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3,4],xmm7[5,6,7]
6477; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
6478; AVX-NEXT:    vandps %ymm2, %ymm3, %ymm3
6479; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm7
6480; AVX-NEXT:    vandnps %ymm7, %ymm2, %ymm7
6481; AVX-NEXT:    vorps %ymm7, %ymm3, %ymm3
6482; AVX-NEXT:    vmovdqa (%rsp), %xmm7 # 16-byte Reload
6483; AVX-NEXT:    vpshufb %xmm15, %xmm7, %xmm7
6484; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
6485; AVX-NEXT:    vpshufb %xmm4, %xmm8, %xmm8
6486; AVX-NEXT:    vpor %xmm7, %xmm8, %xmm7
6487; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
6488; AVX-NEXT:    vpshufb %xmm5, %xmm4, %xmm8
6489; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
6490; AVX-NEXT:    vpshufb %xmm6, %xmm4, %xmm9
6491; AVX-NEXT:    vpor %xmm8, %xmm9, %xmm8
6492; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm8, %ymm7
6493; AVX-NEXT:    vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm8 # 32-byte Folded Reload
6494; AVX-NEXT:    vandps %ymm7, %ymm10, %ymm0
6495; AVX-NEXT:    vorps %ymm0, %ymm8, %ymm0
6496; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
6497; AVX-NEXT:    vpshufb %xmm5, %xmm4, %xmm5
6498; AVX-NEXT:    vpshufb %xmm6, %xmm11, %xmm6
6499; AVX-NEXT:    vpor %xmm5, %xmm6, %xmm5
6500; AVX-NEXT:    vpshufb %xmm13, %xmm12, %xmm6
6501; AVX-NEXT:    vpshufb %xmm14, %xmm1, %xmm7
6502; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm6 = xmm7[1],xmm6[1]
6503; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7]
6504; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
6505; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm5
6506; AVX-NEXT:    vandnps %ymm5, %ymm2, %ymm1
6507; AVX-NEXT:    vorps %ymm1, %ymm0, %ymm0
6508; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6509; AVX-NEXT:    vmovaps %ymm1, 32(%rsi)
6510; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6511; AVX-NEXT:    vmovaps %ymm1, (%rsi)
6512; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6513; AVX-NEXT:    vmovaps %ymm1, 32(%rdx)
6514; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6515; AVX-NEXT:    vmovaps %ymm1, (%rdx)
6516; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6517; AVX-NEXT:    vmovaps %ymm1, 32(%rcx)
6518; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6519; AVX-NEXT:    vmovaps %ymm1, (%rcx)
6520; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6521; AVX-NEXT:    vmovaps %ymm1, 32(%r8)
6522; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6523; AVX-NEXT:    vmovaps %ymm1, (%r8)
6524; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6525; AVX-NEXT:    vmovaps %ymm1, 32(%r9)
6526; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6527; AVX-NEXT:    vmovaps %ymm1, (%r9)
6528; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
6529; AVX-NEXT:    vmovaps %ymm0, 32(%rax)
6530; AVX-NEXT:    vmovaps %ymm3, (%rax)
6531; AVX-NEXT:    addq $616, %rsp # imm = 0x268
6532; AVX-NEXT:    vzeroupper
6533; AVX-NEXT:    retq
6534;
6535; AVX2-LABEL: load_i8_stride6_vf64:
6536; AVX2:       # %bb.0:
6537; AVX2-NEXT:    subq $328, %rsp # imm = 0x148
6538; AVX2-NEXT:    vmovdqa 192(%rdi), %ymm7
6539; AVX2-NEXT:    vmovdqa (%rdi), %ymm3
6540; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm5
6541; AVX2-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6542; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm0
6543; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm1
6544; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm13 = [0,65535,0,0,65535,0,0,65535,65535,0,0,65535,0,0,65535,0]
6545; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[0,1],ymm1[0,1]
6546; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6547; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
6548; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6549; AVX2-NEXT:    vpblendvb %ymm13, %ymm2, %ymm0, %ymm4
6550; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u]
6551; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
6552; AVX2-NEXT:    vpblendvb %ymm1, %ymm3, %ymm5, %ymm2
6553; AVX2-NEXT:    vmovdqa %ymm3, %ymm5
6554; AVX2-NEXT:    vmovdqu %ymm3, (%rsp) # 32-byte Spill
6555; AVX2-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
6556; AVX2-NEXT:    vpshufb %xmm10, %xmm2, %xmm9
6557; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
6558; AVX2-NEXT:    vmovdqa {{.*#+}} xmm11 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u]
6559; AVX2-NEXT:    vpshufb %xmm11, %xmm3, %xmm12
6560; AVX2-NEXT:    vpor %xmm9, %xmm12, %xmm9
6561; AVX2-NEXT:    vpmovsxdq {{.*#+}} xmm12 = [18446744073709551615,16777215]
6562; AVX2-NEXT:    vpblendvb %ymm12, %ymm9, %ymm0, %ymm0
6563; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6564; AVX2-NEXT:    vmovdqa 224(%rdi), %ymm8
6565; AVX2-NEXT:    vpblendvb %ymm1, %ymm7, %ymm8, %ymm14
6566; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6567; AVX2-NEXT:    vpshufb %xmm10, %xmm14, %xmm0
6568; AVX2-NEXT:    vextracti128 $1, %ymm14, %xmm15
6569; AVX2-NEXT:    vpshufb %xmm11, %xmm15, %xmm10
6570; AVX2-NEXT:    vpor %xmm0, %xmm10, %xmm1
6571; AVX2-NEXT:    vmovdqa 288(%rdi), %ymm11
6572; AVX2-NEXT:    vmovdqa 256(%rdi), %ymm0
6573; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm0[0,1],ymm11[0,1]
6574; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm0[2,3],ymm11[2,3]
6575; AVX2-NEXT:    vpblendvb %ymm13, %ymm9, %ymm11, %ymm13
6576; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u]
6577; AVX2-NEXT:    vpblendvb %ymm12, %ymm1, %ymm0, %ymm0
6578; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6579; AVX2-NEXT:    vmovdqa {{.*#+}} xmm0 = [1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u]
6580; AVX2-NEXT:    vpshufb %xmm0, %xmm2, %xmm1
6581; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u]
6582; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
6583; AVX2-NEXT:    vpor %xmm1, %xmm3, %xmm1
6584; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11]
6585; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm4
6586; AVX2-NEXT:    vpblendvb %ymm12, %ymm1, %ymm4, %ymm1
6587; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6588; AVX2-NEXT:    vpshufb %xmm0, %xmm14, %xmm0
6589; AVX2-NEXT:    vpshufb %xmm2, %xmm15, %xmm1
6590; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
6591; AVX2-NEXT:    vpshufb %ymm3, %ymm13, %ymm1
6592; AVX2-NEXT:    vpblendvb %ymm12, %ymm0, %ymm1, %ymm0
6593; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6594; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm13 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
6595; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
6596; AVX2-NEXT:    vpblendvb %ymm13, %ymm10, %ymm5, %ymm1
6597; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
6598; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u]
6599; AVX2-NEXT:    vpshufb %xmm6, %xmm2, %xmm3
6600; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u]
6601; AVX2-NEXT:    vpshufb %xmm4, %xmm1, %xmm5
6602; AVX2-NEXT:    vpor %xmm3, %xmm5, %xmm5
6603; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm14 = [2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12]
6604; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0]
6605; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6606; AVX2-NEXT:    vpblendvb %ymm0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
6607; AVX2-NEXT:    vpshufb %ymm14, %ymm3, %ymm15
6608; AVX2-NEXT:    vpblendvb %ymm12, %ymm5, %ymm15, %ymm5
6609; AVX2-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6610; AVX2-NEXT:    vpblendvb %ymm13, %ymm8, %ymm7, %ymm5
6611; AVX2-NEXT:    vmovdqa %ymm8, %ymm7
6612; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm15
6613; AVX2-NEXT:    vpshufb %xmm6, %xmm15, %xmm6
6614; AVX2-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
6615; AVX2-NEXT:    vpor %xmm6, %xmm4, %xmm4
6616; AVX2-NEXT:    vpblendvb %ymm0, %ymm11, %ymm9, %ymm0
6617; AVX2-NEXT:    vpshufb %ymm14, %ymm0, %ymm6
6618; AVX2-NEXT:    vpblendvb %ymm12, %ymm4, %ymm6, %ymm4
6619; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6620; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u]
6621; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
6622; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = [3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u]
6623; AVX2-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
6624; AVX2-NEXT:    vpor %xmm2, %xmm1, %xmm1
6625; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13]
6626; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm3
6627; AVX2-NEXT:    vpblendvb %ymm12, %ymm1, %ymm3, %ymm1
6628; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6629; AVX2-NEXT:    vpshufb %xmm4, %xmm15, %xmm1
6630; AVX2-NEXT:    vpshufb %xmm6, %xmm5, %xmm3
6631; AVX2-NEXT:    vpor %xmm1, %xmm3, %xmm1
6632; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
6633; AVX2-NEXT:    vpblendvb %ymm12, %ymm1, %ymm0, %ymm14
6634; AVX2-NEXT:    vmovdqa 160(%rdi), %ymm0
6635; AVX2-NEXT:    vmovdqa 128(%rdi), %ymm3
6636; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0]
6637; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm3, %ymm1
6638; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6639; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
6640; AVX2-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm5
6641; AVX2-NEXT:    vpblendvb %ymm13, %ymm0, %ymm3, %ymm15
6642; AVX2-NEXT:    vmovdqa 352(%rdi), %ymm4
6643; AVX2-NEXT:    vmovdqa 320(%rdi), %ymm6
6644; AVX2-NEXT:    vpblendvb %ymm1, %ymm6, %ymm4, %ymm1
6645; AVX2-NEXT:    vpblendvb %ymm13, %ymm4, %ymm6, %ymm12
6646; AVX2-NEXT:    vpblendvb %ymm2, %ymm4, %ymm6, %ymm0
6647; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6648; AVX2-NEXT:    vmovdqu (%rsp), %ymm0 # 32-byte Reload
6649; AVX2-NEXT:    vpblendvb %ymm2, %ymm10, %ymm0, %ymm8
6650; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6651; AVX2-NEXT:    vpblendvb %ymm2, %ymm7, %ymm0, %ymm10
6652; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,65535,65535,0,0,65535,0,0,65535,0]
6653; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6654; AVX2-NEXT:    vpblendvb %ymm2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
6655; AVX2-NEXT:    vpblendvb %ymm2, %ymm11, %ymm9, %ymm0
6656; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6657; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128]
6658; AVX2-NEXT:    vpshufb %xmm2, %xmm5, %xmm6
6659; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm11
6660; AVX2-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,128,128,128,2,8,14,128,128,0,6,12]
6661; AVX2-NEXT:    vpshufb %xmm0, %xmm11, %xmm9
6662; AVX2-NEXT:    vpor %xmm6, %xmm9, %xmm6
6663; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
6664; AVX2-NEXT:    vpmovsxwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9
6665; AVX2-NEXT:    vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload
6666; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6667; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm2
6668; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm6
6669; AVX2-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
6670; AVX2-NEXT:    vpor %xmm2, %xmm0, %xmm0
6671; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
6672; AVX2-NEXT:    vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
6673; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6674; AVX2-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,1,7,13,128,128,128,5,11,128,128,128]
6675; AVX2-NEXT:    vpshufb %xmm0, %xmm5, %xmm2
6676; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,128,128,128,3,9,15,128,128,1,7,13]
6677; AVX2-NEXT:    vpshufb %xmm5, %xmm11, %xmm11
6678; AVX2-NEXT:    vpor %xmm2, %xmm11, %xmm2
6679; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
6680; AVX2-NEXT:    vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
6681; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6682; AVX2-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
6683; AVX2-NEXT:    vpshufb %xmm5, %xmm6, %xmm1
6684; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
6685; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
6686; AVX2-NEXT:    vpblendvb %ymm9, %ymm14, %ymm0, %ymm0
6687; AVX2-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
6688; AVX2-NEXT:    vextracti128 $1, %ymm15, %xmm14
6689; AVX2-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14]
6690; AVX2-NEXT:    vpshufb %xmm7, %xmm14, %xmm0
6691; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128]
6692; AVX2-NEXT:    vpshufb %xmm2, %xmm15, %xmm1
6693; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm1
6694; AVX2-NEXT:    vextracti128 $1, %ymm8, %xmm3
6695; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = [128,128,0,6,12,128,128,128,4,10,u,u,u,u,u,u]
6696; AVX2-NEXT:    vpshufb %xmm5, %xmm3, %xmm6
6697; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u]
6698; AVX2-NEXT:    vpshufb %xmm4, %xmm8, %xmm11
6699; AVX2-NEXT:    vpor %xmm6, %xmm11, %xmm6
6700; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm11 = [4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14]
6701; AVX2-NEXT:    vpshufb %ymm11, %ymm13, %ymm0
6702; AVX2-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm0[5,6,7]
6703; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
6704; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
6705; AVX2-NEXT:    vpblendvb %ymm9, %ymm0, %ymm1, %ymm0
6706; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6707; AVX2-NEXT:    vextracti128 $1, %ymm12, %xmm1
6708; AVX2-NEXT:    vpshufb %xmm7, %xmm1, %xmm0
6709; AVX2-NEXT:    vpshufb %xmm2, %xmm12, %xmm2
6710; AVX2-NEXT:    vpor %xmm0, %xmm2, %xmm0
6711; AVX2-NEXT:    vextracti128 $1, %ymm10, %xmm2
6712; AVX2-NEXT:    vpshufb %xmm5, %xmm2, %xmm5
6713; AVX2-NEXT:    vpshufb %xmm4, %xmm10, %xmm4
6714; AVX2-NEXT:    vpor %xmm5, %xmm4, %xmm4
6715; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6716; AVX2-NEXT:    vpshufb %ymm11, %ymm6, %ymm5
6717; AVX2-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7]
6718; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
6719; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
6720; AVX2-NEXT:    vpblendvb %ymm9, %ymm4, %ymm0, %ymm5
6721; AVX2-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,128,128,128,5,11,128,128,128,3,9,15]
6722; AVX2-NEXT:    vpshufb %xmm0, %xmm14, %xmm4
6723; AVX2-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,3,9,15,128,128,1,7,13,128,128,128]
6724; AVX2-NEXT:    vpshufb %xmm7, %xmm15, %xmm11
6725; AVX2-NEXT:    vpor %xmm4, %xmm11, %xmm4
6726; AVX2-NEXT:    vmovdqa {{.*#+}} xmm11 = [128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u]
6727; AVX2-NEXT:    vpshufb %xmm11, %xmm3, %xmm3
6728; AVX2-NEXT:    vmovdqa {{.*#+}} xmm14 = [5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u]
6729; AVX2-NEXT:    vpshufb %xmm14, %xmm8, %xmm8
6730; AVX2-NEXT:    vpor %xmm3, %xmm8, %xmm3
6731; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15]
6732; AVX2-NEXT:    vpshufb %ymm8, %ymm13, %ymm13
6733; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm13[5,6,7]
6734; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm13[4,5,6,7]
6735; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
6736; AVX2-NEXT:    vpblendvb %ymm9, %ymm3, %ymm4, %ymm4
6737; AVX2-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
6738; AVX2-NEXT:    vpshufb %xmm7, %xmm12, %xmm1
6739; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
6740; AVX2-NEXT:    vpshufb %xmm11, %xmm2, %xmm1
6741; AVX2-NEXT:    vpshufb %xmm14, %xmm10, %xmm2
6742; AVX2-NEXT:    vpor %xmm1, %xmm2, %xmm1
6743; AVX2-NEXT:    vpshufb %ymm8, %ymm6, %ymm2
6744; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
6745; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
6746; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
6747; AVX2-NEXT:    vpblendvb %ymm9, %ymm1, %ymm0, %ymm3
6748; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
6749; AVX2-NEXT:    vextracti128 $1, %ymm9, %xmm0
6750; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10]
6751; AVX2-NEXT:    vpshufb %xmm1, %xmm0, %xmm2
6752; AVX2-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,4,10,128,128,128,2,8,14,128,128]
6753; AVX2-NEXT:    vpshufb %xmm7, %xmm9, %xmm8
6754; AVX2-NEXT:    vmovdqa %ymm9, %ymm10
6755; AVX2-NEXT:    vpor %xmm2, %xmm8, %xmm2
6756; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
6757; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
6758; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15]
6759; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
6760; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
6761; AVX2-NEXT:    vextracti128 $1, %ymm9, %xmm8
6762; AVX2-NEXT:    vpshufb %xmm1, %xmm8, %xmm1
6763; AVX2-NEXT:    vpshufb %xmm7, %xmm9, %xmm7
6764; AVX2-NEXT:    vmovdqa %ymm9, %ymm11
6765; AVX2-NEXT:    vpor %xmm1, %xmm7, %xmm1
6766; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
6767; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
6768; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3,4,5,6,7],ymm7[8,9,10],ymm1[11,12,13,14,15]
6769; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
6770; AVX2-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,128,128,1,7,13,128,128,128,5,11]
6771; AVX2-NEXT:    vpshufb %xmm7, %xmm0, %xmm0
6772; AVX2-NEXT:    vmovdqa {{.*#+}} xmm9 = [u,u,u,u,u,u,5,11,128,128,128,3,9,15,128,128]
6773; AVX2-NEXT:    vpshufb %xmm9, %xmm10, %xmm10
6774; AVX2-NEXT:    vpor %xmm0, %xmm10, %xmm0
6775; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
6776; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
6777; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15]
6778; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
6779; AVX2-NEXT:    vpshufb %xmm7, %xmm8, %xmm7
6780; AVX2-NEXT:    vpshufb %xmm9, %xmm11, %xmm8
6781; AVX2-NEXT:    vpor %xmm7, %xmm8, %xmm7
6782; AVX2-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
6783; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
6784; AVX2-NEXT:    vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7],ymm8[8,9,10],ymm7[11,12,13,14,15]
6785; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
6786; AVX2-NEXT:    vmovdqa %ymm1, 32(%rsi)
6787; AVX2-NEXT:    vmovdqa %ymm2, (%rsi)
6788; AVX2-NEXT:    vmovdqa %ymm7, 32(%rdx)
6789; AVX2-NEXT:    vmovdqa %ymm0, (%rdx)
6790; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6791; AVX2-NEXT:    vmovaps %ymm0, 32(%rcx)
6792; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6793; AVX2-NEXT:    vmovaps %ymm0, (%rcx)
6794; AVX2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
6795; AVX2-NEXT:    vmovaps %ymm0, 32(%r8)
6796; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6797; AVX2-NEXT:    vmovaps %ymm0, (%r8)
6798; AVX2-NEXT:    vmovdqa %ymm5, 32(%r9)
6799; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6800; AVX2-NEXT:    vmovaps %ymm0, (%r9)
6801; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
6802; AVX2-NEXT:    vmovdqa %ymm3, 32(%rax)
6803; AVX2-NEXT:    vmovdqa %ymm4, (%rax)
6804; AVX2-NEXT:    addq $328, %rsp # imm = 0x148
6805; AVX2-NEXT:    vzeroupper
6806; AVX2-NEXT:    retq
6807;
6808; AVX2-FP-LABEL: load_i8_stride6_vf64:
6809; AVX2-FP:       # %bb.0:
6810; AVX2-FP-NEXT:    subq $328, %rsp # imm = 0x148
6811; AVX2-FP-NEXT:    vmovdqa 192(%rdi), %ymm7
6812; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm3
6813; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm5
6814; AVX2-FP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6815; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %ymm0
6816; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm1
6817; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm13 = [0,65535,0,0,65535,0,0,65535,65535,0,0,65535,0,0,65535,0]
6818; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[0,1],ymm1[0,1]
6819; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6820; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
6821; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6822; AVX2-FP-NEXT:    vpblendvb %ymm13, %ymm2, %ymm0, %ymm4
6823; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u]
6824; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
6825; AVX2-FP-NEXT:    vpblendvb %ymm1, %ymm3, %ymm5, %ymm2
6826; AVX2-FP-NEXT:    vmovdqa %ymm3, %ymm5
6827; AVX2-FP-NEXT:    vmovdqu %ymm3, (%rsp) # 32-byte Spill
6828; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
6829; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm2, %xmm9
6830; AVX2-FP-NEXT:    vextracti128 $1, %ymm2, %xmm3
6831; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm11 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u]
6832; AVX2-FP-NEXT:    vpshufb %xmm11, %xmm3, %xmm12
6833; AVX2-FP-NEXT:    vpor %xmm9, %xmm12, %xmm9
6834; AVX2-FP-NEXT:    vpmovsxdq {{.*#+}} xmm12 = [18446744073709551615,16777215]
6835; AVX2-FP-NEXT:    vpblendvb %ymm12, %ymm9, %ymm0, %ymm0
6836; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6837; AVX2-FP-NEXT:    vmovdqa 224(%rdi), %ymm8
6838; AVX2-FP-NEXT:    vpblendvb %ymm1, %ymm7, %ymm8, %ymm14
6839; AVX2-FP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6840; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm14, %xmm0
6841; AVX2-FP-NEXT:    vextracti128 $1, %ymm14, %xmm15
6842; AVX2-FP-NEXT:    vpshufb %xmm11, %xmm15, %xmm10
6843; AVX2-FP-NEXT:    vpor %xmm0, %xmm10, %xmm1
6844; AVX2-FP-NEXT:    vmovdqa 288(%rdi), %ymm11
6845; AVX2-FP-NEXT:    vmovdqa 256(%rdi), %ymm0
6846; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm0[0,1],ymm11[0,1]
6847; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm0[2,3],ymm11[2,3]
6848; AVX2-FP-NEXT:    vpblendvb %ymm13, %ymm9, %ymm11, %ymm13
6849; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u]
6850; AVX2-FP-NEXT:    vpblendvb %ymm12, %ymm1, %ymm0, %ymm0
6851; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6852; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm0 = [1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u]
6853; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm2, %xmm1
6854; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm2 = [128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u]
6855; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
6856; AVX2-FP-NEXT:    vpor %xmm1, %xmm3, %xmm1
6857; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11]
6858; AVX2-FP-NEXT:    vpshufb %ymm3, %ymm4, %ymm4
6859; AVX2-FP-NEXT:    vpblendvb %ymm12, %ymm1, %ymm4, %ymm1
6860; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6861; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm14, %xmm0
6862; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm15, %xmm1
6863; AVX2-FP-NEXT:    vpor %xmm0, %xmm1, %xmm0
6864; AVX2-FP-NEXT:    vpshufb %ymm3, %ymm13, %ymm1
6865; AVX2-FP-NEXT:    vpblendvb %ymm12, %ymm0, %ymm1, %ymm0
6866; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6867; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm13 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
6868; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
6869; AVX2-FP-NEXT:    vpblendvb %ymm13, %ymm10, %ymm5, %ymm1
6870; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm2
6871; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm6 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u]
6872; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm2, %xmm3
6873; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm4 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u]
6874; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm1, %xmm5
6875; AVX2-FP-NEXT:    vpor %xmm3, %xmm5, %xmm5
6876; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} ymm14 = [2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12]
6877; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0]
6878; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6879; AVX2-FP-NEXT:    vpblendvb %ymm0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
6880; AVX2-FP-NEXT:    vpshufb %ymm14, %ymm3, %ymm15
6881; AVX2-FP-NEXT:    vpblendvb %ymm12, %ymm5, %ymm15, %ymm5
6882; AVX2-FP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6883; AVX2-FP-NEXT:    vpblendvb %ymm13, %ymm8, %ymm7, %ymm5
6884; AVX2-FP-NEXT:    vmovdqa %ymm8, %ymm7
6885; AVX2-FP-NEXT:    vextracti128 $1, %ymm5, %xmm15
6886; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm15, %xmm6
6887; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
6888; AVX2-FP-NEXT:    vpor %xmm6, %xmm4, %xmm4
6889; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm11, %ymm9, %ymm0
6890; AVX2-FP-NEXT:    vpshufb %ymm14, %ymm0, %ymm6
6891; AVX2-FP-NEXT:    vpblendvb %ymm12, %ymm4, %ymm6, %ymm4
6892; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6893; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm4 = [128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u]
6894; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
6895; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm6 = [3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u]
6896; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
6897; AVX2-FP-NEXT:    vpor %xmm2, %xmm1, %xmm1
6898; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13]
6899; AVX2-FP-NEXT:    vpshufb %ymm2, %ymm3, %ymm3
6900; AVX2-FP-NEXT:    vpblendvb %ymm12, %ymm1, %ymm3, %ymm1
6901; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6902; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm15, %xmm1
6903; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm5, %xmm3
6904; AVX2-FP-NEXT:    vpor %xmm1, %xmm3, %xmm1
6905; AVX2-FP-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
6906; AVX2-FP-NEXT:    vpblendvb %ymm12, %ymm1, %ymm0, %ymm14
6907; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %ymm0
6908; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %ymm3
6909; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0]
6910; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm3, %ymm1
6911; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6912; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
6913; AVX2-FP-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm5
6914; AVX2-FP-NEXT:    vpblendvb %ymm13, %ymm0, %ymm3, %ymm15
6915; AVX2-FP-NEXT:    vmovdqa 352(%rdi), %ymm4
6916; AVX2-FP-NEXT:    vmovdqa 320(%rdi), %ymm6
6917; AVX2-FP-NEXT:    vpblendvb %ymm1, %ymm6, %ymm4, %ymm1
6918; AVX2-FP-NEXT:    vpblendvb %ymm13, %ymm4, %ymm6, %ymm12
6919; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm4, %ymm6, %ymm0
6920; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6921; AVX2-FP-NEXT:    vmovdqu (%rsp), %ymm0 # 32-byte Reload
6922; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm10, %ymm0, %ymm8
6923; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6924; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm7, %ymm0, %ymm10
6925; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,65535,65535,0,0,65535,0,0,65535,0]
6926; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6927; AVX2-FP-NEXT:    vpblendvb %ymm2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
6928; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm11, %ymm9, %ymm0
6929; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6930; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128]
6931; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm5, %xmm6
6932; AVX2-FP-NEXT:    vextracti128 $1, %ymm5, %xmm11
6933; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,128,128,128,2,8,14,128,128,0,6,12]
6934; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm11, %xmm9
6935; AVX2-FP-NEXT:    vpor %xmm6, %xmm9, %xmm6
6936; AVX2-FP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
6937; AVX2-FP-NEXT:    vpmovsxwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9
6938; AVX2-FP-NEXT:    vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload
6939; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6940; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm1, %xmm2
6941; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm6
6942; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
6943; AVX2-FP-NEXT:    vpor %xmm2, %xmm0, %xmm0
6944; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
6945; AVX2-FP-NEXT:    vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
6946; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6947; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,1,7,13,128,128,128,5,11,128,128,128]
6948; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm5, %xmm2
6949; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,128,128,128,3,9,15,128,128,1,7,13]
6950; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm11, %xmm11
6951; AVX2-FP-NEXT:    vpor %xmm2, %xmm11, %xmm2
6952; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
6953; AVX2-FP-NEXT:    vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
6954; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6955; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
6956; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm6, %xmm1
6957; AVX2-FP-NEXT:    vpor %xmm0, %xmm1, %xmm0
6958; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
6959; AVX2-FP-NEXT:    vpblendvb %ymm9, %ymm14, %ymm0, %ymm0
6960; AVX2-FP-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
6961; AVX2-FP-NEXT:    vextracti128 $1, %ymm15, %xmm14
6962; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14]
6963; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm14, %xmm0
6964; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128]
6965; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm15, %xmm1
6966; AVX2-FP-NEXT:    vpor %xmm0, %xmm1, %xmm1
6967; AVX2-FP-NEXT:    vextracti128 $1, %ymm8, %xmm3
6968; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm5 = [128,128,0,6,12,128,128,128,4,10,u,u,u,u,u,u]
6969; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm3, %xmm6
6970; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u]
6971; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm8, %xmm11
6972; AVX2-FP-NEXT:    vpor %xmm6, %xmm11, %xmm6
6973; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} ymm11 = [4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14]
6974; AVX2-FP-NEXT:    vpshufb %ymm11, %ymm13, %ymm0
6975; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm0[5,6,7]
6976; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
6977; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
6978; AVX2-FP-NEXT:    vpblendvb %ymm9, %ymm0, %ymm1, %ymm0
6979; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6980; AVX2-FP-NEXT:    vextracti128 $1, %ymm12, %xmm1
6981; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm1, %xmm0
6982; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm12, %xmm2
6983; AVX2-FP-NEXT:    vpor %xmm0, %xmm2, %xmm0
6984; AVX2-FP-NEXT:    vextracti128 $1, %ymm10, %xmm2
6985; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm2, %xmm5
6986; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm10, %xmm4
6987; AVX2-FP-NEXT:    vpor %xmm5, %xmm4, %xmm4
6988; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6989; AVX2-FP-NEXT:    vpshufb %ymm11, %ymm6, %ymm5
6990; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7]
6991; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
6992; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
6993; AVX2-FP-NEXT:    vpblendvb %ymm9, %ymm4, %ymm0, %ymm5
6994; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,128,128,128,5,11,128,128,128,3,9,15]
6995; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm14, %xmm4
6996; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,3,9,15,128,128,1,7,13,128,128,128]
6997; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm15, %xmm11
6998; AVX2-FP-NEXT:    vpor %xmm4, %xmm11, %xmm4
6999; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm11 = [128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u]
7000; AVX2-FP-NEXT:    vpshufb %xmm11, %xmm3, %xmm3
7001; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm14 = [5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u]
7002; AVX2-FP-NEXT:    vpshufb %xmm14, %xmm8, %xmm8
7003; AVX2-FP-NEXT:    vpor %xmm3, %xmm8, %xmm3
7004; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15]
7005; AVX2-FP-NEXT:    vpshufb %ymm8, %ymm13, %ymm13
7006; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm13[5,6,7]
7007; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm13[4,5,6,7]
7008; AVX2-FP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
7009; AVX2-FP-NEXT:    vpblendvb %ymm9, %ymm3, %ymm4, %ymm4
7010; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
7011; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm12, %xmm1
7012; AVX2-FP-NEXT:    vpor %xmm0, %xmm1, %xmm0
7013; AVX2-FP-NEXT:    vpshufb %xmm11, %xmm2, %xmm1
7014; AVX2-FP-NEXT:    vpshufb %xmm14, %xmm10, %xmm2
7015; AVX2-FP-NEXT:    vpor %xmm1, %xmm2, %xmm1
7016; AVX2-FP-NEXT:    vpshufb %ymm8, %ymm6, %ymm2
7017; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
7018; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
7019; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
7020; AVX2-FP-NEXT:    vpblendvb %ymm9, %ymm1, %ymm0, %ymm3
7021; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
7022; AVX2-FP-NEXT:    vextracti128 $1, %ymm9, %xmm0
7023; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10]
7024; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm0, %xmm2
7025; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,4,10,128,128,128,2,8,14,128,128]
7026; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm9, %xmm8
7027; AVX2-FP-NEXT:    vmovdqa %ymm9, %ymm10
7028; AVX2-FP-NEXT:    vpor %xmm2, %xmm8, %xmm2
7029; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
7030; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
7031; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15]
7032; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
7033; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
7034; AVX2-FP-NEXT:    vextracti128 $1, %ymm9, %xmm8
7035; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm8, %xmm1
7036; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm9, %xmm7
7037; AVX2-FP-NEXT:    vmovdqa %ymm9, %ymm11
7038; AVX2-FP-NEXT:    vpor %xmm1, %xmm7, %xmm1
7039; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
7040; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
7041; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3,4,5,6,7],ymm7[8,9,10],ymm1[11,12,13,14,15]
7042; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
7043; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,128,128,1,7,13,128,128,128,5,11]
7044; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm0, %xmm0
7045; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm9 = [u,u,u,u,u,u,5,11,128,128,128,3,9,15,128,128]
7046; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm10, %xmm10
7047; AVX2-FP-NEXT:    vpor %xmm0, %xmm10, %xmm0
7048; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
7049; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
7050; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15]
7051; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
7052; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm8, %xmm7
7053; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm11, %xmm8
7054; AVX2-FP-NEXT:    vpor %xmm7, %xmm8, %xmm7
7055; AVX2-FP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
7056; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
7057; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7],ymm8[8,9,10],ymm7[11,12,13,14,15]
7058; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
7059; AVX2-FP-NEXT:    vmovdqa %ymm1, 32(%rsi)
7060; AVX2-FP-NEXT:    vmovdqa %ymm2, (%rsi)
7061; AVX2-FP-NEXT:    vmovdqa %ymm7, 32(%rdx)
7062; AVX2-FP-NEXT:    vmovdqa %ymm0, (%rdx)
7063; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7064; AVX2-FP-NEXT:    vmovaps %ymm0, 32(%rcx)
7065; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7066; AVX2-FP-NEXT:    vmovaps %ymm0, (%rcx)
7067; AVX2-FP-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
7068; AVX2-FP-NEXT:    vmovaps %ymm0, 32(%r8)
7069; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7070; AVX2-FP-NEXT:    vmovaps %ymm0, (%r8)
7071; AVX2-FP-NEXT:    vmovdqa %ymm5, 32(%r9)
7072; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7073; AVX2-FP-NEXT:    vmovaps %ymm0, (%r9)
7074; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
7075; AVX2-FP-NEXT:    vmovdqa %ymm3, 32(%rax)
7076; AVX2-FP-NEXT:    vmovdqa %ymm4, (%rax)
7077; AVX2-FP-NEXT:    addq $328, %rsp # imm = 0x148
7078; AVX2-FP-NEXT:    vzeroupper
7079; AVX2-FP-NEXT:    retq
7080;
7081; AVX2-FCP-LABEL: load_i8_stride6_vf64:
7082; AVX2-FCP:       # %bb.0:
7083; AVX2-FCP-NEXT:    subq $328, %rsp # imm = 0x148
7084; AVX2-FCP-NEXT:    vmovdqa 192(%rdi), %ymm7
7085; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm3
7086; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm5
7087; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7088; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %ymm0
7089; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm1
7090; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm13 = [0,65535,0,0,65535,0,0,65535,65535,0,0,65535,0,0,65535,0]
7091; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[0,1],ymm1[0,1]
7092; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7093; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
7094; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7095; AVX2-FCP-NEXT:    vpblendvb %ymm13, %ymm2, %ymm0, %ymm4
7096; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u]
7097; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
7098; AVX2-FCP-NEXT:    vpblendvb %ymm1, %ymm3, %ymm5, %ymm2
7099; AVX2-FCP-NEXT:    vmovdqa %ymm3, %ymm5
7100; AVX2-FCP-NEXT:    vmovdqu %ymm3, (%rsp) # 32-byte Spill
7101; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
7102; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm2, %xmm9
7103; AVX2-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
7104; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u]
7105; AVX2-FCP-NEXT:    vpshufb %xmm11, %xmm3, %xmm12
7106; AVX2-FCP-NEXT:    vpor %xmm9, %xmm12, %xmm9
7107; AVX2-FCP-NEXT:    vpmovsxdq {{.*#+}} xmm12 = [18446744073709551615,16777215]
7108; AVX2-FCP-NEXT:    vpblendvb %ymm12, %ymm9, %ymm0, %ymm0
7109; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7110; AVX2-FCP-NEXT:    vmovdqa 224(%rdi), %ymm8
7111; AVX2-FCP-NEXT:    vpblendvb %ymm1, %ymm7, %ymm8, %ymm14
7112; AVX2-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7113; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm14, %xmm0
7114; AVX2-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm15
7115; AVX2-FCP-NEXT:    vpshufb %xmm11, %xmm15, %xmm10
7116; AVX2-FCP-NEXT:    vpor %xmm0, %xmm10, %xmm1
7117; AVX2-FCP-NEXT:    vmovdqa 288(%rdi), %ymm11
7118; AVX2-FCP-NEXT:    vmovdqa 256(%rdi), %ymm0
7119; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm0[0,1],ymm11[0,1]
7120; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm11 = ymm0[2,3],ymm11[2,3]
7121; AVX2-FCP-NEXT:    vpblendvb %ymm13, %ymm9, %ymm11, %ymm13
7122; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u]
7123; AVX2-FCP-NEXT:    vpblendvb %ymm12, %ymm1, %ymm0, %ymm0
7124; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7125; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u]
7126; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm2, %xmm1
7127; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u]
7128; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
7129; AVX2-FCP-NEXT:    vpor %xmm1, %xmm3, %xmm1
7130; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11]
7131; AVX2-FCP-NEXT:    vpshufb %ymm3, %ymm4, %ymm4
7132; AVX2-FCP-NEXT:    vpblendvb %ymm12, %ymm1, %ymm4, %ymm1
7133; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7134; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm14, %xmm0
7135; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm15, %xmm1
7136; AVX2-FCP-NEXT:    vpor %xmm0, %xmm1, %xmm0
7137; AVX2-FCP-NEXT:    vpshufb %ymm3, %ymm13, %ymm1
7138; AVX2-FCP-NEXT:    vpblendvb %ymm12, %ymm0, %ymm1, %ymm0
7139; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7140; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm13 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
7141; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
7142; AVX2-FCP-NEXT:    vpblendvb %ymm13, %ymm10, %ymm5, %ymm1
7143; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm2
7144; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u]
7145; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm2, %xmm3
7146; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u]
7147; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm1, %xmm5
7148; AVX2-FCP-NEXT:    vpor %xmm3, %xmm5, %xmm5
7149; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm14 = [2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12]
7150; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0]
7151; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7152; AVX2-FCP-NEXT:    vpblendvb %ymm0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
7153; AVX2-FCP-NEXT:    vpshufb %ymm14, %ymm3, %ymm15
7154; AVX2-FCP-NEXT:    vpblendvb %ymm12, %ymm5, %ymm15, %ymm5
7155; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7156; AVX2-FCP-NEXT:    vpblendvb %ymm13, %ymm8, %ymm7, %ymm5
7157; AVX2-FCP-NEXT:    vmovdqa %ymm8, %ymm7
7158; AVX2-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm15
7159; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm15, %xmm6
7160; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
7161; AVX2-FCP-NEXT:    vpor %xmm6, %xmm4, %xmm4
7162; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm11, %ymm9, %ymm0
7163; AVX2-FCP-NEXT:    vpshufb %ymm14, %ymm0, %ymm6
7164; AVX2-FCP-NEXT:    vpblendvb %ymm12, %ymm4, %ymm6, %ymm4
7165; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7166; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u]
7167; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
7168; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u]
7169; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
7170; AVX2-FCP-NEXT:    vpor %xmm2, %xmm1, %xmm1
7171; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13]
7172; AVX2-FCP-NEXT:    vpshufb %ymm2, %ymm3, %ymm3
7173; AVX2-FCP-NEXT:    vpblendvb %ymm12, %ymm1, %ymm3, %ymm1
7174; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7175; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm15, %xmm1
7176; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm5, %xmm3
7177; AVX2-FCP-NEXT:    vpor %xmm1, %xmm3, %xmm1
7178; AVX2-FCP-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
7179; AVX2-FCP-NEXT:    vpblendvb %ymm12, %ymm1, %ymm0, %ymm14
7180; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %ymm0
7181; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %ymm3
7182; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0]
7183; AVX2-FCP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm3, %ymm1
7184; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7185; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
7186; AVX2-FCP-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm5
7187; AVX2-FCP-NEXT:    vpblendvb %ymm13, %ymm0, %ymm3, %ymm15
7188; AVX2-FCP-NEXT:    vmovdqa 352(%rdi), %ymm4
7189; AVX2-FCP-NEXT:    vmovdqa 320(%rdi), %ymm6
7190; AVX2-FCP-NEXT:    vpblendvb %ymm1, %ymm6, %ymm4, %ymm1
7191; AVX2-FCP-NEXT:    vpblendvb %ymm13, %ymm4, %ymm6, %ymm12
7192; AVX2-FCP-NEXT:    vpblendvb %ymm2, %ymm4, %ymm6, %ymm0
7193; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7194; AVX2-FCP-NEXT:    vmovdqu (%rsp), %ymm0 # 32-byte Reload
7195; AVX2-FCP-NEXT:    vpblendvb %ymm2, %ymm10, %ymm0, %ymm8
7196; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7197; AVX2-FCP-NEXT:    vpblendvb %ymm2, %ymm7, %ymm0, %ymm10
7198; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,65535,65535,0,0,65535,0,0,65535,0]
7199; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7200; AVX2-FCP-NEXT:    vpblendvb %ymm2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
7201; AVX2-FCP-NEXT:    vpblendvb %ymm2, %ymm11, %ymm9, %ymm0
7202; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7203; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128]
7204; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm5, %xmm6
7205; AVX2-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm11
7206; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,128,128,128,2,8,14,128,128,0,6,12]
7207; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm11, %xmm9
7208; AVX2-FCP-NEXT:    vpor %xmm6, %xmm9, %xmm6
7209; AVX2-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
7210; AVX2-FCP-NEXT:    vpmovsxwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9
7211; AVX2-FCP-NEXT:    vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload
7212; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7213; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm1, %xmm2
7214; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm6
7215; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
7216; AVX2-FCP-NEXT:    vpor %xmm2, %xmm0, %xmm0
7217; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
7218; AVX2-FCP-NEXT:    vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
7219; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7220; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,1,7,13,128,128,128,5,11,128,128,128]
7221; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm5, %xmm2
7222; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,128,128,128,3,9,15,128,128,1,7,13]
7223; AVX2-FCP-NEXT:    vpshufb %xmm5, %xmm11, %xmm11
7224; AVX2-FCP-NEXT:    vpor %xmm2, %xmm11, %xmm2
7225; AVX2-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
7226; AVX2-FCP-NEXT:    vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
7227; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7228; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
7229; AVX2-FCP-NEXT:    vpshufb %xmm5, %xmm6, %xmm1
7230; AVX2-FCP-NEXT:    vpor %xmm0, %xmm1, %xmm0
7231; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
7232; AVX2-FCP-NEXT:    vpblendvb %ymm9, %ymm14, %ymm0, %ymm0
7233; AVX2-FCP-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
7234; AVX2-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm14
7235; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14]
7236; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm14, %xmm0
7237; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128]
7238; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm15, %xmm1
7239; AVX2-FCP-NEXT:    vpor %xmm0, %xmm1, %xmm1
7240; AVX2-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm3
7241; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [128,128,0,6,12,128,128,128,4,10,u,u,u,u,u,u]
7242; AVX2-FCP-NEXT:    vpshufb %xmm5, %xmm3, %xmm6
7243; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u]
7244; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm8, %xmm11
7245; AVX2-FCP-NEXT:    vpor %xmm6, %xmm11, %xmm6
7246; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm11 = [4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14]
7247; AVX2-FCP-NEXT:    vpshufb %ymm11, %ymm13, %ymm0
7248; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm0[5,6,7]
7249; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
7250; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
7251; AVX2-FCP-NEXT:    vpblendvb %ymm9, %ymm0, %ymm1, %ymm0
7252; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7253; AVX2-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm1
7254; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm1, %xmm0
7255; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm12, %xmm2
7256; AVX2-FCP-NEXT:    vpor %xmm0, %xmm2, %xmm0
7257; AVX2-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm2
7258; AVX2-FCP-NEXT:    vpshufb %xmm5, %xmm2, %xmm5
7259; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm10, %xmm4
7260; AVX2-FCP-NEXT:    vpor %xmm5, %xmm4, %xmm4
7261; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
7262; AVX2-FCP-NEXT:    vpshufb %ymm11, %ymm6, %ymm5
7263; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7]
7264; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
7265; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
7266; AVX2-FCP-NEXT:    vpblendvb %ymm9, %ymm4, %ymm0, %ymm5
7267; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,128,128,128,5,11,128,128,128,3,9,15]
7268; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm14, %xmm4
7269; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,3,9,15,128,128,1,7,13,128,128,128]
7270; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm15, %xmm11
7271; AVX2-FCP-NEXT:    vpor %xmm4, %xmm11, %xmm4
7272; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u]
7273; AVX2-FCP-NEXT:    vpshufb %xmm11, %xmm3, %xmm3
7274; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm14 = [5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u]
7275; AVX2-FCP-NEXT:    vpshufb %xmm14, %xmm8, %xmm8
7276; AVX2-FCP-NEXT:    vpor %xmm3, %xmm8, %xmm3
7277; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15]
7278; AVX2-FCP-NEXT:    vpshufb %ymm8, %ymm13, %ymm13
7279; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm13[5,6,7]
7280; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm13[4,5,6,7]
7281; AVX2-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
7282; AVX2-FCP-NEXT:    vpblendvb %ymm9, %ymm3, %ymm4, %ymm4
7283; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
7284; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm12, %xmm1
7285; AVX2-FCP-NEXT:    vpor %xmm0, %xmm1, %xmm0
7286; AVX2-FCP-NEXT:    vpshufb %xmm11, %xmm2, %xmm1
7287; AVX2-FCP-NEXT:    vpshufb %xmm14, %xmm10, %xmm2
7288; AVX2-FCP-NEXT:    vpor %xmm1, %xmm2, %xmm1
7289; AVX2-FCP-NEXT:    vpshufb %ymm8, %ymm6, %ymm2
7290; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
7291; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
7292; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
7293; AVX2-FCP-NEXT:    vpblendvb %ymm9, %ymm1, %ymm0, %ymm3
7294; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
7295; AVX2-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm0
7296; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10]
7297; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm2
7298; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,4,10,128,128,128,2,8,14,128,128]
7299; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm9, %xmm8
7300; AVX2-FCP-NEXT:    vmovdqa %ymm9, %ymm10
7301; AVX2-FCP-NEXT:    vpor %xmm2, %xmm8, %xmm2
7302; AVX2-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
7303; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
7304; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15]
7305; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
7306; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
7307; AVX2-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm8
7308; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm8, %xmm1
7309; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm9, %xmm7
7310; AVX2-FCP-NEXT:    vmovdqa %ymm9, %ymm11
7311; AVX2-FCP-NEXT:    vpor %xmm1, %xmm7, %xmm1
7312; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
7313; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
7314; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3,4,5,6,7],ymm7[8,9,10],ymm1[11,12,13,14,15]
7315; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
7316; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,128,128,1,7,13,128,128,128,5,11]
7317; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm0, %xmm0
7318; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [u,u,u,u,u,u,5,11,128,128,128,3,9,15,128,128]
7319; AVX2-FCP-NEXT:    vpshufb %xmm9, %xmm10, %xmm10
7320; AVX2-FCP-NEXT:    vpor %xmm0, %xmm10, %xmm0
7321; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
7322; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
7323; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15]
7324; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
7325; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm8, %xmm7
7326; AVX2-FCP-NEXT:    vpshufb %xmm9, %xmm11, %xmm8
7327; AVX2-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm7
7328; AVX2-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
7329; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
7330; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7],ymm8[8,9,10],ymm7[11,12,13,14,15]
7331; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
7332; AVX2-FCP-NEXT:    vmovdqa %ymm1, 32(%rsi)
7333; AVX2-FCP-NEXT:    vmovdqa %ymm2, (%rsi)
7334; AVX2-FCP-NEXT:    vmovdqa %ymm7, 32(%rdx)
7335; AVX2-FCP-NEXT:    vmovdqa %ymm0, (%rdx)
7336; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7337; AVX2-FCP-NEXT:    vmovaps %ymm0, 32(%rcx)
7338; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7339; AVX2-FCP-NEXT:    vmovaps %ymm0, (%rcx)
7340; AVX2-FCP-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
7341; AVX2-FCP-NEXT:    vmovaps %ymm0, 32(%r8)
7342; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7343; AVX2-FCP-NEXT:    vmovaps %ymm0, (%r8)
7344; AVX2-FCP-NEXT:    vmovdqa %ymm5, 32(%r9)
7345; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7346; AVX2-FCP-NEXT:    vmovaps %ymm0, (%r9)
7347; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
7348; AVX2-FCP-NEXT:    vmovdqa %ymm3, 32(%rax)
7349; AVX2-FCP-NEXT:    vmovdqa %ymm4, (%rax)
7350; AVX2-FCP-NEXT:    addq $328, %rsp # imm = 0x148
7351; AVX2-FCP-NEXT:    vzeroupper
7352; AVX2-FCP-NEXT:    retq
7353;
7354; AVX512-LABEL: load_i8_stride6_vf64:
7355; AVX512:       # %bb.0:
7356; AVX512-NEXT:    subq $40, %rsp
7357; AVX512-NEXT:    vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
7358; AVX512-NEXT:    vmovdqa64 224(%rdi), %ymm25
7359; AVX512-NEXT:    vmovdqa64 192(%rdi), %ymm26
7360; AVX512-NEXT:    vmovdqa %ymm12, %ymm0
7361; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm25 ^ (ymm0 & (ymm26 ^ ymm25))
7362; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
7363; AVX512-NEXT:    vpshufb %xmm1, %xmm0, %xmm3
7364; AVX512-NEXT:    vmovdqa {{.*#+}} xmm5 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u]
7365; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm4
7366; AVX512-NEXT:    vpshufb %xmm5, %xmm4, %xmm6
7367; AVX512-NEXT:    vpor %xmm3, %xmm6, %xmm9
7368; AVX512-NEXT:    vmovdqa64 (%rdi), %ymm30
7369; AVX512-NEXT:    vmovdqa64 32(%rdi), %ymm31
7370; AVX512-NEXT:    vmovdqa64 128(%rdi), %ymm24
7371; AVX512-NEXT:    vmovdqa64 160(%rdi), %ymm18
7372; AVX512-NEXT:    vmovdqa %ymm12, %ymm6
7373; AVX512-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm24 ^ (ymm6 & (ymm18 ^ ymm24))
7374; AVX512-NEXT:    vextracti128 $1, %ymm6, %xmm7
7375; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10]
7376; AVX512-NEXT:    vpshufb %xmm3, %xmm7, %xmm10
7377; AVX512-NEXT:    vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,u,4,10,128,128,128,2,8,14,128,128]
7378; AVX512-NEXT:    vpshufb %xmm8, %xmm6, %xmm13
7379; AVX512-NEXT:    vpor %xmm10, %xmm13, %xmm10
7380; AVX512-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
7381; AVX512-NEXT:    vinserti32x4 $2, %xmm9, %zmm10, %zmm2
7382; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7383; AVX512-NEXT:    vmovdqa %ymm12, %ymm9
7384; AVX512-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm31 ^ (ymm9 & (ymm30 ^ ymm31))
7385; AVX512-NEXT:    vpshufb %xmm1, %xmm9, %xmm1
7386; AVX512-NEXT:    vextracti128 $1, %ymm9, %xmm13
7387; AVX512-NEXT:    vpshufb %xmm5, %xmm13, %xmm5
7388; AVX512-NEXT:    vporq %xmm1, %xmm5, %xmm17
7389; AVX512-NEXT:    vmovdqa64 320(%rdi), %ymm29
7390; AVX512-NEXT:    vmovdqa64 352(%rdi), %ymm22
7391; AVX512-NEXT:    vmovdqa %ymm12, %ymm1
7392; AVX512-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm29 ^ (ymm1 & (ymm22 ^ ymm29))
7393; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm5
7394; AVX512-NEXT:    vpshufb %xmm3, %xmm5, %xmm3
7395; AVX512-NEXT:    vpshufb %xmm8, %xmm1, %xmm8
7396; AVX512-NEXT:    vpor %xmm3, %xmm8, %xmm3
7397; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
7398; AVX512-NEXT:    vmovdqa {{.*#+}} xmm8 = [1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u]
7399; AVX512-NEXT:    vpshufb %xmm8, %xmm0, %xmm0
7400; AVX512-NEXT:    vmovdqa {{.*#+}} xmm10 = [128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u]
7401; AVX512-NEXT:    vpshufb %xmm10, %xmm4, %xmm4
7402; AVX512-NEXT:    vpor %xmm0, %xmm4, %xmm0
7403; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7404; AVX512-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,u,128,128,1,7,13,128,128,128,5,11]
7405; AVX512-NEXT:    vpshufb %xmm0, %xmm7, %xmm4
7406; AVX512-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,5,11,128,128,128,3,9,15,128,128]
7407; AVX512-NEXT:    vpshufb %xmm7, %xmm6, %xmm6
7408; AVX512-NEXT:    vporq %xmm4, %xmm6, %xmm28
7409; AVX512-NEXT:    vpshufb %xmm8, %xmm9, %xmm4
7410; AVX512-NEXT:    vpshufb %xmm10, %xmm13, %xmm6
7411; AVX512-NEXT:    vporq %xmm4, %xmm6, %xmm21
7412; AVX512-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
7413; AVX512-NEXT:    vpshufb %xmm7, %xmm1, %xmm1
7414; AVX512-NEXT:    vporq %xmm0, %xmm1, %xmm27
7415; AVX512-NEXT:    vmovdqa {{.*#+}} xmm0 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u]
7416; AVX512-NEXT:    vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
7417; AVX512-NEXT:    vmovdqa %ymm9, %ymm4
7418; AVX512-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm26 ^ (ymm4 & (ymm25 ^ ymm26))
7419; AVX512-NEXT:    vextracti128 $1, %ymm4, %xmm15
7420; AVX512-NEXT:    vpshufb %xmm0, %xmm15, %xmm1
7421; AVX512-NEXT:    vmovdqa {{.*#+}} xmm6 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u]
7422; AVX512-NEXT:    vpshufb %xmm6, %xmm4, %xmm5
7423; AVX512-NEXT:    vpor %xmm1, %xmm5, %xmm1
7424; AVX512-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7425; AVX512-NEXT:    vmovdqa %ymm12, %ymm5
7426; AVX512-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm18 ^ (ymm5 & (ymm24 ^ ymm18))
7427; AVX512-NEXT:    vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128]
7428; AVX512-NEXT:    vpshufb %xmm8, %xmm5, %xmm7
7429; AVX512-NEXT:    vextracti128 $1, %ymm5, %xmm1
7430; AVX512-NEXT:    vmovdqa {{.*#+}} xmm10 = [u,u,u,u,u,128,128,128,2,8,14,128,128,0,6,12]
7431; AVX512-NEXT:    vpshufb %xmm10, %xmm1, %xmm13
7432; AVX512-NEXT:    vpor %xmm7, %xmm13, %xmm2
7433; AVX512-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7434; AVX512-NEXT:    vmovdqa %ymm9, %ymm13
7435; AVX512-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm30 ^ (ymm13 & (ymm31 ^ ymm30))
7436; AVX512-NEXT:    vextracti128 $1, %ymm13, %xmm14
7437; AVX512-NEXT:    vpshufb %xmm0, %xmm14, %xmm0
7438; AVX512-NEXT:    vpshufb %xmm6, %xmm13, %xmm6
7439; AVX512-NEXT:    vporq %xmm0, %xmm6, %xmm16
7440; AVX512-NEXT:    vmovdqa %ymm12, %ymm11
7441; AVX512-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm22 ^ (ymm11 & (ymm29 ^ ymm22))
7442; AVX512-NEXT:    vpshufb %xmm8, %xmm11, %xmm8
7443; AVX512-NEXT:    vextracti128 $1, %ymm11, %xmm7
7444; AVX512-NEXT:    vpshufb %xmm10, %xmm7, %xmm10
7445; AVX512-NEXT:    vpor %xmm8, %xmm10, %xmm0
7446; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7447; AVX512-NEXT:    vmovdqa {{.*#+}} xmm10 = [128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u]
7448; AVX512-NEXT:    vpshufb %xmm10, %xmm15, %xmm15
7449; AVX512-NEXT:    vmovdqa {{.*#+}} xmm8 = [3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u]
7450; AVX512-NEXT:    vpshufb %xmm8, %xmm4, %xmm4
7451; AVX512-NEXT:    vpor %xmm4, %xmm15, %xmm0
7452; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7453; AVX512-NEXT:    vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero
7454; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,zero,xmm1[3,9,15],zero,zero,xmm1[1,7,13]
7455; AVX512-NEXT:    vpor %xmm1, %xmm15, %xmm0
7456; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7457; AVX512-NEXT:    vmovdqa 256(%rdi), %ymm1
7458; AVX512-NEXT:    vshufi64x2 {{.*#+}} ymm19 = ymm1[2,3],mem[2,3]
7459; AVX512-NEXT:    vinserti32x4 $1, 288(%rdi), %ymm1, %ymm20
7460; AVX512-NEXT:    vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0]
7461; AVX512-NEXT:    vmovdqa %ymm5, %ymm1
7462; AVX512-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm19 ^ (ymm1 & (ymm20 ^ ymm19))
7463; AVX512-NEXT:    vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u]
7464; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm15[0,1,2],ymm3[3,4,5,6,7],ymm15[8,9,10],ymm3[11,12,13,14,15]
7465; AVX512-NEXT:    vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm3[4,5,6,7]
7466; AVX512-NEXT:    vmovdqa64 64(%rdi), %ymm23
7467; AVX512-NEXT:    vshufi64x2 {{.*#+}} ymm6 = ymm23[2,3],mem[2,3]
7468; AVX512-NEXT:    vinserti32x4 $1, 96(%rdi), %ymm23, %ymm23
7469; AVX512-NEXT:    vinserti64x4 $1, %ymm15, %zmm0, %zmm15
7470; AVX512-NEXT:    vmovdqa %ymm5, %ymm2
7471; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm6 ^ (ymm2 & (ymm23 ^ ymm6))
7472; AVX512-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255]
7473; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7474; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm17 & ymm4)
7475; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
7476; AVX512-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload
7477; AVX512-NEXT:    # zmm0 = mem ^ (zmm17 & (zmm0 ^ mem))
7478; AVX512-NEXT:    vpmovsxdq {{.*#+}} zmm3 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,16777215,0,0]
7479; AVX512-NEXT:    vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm3 & (zmm15 ^ zmm0))
7480; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u]
7481; AVX512-NEXT:    vinserti32x4 $1, %xmm27, %ymm0, %ymm1
7482; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
7483; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
7484; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7485; AVX512-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm21 & ymm4)
7486; AVX512-NEXT:    vinserti32x4 $1, %xmm28, %ymm0, %ymm2
7487; AVX512-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload
7488; AVX512-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm17 & (zmm1 ^ zmm2))
7489; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm17
7490; AVX512-NEXT:    vpternlogq {{.*#+}} zmm17 = zmm17 ^ (zmm3 & (zmm17 ^ zmm1))
7491; AVX512-NEXT:    vpshufb %xmm10, %xmm14, %xmm0
7492; AVX512-NEXT:    vpshufb %xmm8, %xmm13, %xmm1
7493; AVX512-NEXT:    vporq %xmm0, %xmm1, %xmm21
7494; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm11[u,u,u,u,u,1,7,13],zero,zero,zero,xmm11[5,11],zero,zero,zero
7495; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u],zero,zero,zero,xmm7[3,9,15],zero,zero,xmm7[1,7,13]
7496; AVX512-NEXT:    vporq %xmm0, %xmm1, %xmm28
7497; AVX512-NEXT:    vmovdqa64 %ymm25, %ymm11
7498; AVX512-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm26 ^ (ymm12 & (ymm11 ^ ymm26))
7499; AVX512-NEXT:    vextracti128 $1, %ymm11, %xmm0
7500; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,xmm0[0,6,12],zero,zero,zero,xmm0[4,10,u,u,u,u,u,u]
7501; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u]
7502; AVX512-NEXT:    vpshufb %xmm3, %xmm11, %xmm2
7503; AVX512-NEXT:    vmovdqa64 %xmm3, %xmm25
7504; AVX512-NEXT:    vporq %xmm1, %xmm2, %xmm26
7505; AVX512-NEXT:    vmovdqa64 %ymm18, %ymm14
7506; AVX512-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm24 ^ (ymm9 & (ymm14 ^ ymm24))
7507; AVX512-NEXT:    vextracti128 $1, %ymm14, %xmm10
7508; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14]
7509; AVX512-NEXT:    vpshufb %xmm1, %xmm10, %xmm2
7510; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128]
7511; AVX512-NEXT:    vpshufb %xmm3, %xmm14, %xmm4
7512; AVX512-NEXT:    vporq %xmm2, %xmm4, %xmm27
7513; AVX512-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm30 ^ (ymm12 & (ymm31 ^ ymm30))
7514; AVX512-NEXT:    vmovdqa %ymm5, %ymm4
7515; AVX512-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm6 ^ ymm23))
7516; AVX512-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm29 ^ (ymm9 & (ymm22 ^ ymm29))
7517; AVX512-NEXT:    vextracti128 $1, %ymm9, %xmm8
7518; AVX512-NEXT:    vpshufb %xmm1, %xmm8, %xmm1
7519; AVX512-NEXT:    vpshufb %xmm3, %xmm9, %xmm2
7520; AVX512-NEXT:    vpor %xmm1, %xmm2, %xmm7
7521; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u]
7522; AVX512-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
7523; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm22
7524; AVX512-NEXT:    vmovdqa {{.*#+}} xmm13 = [5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u]
7525; AVX512-NEXT:    vpshufb %xmm13, %xmm11, %xmm1
7526; AVX512-NEXT:    vpor %xmm0, %xmm1, %xmm3
7527; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,128,128,128,5,11,128,128,128,3,9,15]
7528; AVX512-NEXT:    vpshufb %xmm2, %xmm10, %xmm1
7529; AVX512-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,3,9,15,128,128,1,7,13,128,128,128]
7530; AVX512-NEXT:    vpshufb %xmm0, %xmm14, %xmm10
7531; AVX512-NEXT:    vpor %xmm1, %xmm10, %xmm10
7532; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,4,10,0,6,12,18,24,30,20,26,128,128,128,128,128,128,128,128,128,128,128]
7533; AVX512-NEXT:    vpshufb %ymm1, %ymm4, %ymm11
7534; AVX512-NEXT:    vmovdqa64 {{.*#+}} ymm18 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255]
7535; AVX512-NEXT:    vpternlogq {{.*#+}} ymm16 = (ymm16 & ymm18) | ymm11
7536; AVX512-NEXT:    vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,5,11,1,7,13,19,25,31,21,27,128,128,128,128,128,128,128,128,128,128,128]
7537; AVX512-NEXT:    vpshufb %ymm11, %ymm4, %ymm4
7538; AVX512-NEXT:    vpternlogq {{.*#+}} ymm21 = (ymm21 & ymm18) | ymm4
7539; AVX512-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload
7540; AVX512-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm20 ^ (ymm5 & (ymm19 ^ ymm20))
7541; AVX512-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
7542; AVX512-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm4 & ymm18)
7543; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm4
7544; AVX512-NEXT:    vpshufb %ymm11, %ymm5, %ymm5
7545; AVX512-NEXT:    vextracti128 $1, %ymm12, %xmm1
7546; AVX512-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[4,10,u,u,u,u,u,u]
7547; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm14
7548; AVX512-NEXT:    vpshufb %xmm14, %xmm12, %xmm14
7549; AVX512-NEXT:    vpor %xmm11, %xmm14, %xmm11
7550; AVX512-NEXT:    vpshufb %xmm2, %xmm8, %xmm2
7551; AVX512-NEXT:    vpshufb %xmm0, %xmm9, %xmm0
7552; AVX512-NEXT:    vpor %xmm2, %xmm0, %xmm0
7553; AVX512-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
7554; AVX512-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm23 ^ (ymm2 & (ymm6 ^ ymm23))
7555; AVX512-NEXT:    vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u]
7556; AVX512-NEXT:    vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3,4],xmm8[5,6,7]
7557; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
7558; AVX512-NEXT:    vinserti32x4 $1, %xmm28, %ymm0, %ymm9
7559; AVX512-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm9 & ymm18)
7560; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm5
7561; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm20 ^ (ymm2 & (ymm19 ^ ymm20))
7562; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
7563; AVX512-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
7564; AVX512-NEXT:    vpshufb {{.*#+}} ymm11 = ymm2[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7565; AVX512-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm7 & ~ymm9)
7566; AVX512-NEXT:    vinserti64x4 $1, %ymm11, %zmm0, %zmm7
7567; AVX512-NEXT:    vinserti32x4 $1, %xmm27, %ymm0, %ymm11
7568; AVX512-NEXT:    vinserti32x4 $2, %xmm26, %zmm11, %zmm11
7569; AVX512-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm11 ^ (zmm9 & (zmm8 ^ zmm11))
7570; AVX512-NEXT:    vpmovsxdq {{.*#+}} zmm11 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0]
7571; AVX512-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm11 & (zmm7 ^ zmm8))
7572; AVX512-NEXT:    vmovdqa64 %xmm22, %xmm8
7573; AVX512-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
7574; AVX512-NEXT:    vpshufb %xmm13, %xmm12, %xmm8
7575; AVX512-NEXT:    vpor %xmm1, %xmm8, %xmm1
7576; AVX512-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u]
7577; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5,6,7]
7578; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7]
7579; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
7580; AVX512-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7581; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm0 & ~ymm9)
7582; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
7583; AVX512-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm2
7584; AVX512-NEXT:    vinserti32x4 $2, %xmm3, %zmm2, %zmm2
7585; AVX512-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm9 & (zmm1 ^ zmm2))
7586; AVX512-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm11 & (zmm0 ^ zmm1))
7587; AVX512-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
7588; AVX512-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload
7589; AVX512-NEXT:    vpmovsxwd {{.*#+}} zmm2 = [0,0,0,0,0,4294967040,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295]
7590; AVX512-NEXT:    vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm2 & (zmm16 ^ zmm1))
7591; AVX512-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
7592; AVX512-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload
7593; AVX512-NEXT:    vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm2 & (zmm21 ^ zmm1))
7594; AVX512-NEXT:    vpmovsxdq {{.*#+}} zmm1 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,16777215,0,0]
7595; AVX512-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm1 & (zmm4 ^ zmm16))
7596; AVX512-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm1 & (zmm5 ^ zmm21))
7597; AVX512-NEXT:    vmovdqa64 %zmm15, (%rsi)
7598; AVX512-NEXT:    vmovdqa64 %zmm17, (%rdx)
7599; AVX512-NEXT:    vmovdqa64 %zmm4, (%rcx)
7600; AVX512-NEXT:    vmovdqa64 %zmm5, (%r8)
7601; AVX512-NEXT:    vmovdqa64 %zmm7, (%r9)
7602; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
7603; AVX512-NEXT:    vmovdqa64 %zmm0, (%rax)
7604; AVX512-NEXT:    addq $40, %rsp
7605; AVX512-NEXT:    vzeroupper
7606; AVX512-NEXT:    retq
7607;
7608; AVX512-FCP-LABEL: load_i8_stride6_vf64:
7609; AVX512-FCP:       # %bb.0:
7610; AVX512-FCP-NEXT:    subq $40, %rsp
7611; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
7612; AVX512-FCP-NEXT:    vmovdqa64 224(%rdi), %ymm25
7613; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm26
7614; AVX512-FCP-NEXT:    vmovdqa %ymm12, %ymm0
7615; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm25 ^ (ymm0 & (ymm26 ^ ymm25))
7616; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
7617; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm3
7618; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u]
7619; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm4
7620; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm4, %xmm6
7621; AVX512-FCP-NEXT:    vpor %xmm3, %xmm6, %xmm9
7622; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %ymm30
7623; AVX512-FCP-NEXT:    vmovdqa64 32(%rdi), %ymm31
7624; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %ymm24
7625; AVX512-FCP-NEXT:    vmovdqa64 160(%rdi), %ymm18
7626; AVX512-FCP-NEXT:    vmovdqa %ymm12, %ymm6
7627; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm24 ^ (ymm6 & (ymm18 ^ ymm24))
7628; AVX512-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm7
7629; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10]
7630; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm7, %xmm10
7631; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,u,4,10,128,128,128,2,8,14,128,128]
7632; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm6, %xmm13
7633; AVX512-FCP-NEXT:    vpor %xmm10, %xmm13, %xmm10
7634; AVX512-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
7635; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm9, %zmm10, %zmm2
7636; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7637; AVX512-FCP-NEXT:    vmovdqa %ymm12, %ymm9
7638; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm31 ^ (ymm9 & (ymm30 ^ ymm31))
7639; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm9, %xmm1
7640; AVX512-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm13
7641; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm13, %xmm5
7642; AVX512-FCP-NEXT:    vporq %xmm1, %xmm5, %xmm17
7643; AVX512-FCP-NEXT:    vmovdqa64 320(%rdi), %ymm29
7644; AVX512-FCP-NEXT:    vmovdqa64 352(%rdi), %ymm22
7645; AVX512-FCP-NEXT:    vmovdqa %ymm12, %ymm1
7646; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm29 ^ (ymm1 & (ymm22 ^ ymm29))
7647; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm5
7648; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm5, %xmm3
7649; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm1, %xmm8
7650; AVX512-FCP-NEXT:    vpor %xmm3, %xmm8, %xmm3
7651; AVX512-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
7652; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u]
7653; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm0, %xmm0
7654; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u]
7655; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm4, %xmm4
7656; AVX512-FCP-NEXT:    vpor %xmm0, %xmm4, %xmm0
7657; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7658; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,u,128,128,1,7,13,128,128,128,5,11]
7659; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm7, %xmm4
7660; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,5,11,128,128,128,3,9,15,128,128]
7661; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm6, %xmm6
7662; AVX512-FCP-NEXT:    vporq %xmm4, %xmm6, %xmm28
7663; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm9, %xmm4
7664; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm13, %xmm6
7665; AVX512-FCP-NEXT:    vporq %xmm4, %xmm6, %xmm21
7666; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
7667; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm1, %xmm1
7668; AVX512-FCP-NEXT:    vporq %xmm0, %xmm1, %xmm27
7669; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u]
7670; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
7671; AVX512-FCP-NEXT:    vmovdqa %ymm9, %ymm4
7672; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm26 ^ (ymm4 & (ymm25 ^ ymm26))
7673; AVX512-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm15
7674; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm15, %xmm1
7675; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u]
7676; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm4, %xmm5
7677; AVX512-FCP-NEXT:    vpor %xmm1, %xmm5, %xmm1
7678; AVX512-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7679; AVX512-FCP-NEXT:    vmovdqa %ymm12, %ymm5
7680; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm18 ^ (ymm5 & (ymm24 ^ ymm18))
7681; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128]
7682; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm5, %xmm7
7683; AVX512-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm1
7684; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [u,u,u,u,u,128,128,128,2,8,14,128,128,0,6,12]
7685; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm1, %xmm13
7686; AVX512-FCP-NEXT:    vpor %xmm7, %xmm13, %xmm2
7687; AVX512-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7688; AVX512-FCP-NEXT:    vmovdqa %ymm9, %ymm13
7689; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm30 ^ (ymm13 & (ymm31 ^ ymm30))
7690; AVX512-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm14
7691; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm14, %xmm0
7692; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm13, %xmm6
7693; AVX512-FCP-NEXT:    vporq %xmm0, %xmm6, %xmm16
7694; AVX512-FCP-NEXT:    vmovdqa %ymm12, %ymm11
7695; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm22 ^ (ymm11 & (ymm29 ^ ymm22))
7696; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm11, %xmm8
7697; AVX512-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm7
7698; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm7, %xmm10
7699; AVX512-FCP-NEXT:    vpor %xmm8, %xmm10, %xmm0
7700; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7701; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u]
7702; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm15, %xmm15
7703; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u]
7704; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm4, %xmm4
7705; AVX512-FCP-NEXT:    vpor %xmm4, %xmm15, %xmm0
7706; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7707; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero
7708; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,zero,xmm1[3,9,15],zero,zero,xmm1[1,7,13]
7709; AVX512-FCP-NEXT:    vpor %xmm1, %xmm15, %xmm0
7710; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7711; AVX512-FCP-NEXT:    vmovdqa 256(%rdi), %ymm1
7712; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} ymm19 = ymm1[2,3],mem[2,3]
7713; AVX512-FCP-NEXT:    vinserti32x4 $1, 288(%rdi), %ymm1, %ymm20
7714; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0]
7715; AVX512-FCP-NEXT:    vmovdqa %ymm5, %ymm1
7716; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm19 ^ (ymm1 & (ymm20 ^ ymm19))
7717; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u]
7718; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm15[0,1,2],ymm3[3,4,5,6,7],ymm15[8,9,10],ymm3[11,12,13,14,15]
7719; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm3[4,5,6,7]
7720; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %ymm23
7721; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} ymm6 = ymm23[2,3],mem[2,3]
7722; AVX512-FCP-NEXT:    vinserti32x4 $1, 96(%rdi), %ymm23, %ymm23
7723; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm0, %zmm15
7724; AVX512-FCP-NEXT:    vmovdqa %ymm5, %ymm2
7725; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm6 ^ (ymm2 & (ymm23 ^ ymm6))
7726; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255]
7727; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7728; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm17 & ymm4)
7729; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
7730; AVX512-FCP-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload
7731; AVX512-FCP-NEXT:    # zmm0 = mem ^ (zmm17 & (zmm0 ^ mem))
7732; AVX512-FCP-NEXT:    vpmovsxdq {{.*#+}} zmm3 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,16777215,0,0]
7733; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm3 & (zmm15 ^ zmm0))
7734; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u]
7735; AVX512-FCP-NEXT:    vinserti32x4 $1, %xmm27, %ymm0, %ymm1
7736; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
7737; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
7738; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7739; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm21 & ymm4)
7740; AVX512-FCP-NEXT:    vinserti32x4 $1, %xmm28, %ymm0, %ymm2
7741; AVX512-FCP-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload
7742; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm17 & (zmm1 ^ zmm2))
7743; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm17
7744; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm17 = zmm17 ^ (zmm3 & (zmm17 ^ zmm1))
7745; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm14, %xmm0
7746; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm13, %xmm1
7747; AVX512-FCP-NEXT:    vporq %xmm0, %xmm1, %xmm21
7748; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm11[u,u,u,u,u,1,7,13],zero,zero,zero,xmm11[5,11],zero,zero,zero
7749; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u],zero,zero,zero,xmm7[3,9,15],zero,zero,xmm7[1,7,13]
7750; AVX512-FCP-NEXT:    vporq %xmm0, %xmm1, %xmm28
7751; AVX512-FCP-NEXT:    vmovdqa64 %ymm25, %ymm11
7752; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm26 ^ (ymm12 & (ymm11 ^ ymm26))
7753; AVX512-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm0
7754; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,xmm0[0,6,12],zero,zero,zero,xmm0[4,10,u,u,u,u,u,u]
7755; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u]
7756; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm11, %xmm2
7757; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm25
7758; AVX512-FCP-NEXT:    vporq %xmm1, %xmm2, %xmm26
7759; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, %ymm14
7760; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm24 ^ (ymm9 & (ymm14 ^ ymm24))
7761; AVX512-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm10
7762; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14]
7763; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm10, %xmm2
7764; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128]
7765; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm14, %xmm4
7766; AVX512-FCP-NEXT:    vporq %xmm2, %xmm4, %xmm27
7767; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm30 ^ (ymm12 & (ymm31 ^ ymm30))
7768; AVX512-FCP-NEXT:    vmovdqa %ymm5, %ymm4
7769; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm6 ^ ymm23))
7770; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm29 ^ (ymm9 & (ymm22 ^ ymm29))
7771; AVX512-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm8
7772; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm8, %xmm1
7773; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm9, %xmm2
7774; AVX512-FCP-NEXT:    vpor %xmm1, %xmm2, %xmm7
7775; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u]
7776; AVX512-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
7777; AVX512-FCP-NEXT:    vmovdqa64 %xmm1, %xmm22
7778; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u]
7779; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm11, %xmm1
7780; AVX512-FCP-NEXT:    vpor %xmm0, %xmm1, %xmm3
7781; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,128,128,128,5,11,128,128,128,3,9,15]
7782; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm10, %xmm1
7783; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,3,9,15,128,128,1,7,13,128,128,128]
7784; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm14, %xmm10
7785; AVX512-FCP-NEXT:    vpor %xmm1, %xmm10, %xmm10
7786; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,4,10,0,6,12,18,24,30,20,26,128,128,128,128,128,128,128,128,128,128,128]
7787; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm4, %ymm11
7788; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm18 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255]
7789; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm16 = (ymm16 & ymm18) | ymm11
7790; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,5,11,1,7,13,19,25,31,21,27,128,128,128,128,128,128,128,128,128,128,128]
7791; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm4, %ymm4
7792; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm21 = (ymm21 & ymm18) | ymm4
7793; AVX512-FCP-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload
7794; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm20 ^ (ymm5 & (ymm19 ^ ymm20))
7795; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
7796; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm4 & ymm18)
7797; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm4
7798; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm5, %ymm5
7799; AVX512-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm1
7800; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[4,10,u,u,u,u,u,u]
7801; AVX512-FCP-NEXT:    vmovdqa64 %xmm25, %xmm14
7802; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm12, %xmm14
7803; AVX512-FCP-NEXT:    vpor %xmm11, %xmm14, %xmm11
7804; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm8, %xmm2
7805; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm9, %xmm0
7806; AVX512-FCP-NEXT:    vpor %xmm2, %xmm0, %xmm0
7807; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
7808; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm23 ^ (ymm2 & (ymm6 ^ ymm23))
7809; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u]
7810; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3,4],xmm8[5,6,7]
7811; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
7812; AVX512-FCP-NEXT:    vinserti32x4 $1, %xmm28, %ymm0, %ymm9
7813; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm9 & ymm18)
7814; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm5
7815; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm20 ^ (ymm2 & (ymm19 ^ ymm20))
7816; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
7817; AVX512-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
7818; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm2[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7819; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm7 & ~ymm9)
7820; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm0, %zmm7
7821; AVX512-FCP-NEXT:    vinserti32x4 $1, %xmm27, %ymm0, %ymm11
7822; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm26, %zmm11, %zmm11
7823; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm11 ^ (zmm9 & (zmm8 ^ zmm11))
7824; AVX512-FCP-NEXT:    vpmovsxdq {{.*#+}} zmm11 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0]
7825; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm11 & (zmm7 ^ zmm8))
7826; AVX512-FCP-NEXT:    vmovdqa64 %xmm22, %xmm8
7827; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
7828; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm12, %xmm8
7829; AVX512-FCP-NEXT:    vpor %xmm1, %xmm8, %xmm1
7830; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u]
7831; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5,6,7]
7832; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7]
7833; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
7834; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7835; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm0 & ~ymm9)
7836; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
7837; AVX512-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm2
7838; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm3, %zmm2, %zmm2
7839; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm9 & (zmm1 ^ zmm2))
7840; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm11 & (zmm0 ^ zmm1))
7841; AVX512-FCP-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
7842; AVX512-FCP-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload
7843; AVX512-FCP-NEXT:    vpmovsxwd {{.*#+}} zmm2 = [0,0,0,0,0,4294967040,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295]
7844; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm2 & (zmm16 ^ zmm1))
7845; AVX512-FCP-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
7846; AVX512-FCP-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload
7847; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm2 & (zmm21 ^ zmm1))
7848; AVX512-FCP-NEXT:    vpmovsxdq {{.*#+}} zmm1 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,16777215,0,0]
7849; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm1 & (zmm4 ^ zmm16))
7850; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm1 & (zmm5 ^ zmm21))
7851; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, (%rsi)
7852; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, (%rdx)
7853; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, (%rcx)
7854; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, (%r8)
7855; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, (%r9)
7856; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
7857; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
7858; AVX512-FCP-NEXT:    addq $40, %rsp
7859; AVX512-FCP-NEXT:    vzeroupper
7860; AVX512-FCP-NEXT:    retq
7861;
7862; AVX512DQ-LABEL: load_i8_stride6_vf64:
7863; AVX512DQ:       # %bb.0:
7864; AVX512DQ-NEXT:    subq $40, %rsp
7865; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
7866; AVX512DQ-NEXT:    vmovdqa64 224(%rdi), %ymm25
7867; AVX512DQ-NEXT:    vmovdqa64 192(%rdi), %ymm26
7868; AVX512DQ-NEXT:    vmovdqa %ymm12, %ymm0
7869; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm25 ^ (ymm0 & (ymm26 ^ ymm25))
7870; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
7871; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm0, %xmm3
7872; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm5 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u]
7873; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm4
7874; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm4, %xmm6
7875; AVX512DQ-NEXT:    vpor %xmm3, %xmm6, %xmm9
7876; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %ymm30
7877; AVX512DQ-NEXT:    vmovdqa64 32(%rdi), %ymm31
7878; AVX512DQ-NEXT:    vmovdqa64 128(%rdi), %ymm24
7879; AVX512DQ-NEXT:    vmovdqa64 160(%rdi), %ymm18
7880; AVX512DQ-NEXT:    vmovdqa %ymm12, %ymm6
7881; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm24 ^ (ymm6 & (ymm18 ^ ymm24))
7882; AVX512DQ-NEXT:    vextracti128 $1, %ymm6, %xmm7
7883; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10]
7884; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm7, %xmm10
7885; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,u,4,10,128,128,128,2,8,14,128,128]
7886; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm6, %xmm13
7887; AVX512DQ-NEXT:    vpor %xmm10, %xmm13, %xmm10
7888; AVX512DQ-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
7889; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm9, %zmm10, %zmm2
7890; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7891; AVX512DQ-NEXT:    vmovdqa %ymm12, %ymm9
7892; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm31 ^ (ymm9 & (ymm30 ^ ymm31))
7893; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm9, %xmm1
7894; AVX512DQ-NEXT:    vextracti128 $1, %ymm9, %xmm13
7895; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm13, %xmm5
7896; AVX512DQ-NEXT:    vporq %xmm1, %xmm5, %xmm17
7897; AVX512DQ-NEXT:    vmovdqa64 320(%rdi), %ymm29
7898; AVX512DQ-NEXT:    vmovdqa64 352(%rdi), %ymm22
7899; AVX512DQ-NEXT:    vmovdqa %ymm12, %ymm1
7900; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm29 ^ (ymm1 & (ymm22 ^ ymm29))
7901; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm5
7902; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm5, %xmm3
7903; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm1, %xmm8
7904; AVX512DQ-NEXT:    vpor %xmm3, %xmm8, %xmm3
7905; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
7906; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm8 = [1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u]
7907; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm0, %xmm0
7908; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm10 = [128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u]
7909; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm4, %xmm4
7910; AVX512DQ-NEXT:    vpor %xmm0, %xmm4, %xmm0
7911; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7912; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,u,128,128,1,7,13,128,128,128,5,11]
7913; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm7, %xmm4
7914; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,5,11,128,128,128,3,9,15,128,128]
7915; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm6, %xmm6
7916; AVX512DQ-NEXT:    vporq %xmm4, %xmm6, %xmm28
7917; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm9, %xmm4
7918; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm13, %xmm6
7919; AVX512DQ-NEXT:    vporq %xmm4, %xmm6, %xmm21
7920; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
7921; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm1, %xmm1
7922; AVX512DQ-NEXT:    vporq %xmm0, %xmm1, %xmm27
7923; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm0 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u]
7924; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
7925; AVX512DQ-NEXT:    vmovdqa %ymm9, %ymm4
7926; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm26 ^ (ymm4 & (ymm25 ^ ymm26))
7927; AVX512DQ-NEXT:    vextracti128 $1, %ymm4, %xmm15
7928; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm15, %xmm1
7929; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm6 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u]
7930; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm4, %xmm5
7931; AVX512DQ-NEXT:    vpor %xmm1, %xmm5, %xmm1
7932; AVX512DQ-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7933; AVX512DQ-NEXT:    vmovdqa %ymm12, %ymm5
7934; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm18 ^ (ymm5 & (ymm24 ^ ymm18))
7935; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128]
7936; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm5, %xmm7
7937; AVX512DQ-NEXT:    vextracti128 $1, %ymm5, %xmm1
7938; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm10 = [u,u,u,u,u,128,128,128,2,8,14,128,128,0,6,12]
7939; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm1, %xmm13
7940; AVX512DQ-NEXT:    vpor %xmm7, %xmm13, %xmm2
7941; AVX512DQ-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7942; AVX512DQ-NEXT:    vmovdqa %ymm9, %ymm13
7943; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm30 ^ (ymm13 & (ymm31 ^ ymm30))
7944; AVX512DQ-NEXT:    vextracti128 $1, %ymm13, %xmm14
7945; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm14, %xmm0
7946; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm13, %xmm6
7947; AVX512DQ-NEXT:    vporq %xmm0, %xmm6, %xmm16
7948; AVX512DQ-NEXT:    vmovdqa %ymm12, %ymm11
7949; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm22 ^ (ymm11 & (ymm29 ^ ymm22))
7950; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm11, %xmm8
7951; AVX512DQ-NEXT:    vextracti128 $1, %ymm11, %xmm7
7952; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm7, %xmm10
7953; AVX512DQ-NEXT:    vpor %xmm8, %xmm10, %xmm0
7954; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7955; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm10 = [128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u]
7956; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm15, %xmm15
7957; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm8 = [3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u]
7958; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm4, %xmm4
7959; AVX512DQ-NEXT:    vpor %xmm4, %xmm15, %xmm0
7960; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7961; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero
7962; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,zero,xmm1[3,9,15],zero,zero,xmm1[1,7,13]
7963; AVX512DQ-NEXT:    vpor %xmm1, %xmm15, %xmm0
7964; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7965; AVX512DQ-NEXT:    vmovdqa 256(%rdi), %ymm1
7966; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} ymm19 = ymm1[2,3],mem[2,3]
7967; AVX512DQ-NEXT:    vinserti32x4 $1, 288(%rdi), %ymm1, %ymm20
7968; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0]
7969; AVX512DQ-NEXT:    vmovdqa %ymm5, %ymm1
7970; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm19 ^ (ymm1 & (ymm20 ^ ymm19))
7971; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u]
7972; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm15[0,1,2],ymm3[3,4,5,6,7],ymm15[8,9,10],ymm3[11,12,13,14,15]
7973; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm3[4,5,6,7]
7974; AVX512DQ-NEXT:    vmovdqa64 64(%rdi), %ymm23
7975; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} ymm6 = ymm23[2,3],mem[2,3]
7976; AVX512DQ-NEXT:    vinserti32x4 $1, 96(%rdi), %ymm23, %ymm23
7977; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm15, %zmm0, %zmm15
7978; AVX512DQ-NEXT:    vmovdqa %ymm5, %ymm2
7979; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm6 ^ (ymm2 & (ymm23 ^ ymm6))
7980; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255]
7981; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7982; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm17 & ymm4)
7983; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
7984; AVX512DQ-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload
7985; AVX512DQ-NEXT:    # zmm0 = mem ^ (zmm17 & (zmm0 ^ mem))
7986; AVX512DQ-NEXT:    vpmovsxdq {{.*#+}} zmm3 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,16777215,0,0]
7987; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm3 & (zmm15 ^ zmm0))
7988; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u]
7989; AVX512DQ-NEXT:    vinserti32x4 $1, %xmm27, %ymm0, %ymm1
7990; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
7991; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
7992; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7993; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm21 & ymm4)
7994; AVX512DQ-NEXT:    vinserti32x4 $1, %xmm28, %ymm0, %ymm2
7995; AVX512DQ-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload
7996; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm17 & (zmm1 ^ zmm2))
7997; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm17
7998; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm17 = zmm17 ^ (zmm3 & (zmm17 ^ zmm1))
7999; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm14, %xmm0
8000; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm13, %xmm1
8001; AVX512DQ-NEXT:    vporq %xmm0, %xmm1, %xmm21
8002; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm11[u,u,u,u,u,1,7,13],zero,zero,zero,xmm11[5,11],zero,zero,zero
8003; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u],zero,zero,zero,xmm7[3,9,15],zero,zero,xmm7[1,7,13]
8004; AVX512DQ-NEXT:    vporq %xmm0, %xmm1, %xmm28
8005; AVX512DQ-NEXT:    vmovdqa64 %ymm25, %ymm11
8006; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm26 ^ (ymm12 & (ymm11 ^ ymm26))
8007; AVX512DQ-NEXT:    vextracti128 $1, %ymm11, %xmm0
8008; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,xmm0[0,6,12],zero,zero,zero,xmm0[4,10,u,u,u,u,u,u]
8009; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u]
8010; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm11, %xmm2
8011; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm25
8012; AVX512DQ-NEXT:    vporq %xmm1, %xmm2, %xmm26
8013; AVX512DQ-NEXT:    vmovdqa64 %ymm18, %ymm14
8014; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm24 ^ (ymm9 & (ymm14 ^ ymm24))
8015; AVX512DQ-NEXT:    vextracti128 $1, %ymm14, %xmm10
8016; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14]
8017; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm10, %xmm2
8018; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128]
8019; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm14, %xmm4
8020; AVX512DQ-NEXT:    vporq %xmm2, %xmm4, %xmm27
8021; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm30 ^ (ymm12 & (ymm31 ^ ymm30))
8022; AVX512DQ-NEXT:    vmovdqa %ymm5, %ymm4
8023; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm6 ^ ymm23))
8024; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm29 ^ (ymm9 & (ymm22 ^ ymm29))
8025; AVX512DQ-NEXT:    vextracti128 $1, %ymm9, %xmm8
8026; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm8, %xmm1
8027; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm9, %xmm2
8028; AVX512DQ-NEXT:    vpor %xmm1, %xmm2, %xmm7
8029; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u]
8030; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
8031; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm22
8032; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm13 = [5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u]
8033; AVX512DQ-NEXT:    vpshufb %xmm13, %xmm11, %xmm1
8034; AVX512DQ-NEXT:    vpor %xmm0, %xmm1, %xmm3
8035; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,128,128,128,5,11,128,128,128,3,9,15]
8036; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm10, %xmm1
8037; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,3,9,15,128,128,1,7,13,128,128,128]
8038; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm14, %xmm10
8039; AVX512DQ-NEXT:    vpor %xmm1, %xmm10, %xmm10
8040; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,4,10,0,6,12,18,24,30,20,26,128,128,128,128,128,128,128,128,128,128,128]
8041; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm4, %ymm11
8042; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} ymm18 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255]
8043; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm16 = (ymm16 & ymm18) | ymm11
8044; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,5,11,1,7,13,19,25,31,21,27,128,128,128,128,128,128,128,128,128,128,128]
8045; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm4, %ymm4
8046; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm21 = (ymm21 & ymm18) | ymm4
8047; AVX512DQ-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload
8048; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm20 ^ (ymm5 & (ymm19 ^ ymm20))
8049; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
8050; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm4 & ymm18)
8051; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm4
8052; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm5, %ymm5
8053; AVX512DQ-NEXT:    vextracti128 $1, %ymm12, %xmm1
8054; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[4,10,u,u,u,u,u,u]
8055; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm14
8056; AVX512DQ-NEXT:    vpshufb %xmm14, %xmm12, %xmm14
8057; AVX512DQ-NEXT:    vpor %xmm11, %xmm14, %xmm11
8058; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm8, %xmm2
8059; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm9, %xmm0
8060; AVX512DQ-NEXT:    vpor %xmm2, %xmm0, %xmm0
8061; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
8062; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm23 ^ (ymm2 & (ymm6 ^ ymm23))
8063; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u]
8064; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3,4],xmm8[5,6,7]
8065; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
8066; AVX512DQ-NEXT:    vinserti32x4 $1, %xmm28, %ymm0, %ymm9
8067; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm9 & ymm18)
8068; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm5
8069; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm20 ^ (ymm2 & (ymm19 ^ ymm20))
8070; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
8071; AVX512DQ-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
8072; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm11 = ymm2[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
8073; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm7 & ~ymm9)
8074; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm11, %zmm0, %zmm7
8075; AVX512DQ-NEXT:    vinserti32x4 $1, %xmm27, %ymm0, %ymm11
8076; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm26, %zmm11, %zmm11
8077; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm11 ^ (zmm9 & (zmm8 ^ zmm11))
8078; AVX512DQ-NEXT:    vpmovsxdq {{.*#+}} zmm11 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0]
8079; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm11 & (zmm7 ^ zmm8))
8080; AVX512DQ-NEXT:    vmovdqa64 %xmm22, %xmm8
8081; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
8082; AVX512DQ-NEXT:    vpshufb %xmm13, %xmm12, %xmm8
8083; AVX512DQ-NEXT:    vpor %xmm1, %xmm8, %xmm1
8084; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u]
8085; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5,6,7]
8086; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7]
8087; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
8088; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
8089; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm0 & ~ymm9)
8090; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
8091; AVX512DQ-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm2
8092; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm3, %zmm2, %zmm2
8093; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm9 & (zmm1 ^ zmm2))
8094; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm11 & (zmm0 ^ zmm1))
8095; AVX512DQ-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
8096; AVX512DQ-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload
8097; AVX512DQ-NEXT:    vpmovsxwd {{.*#+}} zmm2 = [0,0,0,0,0,4294967040,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295]
8098; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm2 & (zmm16 ^ zmm1))
8099; AVX512DQ-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
8100; AVX512DQ-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload
8101; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm2 & (zmm21 ^ zmm1))
8102; AVX512DQ-NEXT:    vpmovsxdq {{.*#+}} zmm1 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,16777215,0,0]
8103; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm1 & (zmm4 ^ zmm16))
8104; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm1 & (zmm5 ^ zmm21))
8105; AVX512DQ-NEXT:    vmovdqa64 %zmm15, (%rsi)
8106; AVX512DQ-NEXT:    vmovdqa64 %zmm17, (%rdx)
8107; AVX512DQ-NEXT:    vmovdqa64 %zmm4, (%rcx)
8108; AVX512DQ-NEXT:    vmovdqa64 %zmm5, (%r8)
8109; AVX512DQ-NEXT:    vmovdqa64 %zmm7, (%r9)
8110; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
8111; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%rax)
8112; AVX512DQ-NEXT:    addq $40, %rsp
8113; AVX512DQ-NEXT:    vzeroupper
8114; AVX512DQ-NEXT:    retq
8115;
8116; AVX512DQ-FCP-LABEL: load_i8_stride6_vf64:
8117; AVX512DQ-FCP:       # %bb.0:
8118; AVX512DQ-FCP-NEXT:    subq $40, %rsp
8119; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
8120; AVX512DQ-FCP-NEXT:    vmovdqa64 224(%rdi), %ymm25
8121; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm26
8122; AVX512DQ-FCP-NEXT:    vmovdqa %ymm12, %ymm0
8123; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm25 ^ (ymm0 & (ymm26 ^ ymm25))
8124; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
8125; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm3
8126; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u]
8127; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm4
8128; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm4, %xmm6
8129; AVX512DQ-FCP-NEXT:    vpor %xmm3, %xmm6, %xmm9
8130; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %ymm30
8131; AVX512DQ-FCP-NEXT:    vmovdqa64 32(%rdi), %ymm31
8132; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %ymm24
8133; AVX512DQ-FCP-NEXT:    vmovdqa64 160(%rdi), %ymm18
8134; AVX512DQ-FCP-NEXT:    vmovdqa %ymm12, %ymm6
8135; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm24 ^ (ymm6 & (ymm18 ^ ymm24))
8136; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm7
8137; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10]
8138; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm7, %xmm10
8139; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,u,4,10,128,128,128,2,8,14,128,128]
8140; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm6, %xmm13
8141; AVX512DQ-FCP-NEXT:    vpor %xmm10, %xmm13, %xmm10
8142; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
8143; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm9, %zmm10, %zmm2
8144; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8145; AVX512DQ-FCP-NEXT:    vmovdqa %ymm12, %ymm9
8146; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm31 ^ (ymm9 & (ymm30 ^ ymm31))
8147; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm9, %xmm1
8148; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm13
8149; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm13, %xmm5
8150; AVX512DQ-FCP-NEXT:    vporq %xmm1, %xmm5, %xmm17
8151; AVX512DQ-FCP-NEXT:    vmovdqa64 320(%rdi), %ymm29
8152; AVX512DQ-FCP-NEXT:    vmovdqa64 352(%rdi), %ymm22
8153; AVX512DQ-FCP-NEXT:    vmovdqa %ymm12, %ymm1
8154; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm29 ^ (ymm1 & (ymm22 ^ ymm29))
8155; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm5
8156; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm5, %xmm3
8157; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm1, %xmm8
8158; AVX512DQ-FCP-NEXT:    vpor %xmm3, %xmm8, %xmm3
8159; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
8160; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u]
8161; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm0, %xmm0
8162; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u]
8163; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm4, %xmm4
8164; AVX512DQ-FCP-NEXT:    vpor %xmm0, %xmm4, %xmm0
8165; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8166; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,u,128,128,1,7,13,128,128,128,5,11]
8167; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm7, %xmm4
8168; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,5,11,128,128,128,3,9,15,128,128]
8169; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm6, %xmm6
8170; AVX512DQ-FCP-NEXT:    vporq %xmm4, %xmm6, %xmm28
8171; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm9, %xmm4
8172; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm13, %xmm6
8173; AVX512DQ-FCP-NEXT:    vporq %xmm4, %xmm6, %xmm21
8174; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
8175; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm1, %xmm1
8176; AVX512DQ-FCP-NEXT:    vporq %xmm0, %xmm1, %xmm27
8177; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u]
8178; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
8179; AVX512DQ-FCP-NEXT:    vmovdqa %ymm9, %ymm4
8180; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm26 ^ (ymm4 & (ymm25 ^ ymm26))
8181; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm15
8182; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm15, %xmm1
8183; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u]
8184; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm4, %xmm5
8185; AVX512DQ-FCP-NEXT:    vpor %xmm1, %xmm5, %xmm1
8186; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8187; AVX512DQ-FCP-NEXT:    vmovdqa %ymm12, %ymm5
8188; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm18 ^ (ymm5 & (ymm24 ^ ymm18))
8189; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128]
8190; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm5, %xmm7
8191; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm1
8192; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [u,u,u,u,u,128,128,128,2,8,14,128,128,0,6,12]
8193; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm1, %xmm13
8194; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm13, %xmm2
8195; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8196; AVX512DQ-FCP-NEXT:    vmovdqa %ymm9, %ymm13
8197; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm30 ^ (ymm13 & (ymm31 ^ ymm30))
8198; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm14
8199; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm14, %xmm0
8200; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm13, %xmm6
8201; AVX512DQ-FCP-NEXT:    vporq %xmm0, %xmm6, %xmm16
8202; AVX512DQ-FCP-NEXT:    vmovdqa %ymm12, %ymm11
8203; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm22 ^ (ymm11 & (ymm29 ^ ymm22))
8204; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm11, %xmm8
8205; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm7
8206; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm7, %xmm10
8207; AVX512DQ-FCP-NEXT:    vpor %xmm8, %xmm10, %xmm0
8208; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8209; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u]
8210; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm15, %xmm15
8211; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u]
8212; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm4, %xmm4
8213; AVX512DQ-FCP-NEXT:    vpor %xmm4, %xmm15, %xmm0
8214; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8215; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero
8216; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,zero,xmm1[3,9,15],zero,zero,xmm1[1,7,13]
8217; AVX512DQ-FCP-NEXT:    vpor %xmm1, %xmm15, %xmm0
8218; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8219; AVX512DQ-FCP-NEXT:    vmovdqa 256(%rdi), %ymm1
8220; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} ymm19 = ymm1[2,3],mem[2,3]
8221; AVX512DQ-FCP-NEXT:    vinserti32x4 $1, 288(%rdi), %ymm1, %ymm20
8222; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0]
8223; AVX512DQ-FCP-NEXT:    vmovdqa %ymm5, %ymm1
8224; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm19 ^ (ymm1 & (ymm20 ^ ymm19))
8225; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u]
8226; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm15[0,1,2],ymm3[3,4,5,6,7],ymm15[8,9,10],ymm3[11,12,13,14,15]
8227; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm3[4,5,6,7]
8228; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %ymm23
8229; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} ymm6 = ymm23[2,3],mem[2,3]
8230; AVX512DQ-FCP-NEXT:    vinserti32x4 $1, 96(%rdi), %ymm23, %ymm23
8231; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm0, %zmm15
8232; AVX512DQ-FCP-NEXT:    vmovdqa %ymm5, %ymm2
8233; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm6 ^ (ymm2 & (ymm23 ^ ymm6))
8234; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255]
8235; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
8236; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm17 & ymm4)
8237; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
8238; AVX512DQ-FCP-NEXT:    vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload
8239; AVX512DQ-FCP-NEXT:    # zmm0 = mem ^ (zmm17 & (zmm0 ^ mem))
8240; AVX512DQ-FCP-NEXT:    vpmovsxdq {{.*#+}} zmm3 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,16777215,0,0]
8241; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm3 & (zmm15 ^ zmm0))
8242; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u]
8243; AVX512DQ-FCP-NEXT:    vinserti32x4 $1, %xmm27, %ymm0, %ymm1
8244; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
8245; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
8246; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
8247; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm21 & ymm4)
8248; AVX512DQ-FCP-NEXT:    vinserti32x4 $1, %xmm28, %ymm0, %ymm2
8249; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload
8250; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm17 & (zmm1 ^ zmm2))
8251; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm17
8252; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm17 = zmm17 ^ (zmm3 & (zmm17 ^ zmm1))
8253; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm14, %xmm0
8254; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm13, %xmm1
8255; AVX512DQ-FCP-NEXT:    vporq %xmm0, %xmm1, %xmm21
8256; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm11[u,u,u,u,u,1,7,13],zero,zero,zero,xmm11[5,11],zero,zero,zero
8257; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u],zero,zero,zero,xmm7[3,9,15],zero,zero,xmm7[1,7,13]
8258; AVX512DQ-FCP-NEXT:    vporq %xmm0, %xmm1, %xmm28
8259; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm25, %ymm11
8260; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm26 ^ (ymm12 & (ymm11 ^ ymm26))
8261; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm0
8262; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,xmm0[0,6,12],zero,zero,zero,xmm0[4,10,u,u,u,u,u,u]
8263; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u]
8264; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm11, %xmm2
8265; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm25
8266; AVX512DQ-FCP-NEXT:    vporq %xmm1, %xmm2, %xmm26
8267; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, %ymm14
8268; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm24 ^ (ymm9 & (ymm14 ^ ymm24))
8269; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm10
8270; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14]
8271; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm10, %xmm2
8272; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128]
8273; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm14, %xmm4
8274; AVX512DQ-FCP-NEXT:    vporq %xmm2, %xmm4, %xmm27
8275; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm30 ^ (ymm12 & (ymm31 ^ ymm30))
8276; AVX512DQ-FCP-NEXT:    vmovdqa %ymm5, %ymm4
8277; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm6 ^ ymm23))
8278; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm29 ^ (ymm9 & (ymm22 ^ ymm29))
8279; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm8
8280; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm8, %xmm1
8281; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm9, %xmm2
8282; AVX512DQ-FCP-NEXT:    vpor %xmm1, %xmm2, %xmm7
8283; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u]
8284; AVX512DQ-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
8285; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm1, %xmm22
8286; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u]
8287; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm11, %xmm1
8288; AVX512DQ-FCP-NEXT:    vpor %xmm0, %xmm1, %xmm3
8289; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,128,128,128,5,11,128,128,128,3,9,15]
8290; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm10, %xmm1
8291; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,3,9,15,128,128,1,7,13,128,128,128]
8292; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm14, %xmm10
8293; AVX512DQ-FCP-NEXT:    vpor %xmm1, %xmm10, %xmm10
8294; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,4,10,0,6,12,18,24,30,20,26,128,128,128,128,128,128,128,128,128,128,128]
8295; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm4, %ymm11
8296; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm18 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255]
8297; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm16 = (ymm16 & ymm18) | ymm11
8298; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,5,11,1,7,13,19,25,31,21,27,128,128,128,128,128,128,128,128,128,128,128]
8299; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm4, %ymm4
8300; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm21 = (ymm21 & ymm18) | ymm4
8301; AVX512DQ-FCP-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload
8302; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm20 ^ (ymm5 & (ymm19 ^ ymm20))
8303; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
8304; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm4 & ymm18)
8305; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm4
8306; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm5, %ymm5
8307; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm1
8308; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[4,10,u,u,u,u,u,u]
8309; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm25, %xmm14
8310; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm12, %xmm14
8311; AVX512DQ-FCP-NEXT:    vpor %xmm11, %xmm14, %xmm11
8312; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm8, %xmm2
8313; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm9, %xmm0
8314; AVX512DQ-FCP-NEXT:    vpor %xmm2, %xmm0, %xmm0
8315; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
8316; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm23 ^ (ymm2 & (ymm6 ^ ymm23))
8317; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u]
8318; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3,4],xmm8[5,6,7]
8319; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
8320; AVX512DQ-FCP-NEXT:    vinserti32x4 $1, %xmm28, %ymm0, %ymm9
8321; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm9 & ymm18)
8322; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm5
8323; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm20 ^ (ymm2 & (ymm19 ^ ymm20))
8324; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
8325; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
8326; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm2[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
8327; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm7 & ~ymm9)
8328; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm0, %zmm7
8329; AVX512DQ-FCP-NEXT:    vinserti32x4 $1, %xmm27, %ymm0, %ymm11
8330; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm26, %zmm11, %zmm11
8331; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm11 ^ (zmm9 & (zmm8 ^ zmm11))
8332; AVX512DQ-FCP-NEXT:    vpmovsxdq {{.*#+}} zmm11 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0]
8333; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm11 & (zmm7 ^ zmm8))
8334; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm22, %xmm8
8335; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
8336; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm12, %xmm8
8337; AVX512DQ-FCP-NEXT:    vpor %xmm1, %xmm8, %xmm1
8338; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u]
8339; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5,6,7]
8340; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7]
8341; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
8342; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
8343; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm0 & ~ymm9)
8344; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
8345; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm2
8346; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm3, %zmm2, %zmm2
8347; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm9 & (zmm1 ^ zmm2))
8348; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm11 & (zmm0 ^ zmm1))
8349; AVX512DQ-FCP-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
8350; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload
8351; AVX512DQ-FCP-NEXT:    vpmovsxwd {{.*#+}} zmm2 = [0,0,0,0,0,4294967040,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295]
8352; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm2 & (zmm16 ^ zmm1))
8353; AVX512DQ-FCP-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
8354; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload
8355; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm2 & (zmm21 ^ zmm1))
8356; AVX512DQ-FCP-NEXT:    vpmovsxdq {{.*#+}} zmm1 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,16777215,0,0]
8357; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm1 & (zmm4 ^ zmm16))
8358; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm1 & (zmm5 ^ zmm21))
8359; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, (%rsi)
8360; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, (%rdx)
8361; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, (%rcx)
8362; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, (%r8)
8363; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, (%r9)
8364; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
8365; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
8366; AVX512DQ-FCP-NEXT:    addq $40, %rsp
8367; AVX512DQ-FCP-NEXT:    vzeroupper
8368; AVX512DQ-FCP-NEXT:    retq
8369;
8370; AVX512BW-LABEL: load_i8_stride6_vf64:
8371; AVX512BW:       # %bb.0:
8372; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
8373; AVX512BW-NEXT:    vmovdqa 224(%rdi), %ymm0
8374; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %ymm23
8375; AVX512BW-NEXT:    movw $18724, %r10w # imm = 0x4924
8376; AVX512BW-NEXT:    kmovd %r10d, %k1
8377; AVX512BW-NEXT:    vpblendmw %ymm0, %ymm23, %ymm9 {%k1}
8378; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
8379; AVX512BW-NEXT:    vpshufb %xmm2, %xmm9, %xmm1
8380; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u]
8381; AVX512BW-NEXT:    vextracti128 $1, %ymm9, %xmm12
8382; AVX512BW-NEXT:    vpshufb %xmm4, %xmm12, %xmm3
8383; AVX512BW-NEXT:    vpor %xmm1, %xmm3, %xmm5
8384; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm10
8385; AVX512BW-NEXT:    vmovdqa 32(%rdi), %ymm3
8386; AVX512BW-NEXT:    vmovdqa 64(%rdi), %ymm6
8387; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %ymm26
8388; AVX512BW-NEXT:    vmovdqa 160(%rdi), %ymm1
8389; AVX512BW-NEXT:    vpblendmw %ymm26, %ymm1, %ymm15 {%k1}
8390; AVX512BW-NEXT:    vextracti32x4 $1, %ymm15, %xmm16
8391; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} xmm17 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10]
8392; AVX512BW-NEXT:    vpshufb %xmm17, %xmm16, %xmm11
8393; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} xmm18 = [u,u,u,u,u,u,4,10,128,128,128,2,8,14,128,128]
8394; AVX512BW-NEXT:    vpshufb %xmm18, %xmm15, %xmm13
8395; AVX512BW-NEXT:    vpor %xmm11, %xmm13, %xmm11
8396; AVX512BW-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
8397; AVX512BW-NEXT:    vinserti32x4 $2, %xmm5, %zmm11, %zmm11
8398; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],mem[2,3]
8399; AVX512BW-NEXT:    vinserti128 $1, 96(%rdi), %ymm6, %ymm13
8400; AVX512BW-NEXT:    movw $-28124, %r10w # imm = 0x9224
8401; AVX512BW-NEXT:    kmovd %r10d, %k4
8402; AVX512BW-NEXT:    vpblendmw %ymm5, %ymm13, %ymm19 {%k4}
8403; AVX512BW-NEXT:    vpblendmw %ymm3, %ymm10, %ymm20 {%k1}
8404; AVX512BW-NEXT:    vpshufb %xmm2, %xmm20, %xmm2
8405; AVX512BW-NEXT:    vextracti32x4 $1, %ymm20, %xmm21
8406; AVX512BW-NEXT:    vpshufb %xmm4, %xmm21, %xmm4
8407; AVX512BW-NEXT:    vpor %xmm2, %xmm4, %xmm2
8408; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10]
8409; AVX512BW-NEXT:    movl $4192256, %r10d # imm = 0x3FF800
8410; AVX512BW-NEXT:    kmovd %r10d, %k2
8411; AVX512BW-NEXT:    vpshufb %ymm6, %ymm19, %ymm2 {%k2}
8412; AVX512BW-NEXT:    vmovdqu16 %zmm11, %zmm2 {%k2}
8413; AVX512BW-NEXT:    vmovdqa 256(%rdi), %ymm11
8414; AVX512BW-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm11[2,3],mem[2,3]
8415; AVX512BW-NEXT:    vinserti128 $1, 288(%rdi), %ymm11, %ymm14
8416; AVX512BW-NEXT:    vpblendmw %ymm4, %ymm14, %ymm22 {%k4}
8417; AVX512BW-NEXT:    vpshufb %ymm6, %ymm22, %ymm7
8418; AVX512BW-NEXT:    vmovdqa 320(%rdi), %ymm11
8419; AVX512BW-NEXT:    vmovdqa 352(%rdi), %ymm6
8420; AVX512BW-NEXT:    vpblendmw %ymm11, %ymm6, %ymm24 {%k1}
8421; AVX512BW-NEXT:    vextracti32x4 $1, %ymm24, %xmm25
8422; AVX512BW-NEXT:    vpshufb %xmm17, %xmm25, %xmm17
8423; AVX512BW-NEXT:    vpshufb %xmm18, %xmm24, %xmm18
8424; AVX512BW-NEXT:    vporq %xmm17, %xmm18, %xmm17
8425; AVX512BW-NEXT:    vinserti32x4 $1, %xmm17, %ymm0, %ymm8
8426; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm8 = ymm7[0,1,2],ymm8[3,4,5,6,7],ymm7[8,9,10],ymm8[11,12,13,14,15]
8427; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
8428; AVX512BW-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm7
8429; AVX512BW-NEXT:    movabsq $-8796093022208, %rdi # imm = 0xFFFFF80000000000
8430; AVX512BW-NEXT:    kmovq %rdi, %k3
8431; AVX512BW-NEXT:    vmovdqu8 %zmm7, %zmm2 {%k3}
8432; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm7 = [1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u]
8433; AVX512BW-NEXT:    vpshufb %xmm7, %xmm9, %xmm8
8434; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm9 = [128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u]
8435; AVX512BW-NEXT:    vpshufb %xmm9, %xmm12, %xmm12
8436; AVX512BW-NEXT:    vpor %xmm8, %xmm12, %xmm8
8437; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm12 = [u,u,u,u,u,u,128,128,1,7,13,128,128,128,5,11]
8438; AVX512BW-NEXT:    vpshufb %xmm12, %xmm16, %xmm16
8439; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} xmm17 = [u,u,u,u,u,u,5,11,128,128,128,3,9,15,128,128]
8440; AVX512BW-NEXT:    vpshufb %xmm17, %xmm15, %xmm15
8441; AVX512BW-NEXT:    vporq %xmm16, %xmm15, %xmm15
8442; AVX512BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
8443; AVX512BW-NEXT:    vinserti32x4 $2, %xmm8, %zmm15, %zmm8
8444; AVX512BW-NEXT:    vpshufb %xmm7, %xmm20, %xmm7
8445; AVX512BW-NEXT:    vpshufb %xmm9, %xmm21, %xmm9
8446; AVX512BW-NEXT:    vpor %xmm7, %xmm9, %xmm9
8447; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11]
8448; AVX512BW-NEXT:    vpshufb %ymm7, %ymm19, %ymm9 {%k2}
8449; AVX512BW-NEXT:    vmovdqu16 %zmm8, %zmm9 {%k2}
8450; AVX512BW-NEXT:    vpshufb %ymm7, %ymm22, %ymm7
8451; AVX512BW-NEXT:    vpshufb %xmm12, %xmm25, %xmm8
8452; AVX512BW-NEXT:    vpshufb %xmm17, %xmm24, %xmm12
8453; AVX512BW-NEXT:    vpor %xmm8, %xmm12, %xmm8
8454; AVX512BW-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
8455; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm8 = ymm7[0,1,2],ymm8[3,4,5,6,7],ymm7[8,9,10],ymm8[11,12,13,14,15]
8456; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
8457; AVX512BW-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm7
8458; AVX512BW-NEXT:    vmovdqu8 %zmm7, %zmm9 {%k3}
8459; AVX512BW-NEXT:    vpblendmw %ymm13, %ymm5, %ymm15 {%k4}
8460; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm7 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u]
8461; AVX512BW-NEXT:    movw $9362, %di # imm = 0x2492
8462; AVX512BW-NEXT:    kmovd %edi, %k2
8463; AVX512BW-NEXT:    vpblendmw %ymm10, %ymm3, %ymm8 {%k2}
8464; AVX512BW-NEXT:    vextracti32x4 $1, %ymm8, %xmm16
8465; AVX512BW-NEXT:    vpshufb %xmm7, %xmm16, %xmm12
8466; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} xmm17 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u]
8467; AVX512BW-NEXT:    vpshufb %xmm17, %xmm8, %xmm18
8468; AVX512BW-NEXT:    vporq %xmm12, %xmm18, %xmm18
8469; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm19 = [2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12]
8470; AVX512BW-NEXT:    movl $2095104, %edi # imm = 0x1FF800
8471; AVX512BW-NEXT:    kmovd %edi, %k5
8472; AVX512BW-NEXT:    vpshufb %ymm19, %ymm15, %ymm18 {%k5}
8473; AVX512BW-NEXT:    vpblendmw %ymm23, %ymm0, %ymm20 {%k2}
8474; AVX512BW-NEXT:    vextracti32x4 $1, %ymm20, %xmm21
8475; AVX512BW-NEXT:    vpshufb %xmm7, %xmm21, %xmm7
8476; AVX512BW-NEXT:    vpshufb %xmm17, %xmm20, %xmm12
8477; AVX512BW-NEXT:    vpor %xmm7, %xmm12, %xmm7
8478; AVX512BW-NEXT:    vpblendmw %ymm1, %ymm26, %ymm17 {%k1}
8479; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} xmm22 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128]
8480; AVX512BW-NEXT:    vpshufb %xmm22, %xmm17, %xmm12
8481; AVX512BW-NEXT:    vextracti32x4 $1, %ymm17, %xmm24
8482; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} xmm25 = [u,u,u,u,u,128,128,128,2,8,14,128,128,0,6,12]
8483; AVX512BW-NEXT:    vpshufb %xmm25, %xmm24, %xmm27
8484; AVX512BW-NEXT:    vporq %xmm12, %xmm27, %xmm12
8485; AVX512BW-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
8486; AVX512BW-NEXT:    vinserti32x4 $2, %xmm7, %zmm12, %zmm12
8487; AVX512BW-NEXT:    movl $2097151, %edi # imm = 0x1FFFFF
8488; AVX512BW-NEXT:    kmovq %rdi, %k6
8489; AVX512BW-NEXT:    vmovdqu8 %zmm18, %zmm12 {%k6}
8490; AVX512BW-NEXT:    vpblendmw %ymm14, %ymm4, %ymm7 {%k4}
8491; AVX512BW-NEXT:    vpblendmw %ymm6, %ymm11, %ymm18 {%k1}
8492; AVX512BW-NEXT:    vpshufb %xmm22, %xmm18, %xmm22
8493; AVX512BW-NEXT:    vextracti32x4 $1, %ymm18, %xmm27
8494; AVX512BW-NEXT:    vpshufb %xmm25, %xmm27, %xmm25
8495; AVX512BW-NEXT:    vporq %xmm22, %xmm25, %xmm22
8496; AVX512BW-NEXT:    vinserti32x4 $1, %xmm22, %ymm0, %ymm22
8497; AVX512BW-NEXT:    vpshufb %ymm19, %ymm7, %ymm22 {%k5}
8498; AVX512BW-NEXT:    vinserti64x4 $1, %ymm22, %zmm0, %zmm19
8499; AVX512BW-NEXT:    vmovdqu8 %zmm19, %zmm12 {%k3}
8500; AVX512BW-NEXT:    movw $9289, %di # imm = 0x2449
8501; AVX512BW-NEXT:    kmovd %edi, %k4
8502; AVX512BW-NEXT:    vmovdqu16 %ymm14, %ymm4 {%k4}
8503; AVX512BW-NEXT:    vmovdqu16 %ymm13, %ymm5 {%k4}
8504; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm13 = [128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u]
8505; AVX512BW-NEXT:    vpshufb %xmm13, %xmm16, %xmm14
8506; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} xmm16 = [3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u]
8507; AVX512BW-NEXT:    vpshufb %xmm16, %xmm8, %xmm8
8508; AVX512BW-NEXT:    vpor %xmm14, %xmm8, %xmm8
8509; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm14 = [3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13]
8510; AVX512BW-NEXT:    vpshufb %ymm14, %ymm15, %ymm8 {%k5}
8511; AVX512BW-NEXT:    vpshufb %xmm13, %xmm21, %xmm13
8512; AVX512BW-NEXT:    vpshufb %xmm16, %xmm20, %xmm15
8513; AVX512BW-NEXT:    vpor %xmm13, %xmm15, %xmm13
8514; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm15 = [u,u,u,u,u,1,7,13,128,128,128,5,11,128,128,128]
8515; AVX512BW-NEXT:    vpshufb %xmm15, %xmm17, %xmm16
8516; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} xmm17 = [u,u,u,u,u,128,128,128,3,9,15,128,128,1,7,13]
8517; AVX512BW-NEXT:    vpshufb %xmm17, %xmm24, %xmm19
8518; AVX512BW-NEXT:    vporq %xmm16, %xmm19, %xmm16
8519; AVX512BW-NEXT:    vinserti32x4 $1, %xmm16, %ymm0, %ymm16
8520; AVX512BW-NEXT:    vinserti32x4 $2, %xmm13, %zmm16, %zmm13
8521; AVX512BW-NEXT:    vmovdqu8 %zmm8, %zmm13 {%k6}
8522; AVX512BW-NEXT:    vpshufb %xmm15, %xmm18, %xmm8
8523; AVX512BW-NEXT:    vpshufb %xmm17, %xmm27, %xmm15
8524; AVX512BW-NEXT:    vpor %xmm8, %xmm15, %xmm8
8525; AVX512BW-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
8526; AVX512BW-NEXT:    vpshufb %ymm14, %ymm7, %ymm8 {%k5}
8527; AVX512BW-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm7
8528; AVX512BW-NEXT:    vmovdqu8 %zmm7, %zmm13 {%k3}
8529; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14]
8530; AVX512BW-NEXT:    vpshufb %ymm7, %ymm5, %ymm8
8531; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm14 = [128,128,0,6,12,128,128,128,4,10,u,u,u,u,u,u]
8532; AVX512BW-NEXT:    vmovdqu16 %ymm10, %ymm3 {%k1}
8533; AVX512BW-NEXT:    vextracti128 $1, %ymm3, %xmm15
8534; AVX512BW-NEXT:    vpshufb %xmm14, %xmm15, %xmm10
8535; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} xmm16 = [4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u]
8536; AVX512BW-NEXT:    vpshufb %xmm16, %xmm3, %xmm17
8537; AVX512BW-NEXT:    vporq %xmm10, %xmm17, %xmm10
8538; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm8[5,6,7]
8539; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm8[4,5,6,7]
8540; AVX512BW-NEXT:    vmovdqu16 %ymm23, %ymm0 {%k1}
8541; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm8
8542; AVX512BW-NEXT:    vpshufb %xmm14, %xmm8, %xmm14
8543; AVX512BW-NEXT:    vpshufb %xmm16, %xmm0, %xmm16
8544; AVX512BW-NEXT:    vporq %xmm14, %xmm16, %xmm14
8545; AVX512BW-NEXT:    vmovdqu16 %ymm26, %ymm1 {%k2}
8546; AVX512BW-NEXT:    vextracti32x4 $1, %ymm1, %xmm16
8547; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} xmm17 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14]
8548; AVX512BW-NEXT:    vpshufb %xmm17, %xmm16, %xmm18
8549; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} xmm19 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128]
8550; AVX512BW-NEXT:    vpshufb %xmm19, %xmm1, %xmm20
8551; AVX512BW-NEXT:    vporq %xmm18, %xmm20, %xmm18
8552; AVX512BW-NEXT:    vinserti32x4 $1, %xmm18, %ymm0, %ymm18
8553; AVX512BW-NEXT:    vinserti32x4 $2, %xmm14, %zmm18, %zmm14
8554; AVX512BW-NEXT:    movabsq $4398044413952, %rdi # imm = 0x3FFFFE00000
8555; AVX512BW-NEXT:    kmovq %rdi, %k1
8556; AVX512BW-NEXT:    vmovdqu8 %zmm14, %zmm10 {%k1}
8557; AVX512BW-NEXT:    vpshufb %ymm7, %ymm4, %ymm7
8558; AVX512BW-NEXT:    vmovdqu16 %ymm11, %ymm6 {%k2}
8559; AVX512BW-NEXT:    vextracti128 $1, %ymm6, %xmm11
8560; AVX512BW-NEXT:    vpshufb %xmm17, %xmm11, %xmm14
8561; AVX512BW-NEXT:    vpshufb %xmm19, %xmm6, %xmm17
8562; AVX512BW-NEXT:    vporq %xmm14, %xmm17, %xmm14
8563; AVX512BW-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
8564; AVX512BW-NEXT:    movl $-2097152, %edi # imm = 0xFFE00000
8565; AVX512BW-NEXT:    kmovd %edi, %k2
8566; AVX512BW-NEXT:    vmovdqu8 %ymm14, %ymm7 {%k2}
8567; AVX512BW-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm7
8568; AVX512BW-NEXT:    vmovdqu16 %zmm7, %zmm10 {%k2}
8569; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15]
8570; AVX512BW-NEXT:    vpshufb %ymm7, %ymm5, %ymm5
8571; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm14 = [128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u]
8572; AVX512BW-NEXT:    vpshufb %xmm14, %xmm15, %xmm15
8573; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} xmm17 = [5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u]
8574; AVX512BW-NEXT:    vpshufb %xmm17, %xmm3, %xmm3
8575; AVX512BW-NEXT:    vpor %xmm3, %xmm15, %xmm3
8576; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7]
8577; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7]
8578; AVX512BW-NEXT:    vpshufb %xmm14, %xmm8, %xmm5
8579; AVX512BW-NEXT:    vpshufb %xmm17, %xmm0, %xmm0
8580; AVX512BW-NEXT:    vpor %xmm5, %xmm0, %xmm0
8581; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,128,128,128,5,11,128,128,128,3,9,15]
8582; AVX512BW-NEXT:    vpshufb %xmm5, %xmm16, %xmm8
8583; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm14 = [u,u,u,u,u,3,9,15,128,128,1,7,13,128,128,128]
8584; AVX512BW-NEXT:    vpshufb %xmm14, %xmm1, %xmm1
8585; AVX512BW-NEXT:    vpor %xmm1, %xmm8, %xmm1
8586; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
8587; AVX512BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
8588; AVX512BW-NEXT:    vmovdqu8 %zmm0, %zmm3 {%k1}
8589; AVX512BW-NEXT:    vpshufb %ymm7, %ymm4, %ymm0
8590; AVX512BW-NEXT:    vpshufb %xmm5, %xmm11, %xmm1
8591; AVX512BW-NEXT:    vpshufb %xmm14, %xmm6, %xmm4
8592; AVX512BW-NEXT:    vpor %xmm1, %xmm4, %xmm1
8593; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
8594; AVX512BW-NEXT:    vmovdqu8 %ymm1, %ymm0 {%k2}
8595; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
8596; AVX512BW-NEXT:    vmovdqu16 %zmm0, %zmm3 {%k2}
8597; AVX512BW-NEXT:    vmovdqa64 %zmm2, (%rsi)
8598; AVX512BW-NEXT:    vmovdqa64 %zmm9, (%rdx)
8599; AVX512BW-NEXT:    vmovdqa64 %zmm12, (%rcx)
8600; AVX512BW-NEXT:    vmovdqa64 %zmm13, (%r8)
8601; AVX512BW-NEXT:    vmovdqa64 %zmm10, (%r9)
8602; AVX512BW-NEXT:    vmovdqa64 %zmm3, (%rax)
8603; AVX512BW-NEXT:    vzeroupper
8604; AVX512BW-NEXT:    retq
8605;
8606; AVX512BW-FCP-LABEL: load_i8_stride6_vf64:
8607; AVX512BW-FCP:       # %bb.0:
8608; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
8609; AVX512BW-FCP-NEXT:    vmovdqa 224(%rdi), %ymm0
8610; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm23
8611; AVX512BW-FCP-NEXT:    movw $18724, %r10w # imm = 0x4924
8612; AVX512BW-FCP-NEXT:    kmovd %r10d, %k1
8613; AVX512BW-FCP-NEXT:    vpblendmw %ymm0, %ymm23, %ymm9 {%k1}
8614; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
8615; AVX512BW-FCP-NEXT:    vpshufb %xmm2, %xmm9, %xmm1
8616; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u]
8617; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm12
8618; AVX512BW-FCP-NEXT:    vpshufb %xmm4, %xmm12, %xmm3
8619; AVX512BW-FCP-NEXT:    vpor %xmm1, %xmm3, %xmm5
8620; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %ymm10
8621; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm3
8622; AVX512BW-FCP-NEXT:    vmovdqa 64(%rdi), %ymm6
8623; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %ymm26
8624; AVX512BW-FCP-NEXT:    vmovdqa 160(%rdi), %ymm1
8625; AVX512BW-FCP-NEXT:    vpblendmw %ymm26, %ymm1, %ymm15 {%k1}
8626; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm15, %xmm16
8627; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm17 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10]
8628; AVX512BW-FCP-NEXT:    vpshufb %xmm17, %xmm16, %xmm11
8629; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm18 = [u,u,u,u,u,u,4,10,128,128,128,2,8,14,128,128]
8630; AVX512BW-FCP-NEXT:    vpshufb %xmm18, %xmm15, %xmm13
8631; AVX512BW-FCP-NEXT:    vpor %xmm11, %xmm13, %xmm11
8632; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
8633; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm5, %zmm11, %zmm11
8634; AVX512BW-FCP-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],mem[2,3]
8635; AVX512BW-FCP-NEXT:    vinserti128 $1, 96(%rdi), %ymm6, %ymm13
8636; AVX512BW-FCP-NEXT:    movw $-28124, %r10w # imm = 0x9224
8637; AVX512BW-FCP-NEXT:    kmovd %r10d, %k4
8638; AVX512BW-FCP-NEXT:    vpblendmw %ymm5, %ymm13, %ymm19 {%k4}
8639; AVX512BW-FCP-NEXT:    vpblendmw %ymm3, %ymm10, %ymm20 {%k1}
8640; AVX512BW-FCP-NEXT:    vpshufb %xmm2, %xmm20, %xmm2
8641; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm20, %xmm21
8642; AVX512BW-FCP-NEXT:    vpshufb %xmm4, %xmm21, %xmm4
8643; AVX512BW-FCP-NEXT:    vpor %xmm2, %xmm4, %xmm2
8644; AVX512BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10]
8645; AVX512BW-FCP-NEXT:    movl $4192256, %r10d # imm = 0x3FF800
8646; AVX512BW-FCP-NEXT:    kmovd %r10d, %k2
8647; AVX512BW-FCP-NEXT:    vpshufb %ymm6, %ymm19, %ymm2 {%k2}
8648; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm11, %zmm2 {%k2}
8649; AVX512BW-FCP-NEXT:    vmovdqa 256(%rdi), %ymm11
8650; AVX512BW-FCP-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm11[2,3],mem[2,3]
8651; AVX512BW-FCP-NEXT:    vinserti128 $1, 288(%rdi), %ymm11, %ymm14
8652; AVX512BW-FCP-NEXT:    vpblendmw %ymm4, %ymm14, %ymm22 {%k4}
8653; AVX512BW-FCP-NEXT:    vpshufb %ymm6, %ymm22, %ymm7
8654; AVX512BW-FCP-NEXT:    vmovdqa 320(%rdi), %ymm11
8655; AVX512BW-FCP-NEXT:    vmovdqa 352(%rdi), %ymm6
8656; AVX512BW-FCP-NEXT:    vpblendmw %ymm11, %ymm6, %ymm24 {%k1}
8657; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm24, %xmm25
8658; AVX512BW-FCP-NEXT:    vpshufb %xmm17, %xmm25, %xmm17
8659; AVX512BW-FCP-NEXT:    vpshufb %xmm18, %xmm24, %xmm18
8660; AVX512BW-FCP-NEXT:    vporq %xmm17, %xmm18, %xmm17
8661; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm17, %ymm0, %ymm8
8662; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm7[0,1,2],ymm8[3,4,5,6,7],ymm7[8,9,10],ymm8[11,12,13,14,15]
8663; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
8664; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm7
8665; AVX512BW-FCP-NEXT:    movabsq $-8796093022208, %rdi # imm = 0xFFFFF80000000000
8666; AVX512BW-FCP-NEXT:    kmovq %rdi, %k3
8667; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm7, %zmm2 {%k3}
8668; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u]
8669; AVX512BW-FCP-NEXT:    vpshufb %xmm7, %xmm9, %xmm8
8670; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u]
8671; AVX512BW-FCP-NEXT:    vpshufb %xmm9, %xmm12, %xmm12
8672; AVX512BW-FCP-NEXT:    vpor %xmm8, %xmm12, %xmm8
8673; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [u,u,u,u,u,u,128,128,1,7,13,128,128,128,5,11]
8674; AVX512BW-FCP-NEXT:    vpshufb %xmm12, %xmm16, %xmm16
8675; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm17 = [u,u,u,u,u,u,5,11,128,128,128,3,9,15,128,128]
8676; AVX512BW-FCP-NEXT:    vpshufb %xmm17, %xmm15, %xmm15
8677; AVX512BW-FCP-NEXT:    vporq %xmm16, %xmm15, %xmm15
8678; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
8679; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm8, %zmm15, %zmm8
8680; AVX512BW-FCP-NEXT:    vpshufb %xmm7, %xmm20, %xmm7
8681; AVX512BW-FCP-NEXT:    vpshufb %xmm9, %xmm21, %xmm9
8682; AVX512BW-FCP-NEXT:    vpor %xmm7, %xmm9, %xmm9
8683; AVX512BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11]
8684; AVX512BW-FCP-NEXT:    vpshufb %ymm7, %ymm19, %ymm9 {%k2}
8685; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm8, %zmm9 {%k2}
8686; AVX512BW-FCP-NEXT:    vpshufb %ymm7, %ymm22, %ymm7
8687; AVX512BW-FCP-NEXT:    vpshufb %xmm12, %xmm25, %xmm8
8688; AVX512BW-FCP-NEXT:    vpshufb %xmm17, %xmm24, %xmm12
8689; AVX512BW-FCP-NEXT:    vpor %xmm8, %xmm12, %xmm8
8690; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
8691; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm7[0,1,2],ymm8[3,4,5,6,7],ymm7[8,9,10],ymm8[11,12,13,14,15]
8692; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
8693; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm7
8694; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm7, %zmm9 {%k3}
8695; AVX512BW-FCP-NEXT:    vpblendmw %ymm13, %ymm5, %ymm15 {%k4}
8696; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u]
8697; AVX512BW-FCP-NEXT:    movw $9362, %di # imm = 0x2492
8698; AVX512BW-FCP-NEXT:    kmovd %edi, %k2
8699; AVX512BW-FCP-NEXT:    vpblendmw %ymm10, %ymm3, %ymm8 {%k2}
8700; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm8, %xmm16
8701; AVX512BW-FCP-NEXT:    vpshufb %xmm7, %xmm16, %xmm12
8702; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm17 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u]
8703; AVX512BW-FCP-NEXT:    vpshufb %xmm17, %xmm8, %xmm18
8704; AVX512BW-FCP-NEXT:    vporq %xmm12, %xmm18, %xmm18
8705; AVX512BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm19 = [2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12]
8706; AVX512BW-FCP-NEXT:    movl $2095104, %edi # imm = 0x1FF800
8707; AVX512BW-FCP-NEXT:    kmovd %edi, %k5
8708; AVX512BW-FCP-NEXT:    vpshufb %ymm19, %ymm15, %ymm18 {%k5}
8709; AVX512BW-FCP-NEXT:    vpblendmw %ymm23, %ymm0, %ymm20 {%k2}
8710; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm20, %xmm21
8711; AVX512BW-FCP-NEXT:    vpshufb %xmm7, %xmm21, %xmm7
8712; AVX512BW-FCP-NEXT:    vpshufb %xmm17, %xmm20, %xmm12
8713; AVX512BW-FCP-NEXT:    vpor %xmm7, %xmm12, %xmm7
8714; AVX512BW-FCP-NEXT:    vpblendmw %ymm1, %ymm26, %ymm17 {%k1}
8715; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm22 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128]
8716; AVX512BW-FCP-NEXT:    vpshufb %xmm22, %xmm17, %xmm12
8717; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm17, %xmm24
8718; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm25 = [u,u,u,u,u,128,128,128,2,8,14,128,128,0,6,12]
8719; AVX512BW-FCP-NEXT:    vpshufb %xmm25, %xmm24, %xmm27
8720; AVX512BW-FCP-NEXT:    vporq %xmm12, %xmm27, %xmm12
8721; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
8722; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm7, %zmm12, %zmm12
8723; AVX512BW-FCP-NEXT:    movl $2097151, %edi # imm = 0x1FFFFF
8724; AVX512BW-FCP-NEXT:    kmovq %rdi, %k6
8725; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm18, %zmm12 {%k6}
8726; AVX512BW-FCP-NEXT:    vpblendmw %ymm14, %ymm4, %ymm7 {%k4}
8727; AVX512BW-FCP-NEXT:    vpblendmw %ymm6, %ymm11, %ymm18 {%k1}
8728; AVX512BW-FCP-NEXT:    vpshufb %xmm22, %xmm18, %xmm22
8729; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm18, %xmm27
8730; AVX512BW-FCP-NEXT:    vpshufb %xmm25, %xmm27, %xmm25
8731; AVX512BW-FCP-NEXT:    vporq %xmm22, %xmm25, %xmm22
8732; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm22, %ymm0, %ymm22
8733; AVX512BW-FCP-NEXT:    vpshufb %ymm19, %ymm7, %ymm22 {%k5}
8734; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm22, %zmm0, %zmm19
8735; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm19, %zmm12 {%k3}
8736; AVX512BW-FCP-NEXT:    movw $9289, %di # imm = 0x2449
8737; AVX512BW-FCP-NEXT:    kmovd %edi, %k4
8738; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm14, %ymm4 {%k4}
8739; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm13, %ymm5 {%k4}
8740; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u]
8741; AVX512BW-FCP-NEXT:    vpshufb %xmm13, %xmm16, %xmm14
8742; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm16 = [3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u]
8743; AVX512BW-FCP-NEXT:    vpshufb %xmm16, %xmm8, %xmm8
8744; AVX512BW-FCP-NEXT:    vpor %xmm14, %xmm8, %xmm8
8745; AVX512BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm14 = [3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13]
8746; AVX512BW-FCP-NEXT:    vpshufb %ymm14, %ymm15, %ymm8 {%k5}
8747; AVX512BW-FCP-NEXT:    vpshufb %xmm13, %xmm21, %xmm13
8748; AVX512BW-FCP-NEXT:    vpshufb %xmm16, %xmm20, %xmm15
8749; AVX512BW-FCP-NEXT:    vpor %xmm13, %xmm15, %xmm13
8750; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [u,u,u,u,u,1,7,13,128,128,128,5,11,128,128,128]
8751; AVX512BW-FCP-NEXT:    vpshufb %xmm15, %xmm17, %xmm16
8752; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm17 = [u,u,u,u,u,128,128,128,3,9,15,128,128,1,7,13]
8753; AVX512BW-FCP-NEXT:    vpshufb %xmm17, %xmm24, %xmm19
8754; AVX512BW-FCP-NEXT:    vporq %xmm16, %xmm19, %xmm16
8755; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm16, %ymm0, %ymm16
8756; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm13, %zmm16, %zmm13
8757; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm8, %zmm13 {%k6}
8758; AVX512BW-FCP-NEXT:    vpshufb %xmm15, %xmm18, %xmm8
8759; AVX512BW-FCP-NEXT:    vpshufb %xmm17, %xmm27, %xmm15
8760; AVX512BW-FCP-NEXT:    vpor %xmm8, %xmm15, %xmm8
8761; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
8762; AVX512BW-FCP-NEXT:    vpshufb %ymm14, %ymm7, %ymm8 {%k5}
8763; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm7
8764; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm7, %zmm13 {%k3}
8765; AVX512BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14]
8766; AVX512BW-FCP-NEXT:    vpshufb %ymm7, %ymm5, %ymm8
8767; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm14 = [128,128,0,6,12,128,128,128,4,10,u,u,u,u,u,u]
8768; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm10, %ymm3 {%k1}
8769; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm15
8770; AVX512BW-FCP-NEXT:    vpshufb %xmm14, %xmm15, %xmm10
8771; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm16 = [4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u]
8772; AVX512BW-FCP-NEXT:    vpshufb %xmm16, %xmm3, %xmm17
8773; AVX512BW-FCP-NEXT:    vporq %xmm10, %xmm17, %xmm10
8774; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm8[5,6,7]
8775; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm8[4,5,6,7]
8776; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm23, %ymm0 {%k1}
8777; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm8
8778; AVX512BW-FCP-NEXT:    vpshufb %xmm14, %xmm8, %xmm14
8779; AVX512BW-FCP-NEXT:    vpshufb %xmm16, %xmm0, %xmm16
8780; AVX512BW-FCP-NEXT:    vporq %xmm14, %xmm16, %xmm14
8781; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm26, %ymm1 {%k2}
8782; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm1, %xmm16
8783; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm17 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14]
8784; AVX512BW-FCP-NEXT:    vpshufb %xmm17, %xmm16, %xmm18
8785; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm19 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128]
8786; AVX512BW-FCP-NEXT:    vpshufb %xmm19, %xmm1, %xmm20
8787; AVX512BW-FCP-NEXT:    vporq %xmm18, %xmm20, %xmm18
8788; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm18, %ymm0, %ymm18
8789; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm14, %zmm18, %zmm14
8790; AVX512BW-FCP-NEXT:    movabsq $4398044413952, %rdi # imm = 0x3FFFFE00000
8791; AVX512BW-FCP-NEXT:    kmovq %rdi, %k1
8792; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm10 {%k1}
8793; AVX512BW-FCP-NEXT:    vpshufb %ymm7, %ymm4, %ymm7
8794; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm11, %ymm6 {%k2}
8795; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm11
8796; AVX512BW-FCP-NEXT:    vpshufb %xmm17, %xmm11, %xmm14
8797; AVX512BW-FCP-NEXT:    vpshufb %xmm19, %xmm6, %xmm17
8798; AVX512BW-FCP-NEXT:    vporq %xmm14, %xmm17, %xmm14
8799; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
8800; AVX512BW-FCP-NEXT:    movl $-2097152, %edi # imm = 0xFFE00000
8801; AVX512BW-FCP-NEXT:    kmovd %edi, %k2
8802; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm14, %ymm7 {%k2}
8803; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm7
8804; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm7, %zmm10 {%k2}
8805; AVX512BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15]
8806; AVX512BW-FCP-NEXT:    vpshufb %ymm7, %ymm5, %ymm5
8807; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm14 = [128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u]
8808; AVX512BW-FCP-NEXT:    vpshufb %xmm14, %xmm15, %xmm15
8809; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm17 = [5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u]
8810; AVX512BW-FCP-NEXT:    vpshufb %xmm17, %xmm3, %xmm3
8811; AVX512BW-FCP-NEXT:    vpor %xmm3, %xmm15, %xmm3
8812; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7]
8813; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7]
8814; AVX512BW-FCP-NEXT:    vpshufb %xmm14, %xmm8, %xmm5
8815; AVX512BW-FCP-NEXT:    vpshufb %xmm17, %xmm0, %xmm0
8816; AVX512BW-FCP-NEXT:    vpor %xmm5, %xmm0, %xmm0
8817; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,128,128,128,5,11,128,128,128,3,9,15]
8818; AVX512BW-FCP-NEXT:    vpshufb %xmm5, %xmm16, %xmm8
8819; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm14 = [u,u,u,u,u,3,9,15,128,128,1,7,13,128,128,128]
8820; AVX512BW-FCP-NEXT:    vpshufb %xmm14, %xmm1, %xmm1
8821; AVX512BW-FCP-NEXT:    vpor %xmm1, %xmm8, %xmm1
8822; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
8823; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
8824; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm3 {%k1}
8825; AVX512BW-FCP-NEXT:    vpshufb %ymm7, %ymm4, %ymm0
8826; AVX512BW-FCP-NEXT:    vpshufb %xmm5, %xmm11, %xmm1
8827; AVX512BW-FCP-NEXT:    vpshufb %xmm14, %xmm6, %xmm4
8828; AVX512BW-FCP-NEXT:    vpor %xmm1, %xmm4, %xmm1
8829; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
8830; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm1, %ymm0 {%k2}
8831; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
8832; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm0, %zmm3 {%k2}
8833; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, (%rsi)
8834; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, (%rdx)
8835; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm12, (%rcx)
8836; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm13, (%r8)
8837; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, (%r9)
8838; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, (%rax)
8839; AVX512BW-FCP-NEXT:    vzeroupper
8840; AVX512BW-FCP-NEXT:    retq
8841;
8842; AVX512DQ-BW-LABEL: load_i8_stride6_vf64:
8843; AVX512DQ-BW:       # %bb.0:
8844; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
8845; AVX512DQ-BW-NEXT:    vmovdqa 224(%rdi), %ymm0
8846; AVX512DQ-BW-NEXT:    vmovdqa64 192(%rdi), %ymm23
8847; AVX512DQ-BW-NEXT:    movw $18724, %r10w # imm = 0x4924
8848; AVX512DQ-BW-NEXT:    kmovd %r10d, %k1
8849; AVX512DQ-BW-NEXT:    vpblendmw %ymm0, %ymm23, %ymm9 {%k1}
8850; AVX512DQ-BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
8851; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm9, %xmm1
8852; AVX512DQ-BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u]
8853; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm9, %xmm12
8854; AVX512DQ-BW-NEXT:    vpshufb %xmm4, %xmm12, %xmm3
8855; AVX512DQ-BW-NEXT:    vpor %xmm1, %xmm3, %xmm5
8856; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %ymm10
8857; AVX512DQ-BW-NEXT:    vmovdqa 32(%rdi), %ymm3
8858; AVX512DQ-BW-NEXT:    vmovdqa 64(%rdi), %ymm6
8859; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rdi), %ymm26
8860; AVX512DQ-BW-NEXT:    vmovdqa 160(%rdi), %ymm1
8861; AVX512DQ-BW-NEXT:    vpblendmw %ymm26, %ymm1, %ymm15 {%k1}
8862; AVX512DQ-BW-NEXT:    vextracti32x4 $1, %ymm15, %xmm16
8863; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} xmm17 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10]
8864; AVX512DQ-BW-NEXT:    vpshufb %xmm17, %xmm16, %xmm11
8865; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} xmm18 = [u,u,u,u,u,u,4,10,128,128,128,2,8,14,128,128]
8866; AVX512DQ-BW-NEXT:    vpshufb %xmm18, %xmm15, %xmm13
8867; AVX512DQ-BW-NEXT:    vpor %xmm11, %xmm13, %xmm11
8868; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
8869; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm5, %zmm11, %zmm11
8870; AVX512DQ-BW-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],mem[2,3]
8871; AVX512DQ-BW-NEXT:    vinserti128 $1, 96(%rdi), %ymm6, %ymm13
8872; AVX512DQ-BW-NEXT:    movw $-28124, %r10w # imm = 0x9224
8873; AVX512DQ-BW-NEXT:    kmovd %r10d, %k4
8874; AVX512DQ-BW-NEXT:    vpblendmw %ymm5, %ymm13, %ymm19 {%k4}
8875; AVX512DQ-BW-NEXT:    vpblendmw %ymm3, %ymm10, %ymm20 {%k1}
8876; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm20, %xmm2
8877; AVX512DQ-BW-NEXT:    vextracti32x4 $1, %ymm20, %xmm21
8878; AVX512DQ-BW-NEXT:    vpshufb %xmm4, %xmm21, %xmm4
8879; AVX512DQ-BW-NEXT:    vpor %xmm2, %xmm4, %xmm2
8880; AVX512DQ-BW-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10]
8881; AVX512DQ-BW-NEXT:    movl $4192256, %r10d # imm = 0x3FF800
8882; AVX512DQ-BW-NEXT:    kmovd %r10d, %k2
8883; AVX512DQ-BW-NEXT:    vpshufb %ymm6, %ymm19, %ymm2 {%k2}
8884; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm11, %zmm2 {%k2}
8885; AVX512DQ-BW-NEXT:    vmovdqa 256(%rdi), %ymm11
8886; AVX512DQ-BW-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm11[2,3],mem[2,3]
8887; AVX512DQ-BW-NEXT:    vinserti128 $1, 288(%rdi), %ymm11, %ymm14
8888; AVX512DQ-BW-NEXT:    vpblendmw %ymm4, %ymm14, %ymm22 {%k4}
8889; AVX512DQ-BW-NEXT:    vpshufb %ymm6, %ymm22, %ymm7
8890; AVX512DQ-BW-NEXT:    vmovdqa 320(%rdi), %ymm11
8891; AVX512DQ-BW-NEXT:    vmovdqa 352(%rdi), %ymm6
8892; AVX512DQ-BW-NEXT:    vpblendmw %ymm11, %ymm6, %ymm24 {%k1}
8893; AVX512DQ-BW-NEXT:    vextracti32x4 $1, %ymm24, %xmm25
8894; AVX512DQ-BW-NEXT:    vpshufb %xmm17, %xmm25, %xmm17
8895; AVX512DQ-BW-NEXT:    vpshufb %xmm18, %xmm24, %xmm18
8896; AVX512DQ-BW-NEXT:    vporq %xmm17, %xmm18, %xmm17
8897; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm17, %ymm0, %ymm8
8898; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm8 = ymm7[0,1,2],ymm8[3,4,5,6,7],ymm7[8,9,10],ymm8[11,12,13,14,15]
8899; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
8900; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm7
8901; AVX512DQ-BW-NEXT:    movabsq $-8796093022208, %rdi # imm = 0xFFFFF80000000000
8902; AVX512DQ-BW-NEXT:    kmovq %rdi, %k3
8903; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm7, %zmm2 {%k3}
8904; AVX512DQ-BW-NEXT:    vmovdqa {{.*#+}} xmm7 = [1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u]
8905; AVX512DQ-BW-NEXT:    vpshufb %xmm7, %xmm9, %xmm8
8906; AVX512DQ-BW-NEXT:    vmovdqa {{.*#+}} xmm9 = [128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u]
8907; AVX512DQ-BW-NEXT:    vpshufb %xmm9, %xmm12, %xmm12
8908; AVX512DQ-BW-NEXT:    vpor %xmm8, %xmm12, %xmm8
8909; AVX512DQ-BW-NEXT:    vmovdqa {{.*#+}} xmm12 = [u,u,u,u,u,u,128,128,1,7,13,128,128,128,5,11]
8910; AVX512DQ-BW-NEXT:    vpshufb %xmm12, %xmm16, %xmm16
8911; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} xmm17 = [u,u,u,u,u,u,5,11,128,128,128,3,9,15,128,128]
8912; AVX512DQ-BW-NEXT:    vpshufb %xmm17, %xmm15, %xmm15
8913; AVX512DQ-BW-NEXT:    vporq %xmm16, %xmm15, %xmm15
8914; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
8915; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm8, %zmm15, %zmm8
8916; AVX512DQ-BW-NEXT:    vpshufb %xmm7, %xmm20, %xmm7
8917; AVX512DQ-BW-NEXT:    vpshufb %xmm9, %xmm21, %xmm9
8918; AVX512DQ-BW-NEXT:    vpor %xmm7, %xmm9, %xmm9
8919; AVX512DQ-BW-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11]
8920; AVX512DQ-BW-NEXT:    vpshufb %ymm7, %ymm19, %ymm9 {%k2}
8921; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm8, %zmm9 {%k2}
8922; AVX512DQ-BW-NEXT:    vpshufb %ymm7, %ymm22, %ymm7
8923; AVX512DQ-BW-NEXT:    vpshufb %xmm12, %xmm25, %xmm8
8924; AVX512DQ-BW-NEXT:    vpshufb %xmm17, %xmm24, %xmm12
8925; AVX512DQ-BW-NEXT:    vpor %xmm8, %xmm12, %xmm8
8926; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
8927; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm8 = ymm7[0,1,2],ymm8[3,4,5,6,7],ymm7[8,9,10],ymm8[11,12,13,14,15]
8928; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
8929; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm7
8930; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm7, %zmm9 {%k3}
8931; AVX512DQ-BW-NEXT:    vpblendmw %ymm13, %ymm5, %ymm15 {%k4}
8932; AVX512DQ-BW-NEXT:    vmovdqa {{.*#+}} xmm7 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u]
8933; AVX512DQ-BW-NEXT:    movw $9362, %di # imm = 0x2492
8934; AVX512DQ-BW-NEXT:    kmovd %edi, %k2
8935; AVX512DQ-BW-NEXT:    vpblendmw %ymm10, %ymm3, %ymm8 {%k2}
8936; AVX512DQ-BW-NEXT:    vextracti32x4 $1, %ymm8, %xmm16
8937; AVX512DQ-BW-NEXT:    vpshufb %xmm7, %xmm16, %xmm12
8938; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} xmm17 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u]
8939; AVX512DQ-BW-NEXT:    vpshufb %xmm17, %xmm8, %xmm18
8940; AVX512DQ-BW-NEXT:    vporq %xmm12, %xmm18, %xmm18
8941; AVX512DQ-BW-NEXT:    vpbroadcastq {{.*#+}} ymm19 = [2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12]
8942; AVX512DQ-BW-NEXT:    movl $2095104, %edi # imm = 0x1FF800
8943; AVX512DQ-BW-NEXT:    kmovd %edi, %k5
8944; AVX512DQ-BW-NEXT:    vpshufb %ymm19, %ymm15, %ymm18 {%k5}
8945; AVX512DQ-BW-NEXT:    vpblendmw %ymm23, %ymm0, %ymm20 {%k2}
8946; AVX512DQ-BW-NEXT:    vextracti32x4 $1, %ymm20, %xmm21
8947; AVX512DQ-BW-NEXT:    vpshufb %xmm7, %xmm21, %xmm7
8948; AVX512DQ-BW-NEXT:    vpshufb %xmm17, %xmm20, %xmm12
8949; AVX512DQ-BW-NEXT:    vpor %xmm7, %xmm12, %xmm7
8950; AVX512DQ-BW-NEXT:    vpblendmw %ymm1, %ymm26, %ymm17 {%k1}
8951; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} xmm22 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128]
8952; AVX512DQ-BW-NEXT:    vpshufb %xmm22, %xmm17, %xmm12
8953; AVX512DQ-BW-NEXT:    vextracti32x4 $1, %ymm17, %xmm24
8954; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} xmm25 = [u,u,u,u,u,128,128,128,2,8,14,128,128,0,6,12]
8955; AVX512DQ-BW-NEXT:    vpshufb %xmm25, %xmm24, %xmm27
8956; AVX512DQ-BW-NEXT:    vporq %xmm12, %xmm27, %xmm12
8957; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
8958; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm7, %zmm12, %zmm12
8959; AVX512DQ-BW-NEXT:    movl $2097151, %edi # imm = 0x1FFFFF
8960; AVX512DQ-BW-NEXT:    kmovq %rdi, %k6
8961; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm18, %zmm12 {%k6}
8962; AVX512DQ-BW-NEXT:    vpblendmw %ymm14, %ymm4, %ymm7 {%k4}
8963; AVX512DQ-BW-NEXT:    vpblendmw %ymm6, %ymm11, %ymm18 {%k1}
8964; AVX512DQ-BW-NEXT:    vpshufb %xmm22, %xmm18, %xmm22
8965; AVX512DQ-BW-NEXT:    vextracti32x4 $1, %ymm18, %xmm27
8966; AVX512DQ-BW-NEXT:    vpshufb %xmm25, %xmm27, %xmm25
8967; AVX512DQ-BW-NEXT:    vporq %xmm22, %xmm25, %xmm22
8968; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm22, %ymm0, %ymm22
8969; AVX512DQ-BW-NEXT:    vpshufb %ymm19, %ymm7, %ymm22 {%k5}
8970; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm22, %zmm0, %zmm19
8971; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm19, %zmm12 {%k3}
8972; AVX512DQ-BW-NEXT:    movw $9289, %di # imm = 0x2449
8973; AVX512DQ-BW-NEXT:    kmovd %edi, %k4
8974; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm14, %ymm4 {%k4}
8975; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm13, %ymm5 {%k4}
8976; AVX512DQ-BW-NEXT:    vmovdqa {{.*#+}} xmm13 = [128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u]
8977; AVX512DQ-BW-NEXT:    vpshufb %xmm13, %xmm16, %xmm14
8978; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} xmm16 = [3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u]
8979; AVX512DQ-BW-NEXT:    vpshufb %xmm16, %xmm8, %xmm8
8980; AVX512DQ-BW-NEXT:    vpor %xmm14, %xmm8, %xmm8
8981; AVX512DQ-BW-NEXT:    vpbroadcastq {{.*#+}} ymm14 = [3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13]
8982; AVX512DQ-BW-NEXT:    vpshufb %ymm14, %ymm15, %ymm8 {%k5}
8983; AVX512DQ-BW-NEXT:    vpshufb %xmm13, %xmm21, %xmm13
8984; AVX512DQ-BW-NEXT:    vpshufb %xmm16, %xmm20, %xmm15
8985; AVX512DQ-BW-NEXT:    vpor %xmm13, %xmm15, %xmm13
8986; AVX512DQ-BW-NEXT:    vmovdqa {{.*#+}} xmm15 = [u,u,u,u,u,1,7,13,128,128,128,5,11,128,128,128]
8987; AVX512DQ-BW-NEXT:    vpshufb %xmm15, %xmm17, %xmm16
8988; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} xmm17 = [u,u,u,u,u,128,128,128,3,9,15,128,128,1,7,13]
8989; AVX512DQ-BW-NEXT:    vpshufb %xmm17, %xmm24, %xmm19
8990; AVX512DQ-BW-NEXT:    vporq %xmm16, %xmm19, %xmm16
8991; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm16, %ymm0, %ymm16
8992; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm13, %zmm16, %zmm13
8993; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm8, %zmm13 {%k6}
8994; AVX512DQ-BW-NEXT:    vpshufb %xmm15, %xmm18, %xmm8
8995; AVX512DQ-BW-NEXT:    vpshufb %xmm17, %xmm27, %xmm15
8996; AVX512DQ-BW-NEXT:    vpor %xmm8, %xmm15, %xmm8
8997; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
8998; AVX512DQ-BW-NEXT:    vpshufb %ymm14, %ymm7, %ymm8 {%k5}
8999; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm7
9000; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm7, %zmm13 {%k3}
9001; AVX512DQ-BW-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14]
9002; AVX512DQ-BW-NEXT:    vpshufb %ymm7, %ymm5, %ymm8
9003; AVX512DQ-BW-NEXT:    vmovdqa {{.*#+}} xmm14 = [128,128,0,6,12,128,128,128,4,10,u,u,u,u,u,u]
9004; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm10, %ymm3 {%k1}
9005; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm3, %xmm15
9006; AVX512DQ-BW-NEXT:    vpshufb %xmm14, %xmm15, %xmm10
9007; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} xmm16 = [4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u]
9008; AVX512DQ-BW-NEXT:    vpshufb %xmm16, %xmm3, %xmm17
9009; AVX512DQ-BW-NEXT:    vporq %xmm10, %xmm17, %xmm10
9010; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm8[5,6,7]
9011; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm8[4,5,6,7]
9012; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm23, %ymm0 {%k1}
9013; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm0, %xmm8
9014; AVX512DQ-BW-NEXT:    vpshufb %xmm14, %xmm8, %xmm14
9015; AVX512DQ-BW-NEXT:    vpshufb %xmm16, %xmm0, %xmm16
9016; AVX512DQ-BW-NEXT:    vporq %xmm14, %xmm16, %xmm14
9017; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm26, %ymm1 {%k2}
9018; AVX512DQ-BW-NEXT:    vextracti32x4 $1, %ymm1, %xmm16
9019; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} xmm17 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14]
9020; AVX512DQ-BW-NEXT:    vpshufb %xmm17, %xmm16, %xmm18
9021; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} xmm19 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128]
9022; AVX512DQ-BW-NEXT:    vpshufb %xmm19, %xmm1, %xmm20
9023; AVX512DQ-BW-NEXT:    vporq %xmm18, %xmm20, %xmm18
9024; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm18, %ymm0, %ymm18
9025; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm14, %zmm18, %zmm14
9026; AVX512DQ-BW-NEXT:    movabsq $4398044413952, %rdi # imm = 0x3FFFFE00000
9027; AVX512DQ-BW-NEXT:    kmovq %rdi, %k1
9028; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm14, %zmm10 {%k1}
9029; AVX512DQ-BW-NEXT:    vpshufb %ymm7, %ymm4, %ymm7
9030; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm11, %ymm6 {%k2}
9031; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm6, %xmm11
9032; AVX512DQ-BW-NEXT:    vpshufb %xmm17, %xmm11, %xmm14
9033; AVX512DQ-BW-NEXT:    vpshufb %xmm19, %xmm6, %xmm17
9034; AVX512DQ-BW-NEXT:    vporq %xmm14, %xmm17, %xmm14
9035; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
9036; AVX512DQ-BW-NEXT:    movl $-2097152, %edi # imm = 0xFFE00000
9037; AVX512DQ-BW-NEXT:    kmovd %edi, %k2
9038; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm14, %ymm7 {%k2}
9039; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm7
9040; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm7, %zmm10 {%k2}
9041; AVX512DQ-BW-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15]
9042; AVX512DQ-BW-NEXT:    vpshufb %ymm7, %ymm5, %ymm5
9043; AVX512DQ-BW-NEXT:    vmovdqa {{.*#+}} xmm14 = [128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u]
9044; AVX512DQ-BW-NEXT:    vpshufb %xmm14, %xmm15, %xmm15
9045; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} xmm17 = [5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u]
9046; AVX512DQ-BW-NEXT:    vpshufb %xmm17, %xmm3, %xmm3
9047; AVX512DQ-BW-NEXT:    vpor %xmm3, %xmm15, %xmm3
9048; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7]
9049; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7]
9050; AVX512DQ-BW-NEXT:    vpshufb %xmm14, %xmm8, %xmm5
9051; AVX512DQ-BW-NEXT:    vpshufb %xmm17, %xmm0, %xmm0
9052; AVX512DQ-BW-NEXT:    vpor %xmm5, %xmm0, %xmm0
9053; AVX512DQ-BW-NEXT:    vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,128,128,128,5,11,128,128,128,3,9,15]
9054; AVX512DQ-BW-NEXT:    vpshufb %xmm5, %xmm16, %xmm8
9055; AVX512DQ-BW-NEXT:    vmovdqa {{.*#+}} xmm14 = [u,u,u,u,u,3,9,15,128,128,1,7,13,128,128,128]
9056; AVX512DQ-BW-NEXT:    vpshufb %xmm14, %xmm1, %xmm1
9057; AVX512DQ-BW-NEXT:    vpor %xmm1, %xmm8, %xmm1
9058; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
9059; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
9060; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm0, %zmm3 {%k1}
9061; AVX512DQ-BW-NEXT:    vpshufb %ymm7, %ymm4, %ymm0
9062; AVX512DQ-BW-NEXT:    vpshufb %xmm5, %xmm11, %xmm1
9063; AVX512DQ-BW-NEXT:    vpshufb %xmm14, %xmm6, %xmm4
9064; AVX512DQ-BW-NEXT:    vpor %xmm1, %xmm4, %xmm1
9065; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
9066; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm1, %ymm0 {%k2}
9067; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
9068; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm0, %zmm3 {%k2}
9069; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, (%rsi)
9070; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, (%rdx)
9071; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm12, (%rcx)
9072; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm13, (%r8)
9073; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm10, (%r9)
9074; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, (%rax)
9075; AVX512DQ-BW-NEXT:    vzeroupper
9076; AVX512DQ-BW-NEXT:    retq
9077;
9078; AVX512DQ-BW-FCP-LABEL: load_i8_stride6_vf64:
9079; AVX512DQ-BW-FCP:       # %bb.0:
9080; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
9081; AVX512DQ-BW-FCP-NEXT:    vmovdqa 224(%rdi), %ymm0
9082; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm23
9083; AVX512DQ-BW-FCP-NEXT:    movw $18724, %r10w # imm = 0x4924
9084; AVX512DQ-BW-FCP-NEXT:    kmovd %r10d, %k1
9085; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm0, %ymm23, %ymm9 {%k1}
9086; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
9087; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm2, %xmm9, %xmm1
9088; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u]
9089; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm12
9090; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm4, %xmm12, %xmm3
9091; AVX512DQ-BW-FCP-NEXT:    vpor %xmm1, %xmm3, %xmm5
9092; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %ymm10
9093; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm3
9094; AVX512DQ-BW-FCP-NEXT:    vmovdqa 64(%rdi), %ymm6
9095; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %ymm26
9096; AVX512DQ-BW-FCP-NEXT:    vmovdqa 160(%rdi), %ymm1
9097; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm26, %ymm1, %ymm15 {%k1}
9098; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm15, %xmm16
9099; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm17 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10]
9100; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm17, %xmm16, %xmm11
9101; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm18 = [u,u,u,u,u,u,4,10,128,128,128,2,8,14,128,128]
9102; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm18, %xmm15, %xmm13
9103; AVX512DQ-BW-FCP-NEXT:    vpor %xmm11, %xmm13, %xmm11
9104; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
9105; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm5, %zmm11, %zmm11
9106; AVX512DQ-BW-FCP-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],mem[2,3]
9107; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, 96(%rdi), %ymm6, %ymm13
9108; AVX512DQ-BW-FCP-NEXT:    movw $-28124, %r10w # imm = 0x9224
9109; AVX512DQ-BW-FCP-NEXT:    kmovd %r10d, %k4
9110; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm5, %ymm13, %ymm19 {%k4}
9111; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm3, %ymm10, %ymm20 {%k1}
9112; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm2, %xmm20, %xmm2
9113; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm20, %xmm21
9114; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm4, %xmm21, %xmm4
9115; AVX512DQ-BW-FCP-NEXT:    vpor %xmm2, %xmm4, %xmm2
9116; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10]
9117; AVX512DQ-BW-FCP-NEXT:    movl $4192256, %r10d # imm = 0x3FF800
9118; AVX512DQ-BW-FCP-NEXT:    kmovd %r10d, %k2
9119; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm6, %ymm19, %ymm2 {%k2}
9120; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm11, %zmm2 {%k2}
9121; AVX512DQ-BW-FCP-NEXT:    vmovdqa 256(%rdi), %ymm11
9122; AVX512DQ-BW-FCP-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm11[2,3],mem[2,3]
9123; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, 288(%rdi), %ymm11, %ymm14
9124; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm4, %ymm14, %ymm22 {%k4}
9125; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm6, %ymm22, %ymm7
9126; AVX512DQ-BW-FCP-NEXT:    vmovdqa 320(%rdi), %ymm11
9127; AVX512DQ-BW-FCP-NEXT:    vmovdqa 352(%rdi), %ymm6
9128; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm11, %ymm6, %ymm24 {%k1}
9129; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm24, %xmm25
9130; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm17, %xmm25, %xmm17
9131; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm18, %xmm24, %xmm18
9132; AVX512DQ-BW-FCP-NEXT:    vporq %xmm17, %xmm18, %xmm17
9133; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm17, %ymm0, %ymm8
9134; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm7[0,1,2],ymm8[3,4,5,6,7],ymm7[8,9,10],ymm8[11,12,13,14,15]
9135; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
9136; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm7
9137; AVX512DQ-BW-FCP-NEXT:    movabsq $-8796093022208, %rdi # imm = 0xFFFFF80000000000
9138; AVX512DQ-BW-FCP-NEXT:    kmovq %rdi, %k3
9139; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm7, %zmm2 {%k3}
9140; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u]
9141; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm7, %xmm9, %xmm8
9142; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u]
9143; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm9, %xmm12, %xmm12
9144; AVX512DQ-BW-FCP-NEXT:    vpor %xmm8, %xmm12, %xmm8
9145; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [u,u,u,u,u,u,128,128,1,7,13,128,128,128,5,11]
9146; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm12, %xmm16, %xmm16
9147; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm17 = [u,u,u,u,u,u,5,11,128,128,128,3,9,15,128,128]
9148; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm17, %xmm15, %xmm15
9149; AVX512DQ-BW-FCP-NEXT:    vporq %xmm16, %xmm15, %xmm15
9150; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
9151; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm8, %zmm15, %zmm8
9152; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm7, %xmm20, %xmm7
9153; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm9, %xmm21, %xmm9
9154; AVX512DQ-BW-FCP-NEXT:    vpor %xmm7, %xmm9, %xmm9
9155; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11]
9156; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm7, %ymm19, %ymm9 {%k2}
9157; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm8, %zmm9 {%k2}
9158; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm7, %ymm22, %ymm7
9159; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm12, %xmm25, %xmm8
9160; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm17, %xmm24, %xmm12
9161; AVX512DQ-BW-FCP-NEXT:    vpor %xmm8, %xmm12, %xmm8
9162; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
9163; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm7[0,1,2],ymm8[3,4,5,6,7],ymm7[8,9,10],ymm8[11,12,13,14,15]
9164; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
9165; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm7
9166; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm7, %zmm9 {%k3}
9167; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm13, %ymm5, %ymm15 {%k4}
9168; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u]
9169; AVX512DQ-BW-FCP-NEXT:    movw $9362, %di # imm = 0x2492
9170; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k2
9171; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm10, %ymm3, %ymm8 {%k2}
9172; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm8, %xmm16
9173; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm7, %xmm16, %xmm12
9174; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm17 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u]
9175; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm17, %xmm8, %xmm18
9176; AVX512DQ-BW-FCP-NEXT:    vporq %xmm12, %xmm18, %xmm18
9177; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm19 = [2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12]
9178; AVX512DQ-BW-FCP-NEXT:    movl $2095104, %edi # imm = 0x1FF800
9179; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k5
9180; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm19, %ymm15, %ymm18 {%k5}
9181; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm23, %ymm0, %ymm20 {%k2}
9182; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm20, %xmm21
9183; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm7, %xmm21, %xmm7
9184; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm17, %xmm20, %xmm12
9185; AVX512DQ-BW-FCP-NEXT:    vpor %xmm7, %xmm12, %xmm7
9186; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm1, %ymm26, %ymm17 {%k1}
9187; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm22 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128]
9188; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm22, %xmm17, %xmm12
9189; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm17, %xmm24
9190; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm25 = [u,u,u,u,u,128,128,128,2,8,14,128,128,0,6,12]
9191; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm25, %xmm24, %xmm27
9192; AVX512DQ-BW-FCP-NEXT:    vporq %xmm12, %xmm27, %xmm12
9193; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
9194; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm7, %zmm12, %zmm12
9195; AVX512DQ-BW-FCP-NEXT:    movl $2097151, %edi # imm = 0x1FFFFF
9196; AVX512DQ-BW-FCP-NEXT:    kmovq %rdi, %k6
9197; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm18, %zmm12 {%k6}
9198; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm14, %ymm4, %ymm7 {%k4}
9199; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm6, %ymm11, %ymm18 {%k1}
9200; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm22, %xmm18, %xmm22
9201; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm18, %xmm27
9202; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm25, %xmm27, %xmm25
9203; AVX512DQ-BW-FCP-NEXT:    vporq %xmm22, %xmm25, %xmm22
9204; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm22, %ymm0, %ymm22
9205; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm19, %ymm7, %ymm22 {%k5}
9206; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm22, %zmm0, %zmm19
9207; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm19, %zmm12 {%k3}
9208; AVX512DQ-BW-FCP-NEXT:    movw $9289, %di # imm = 0x2449
9209; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k4
9210; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm14, %ymm4 {%k4}
9211; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm13, %ymm5 {%k4}
9212; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u]
9213; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm13, %xmm16, %xmm14
9214; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm16 = [3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u]
9215; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm16, %xmm8, %xmm8
9216; AVX512DQ-BW-FCP-NEXT:    vpor %xmm14, %xmm8, %xmm8
9217; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm14 = [3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13]
9218; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm14, %ymm15, %ymm8 {%k5}
9219; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm13, %xmm21, %xmm13
9220; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm16, %xmm20, %xmm15
9221; AVX512DQ-BW-FCP-NEXT:    vpor %xmm13, %xmm15, %xmm13
9222; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [u,u,u,u,u,1,7,13,128,128,128,5,11,128,128,128]
9223; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm15, %xmm17, %xmm16
9224; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm17 = [u,u,u,u,u,128,128,128,3,9,15,128,128,1,7,13]
9225; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm17, %xmm24, %xmm19
9226; AVX512DQ-BW-FCP-NEXT:    vporq %xmm16, %xmm19, %xmm16
9227; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm16, %ymm0, %ymm16
9228; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm13, %zmm16, %zmm13
9229; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm8, %zmm13 {%k6}
9230; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm15, %xmm18, %xmm8
9231; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm17, %xmm27, %xmm15
9232; AVX512DQ-BW-FCP-NEXT:    vpor %xmm8, %xmm15, %xmm8
9233; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
9234; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm14, %ymm7, %ymm8 {%k5}
9235; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm7
9236; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm7, %zmm13 {%k3}
9237; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14]
9238; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm7, %ymm5, %ymm8
9239; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm14 = [128,128,0,6,12,128,128,128,4,10,u,u,u,u,u,u]
9240; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm10, %ymm3 {%k1}
9241; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm15
9242; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm14, %xmm15, %xmm10
9243; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm16 = [4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u]
9244; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm16, %xmm3, %xmm17
9245; AVX512DQ-BW-FCP-NEXT:    vporq %xmm10, %xmm17, %xmm10
9246; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm8[5,6,7]
9247; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm8[4,5,6,7]
9248; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm23, %ymm0 {%k1}
9249; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm8
9250; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm14, %xmm8, %xmm14
9251; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm16, %xmm0, %xmm16
9252; AVX512DQ-BW-FCP-NEXT:    vporq %xmm14, %xmm16, %xmm14
9253; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm26, %ymm1 {%k2}
9254; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm1, %xmm16
9255; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm17 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14]
9256; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm17, %xmm16, %xmm18
9257; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm19 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128]
9258; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm19, %xmm1, %xmm20
9259; AVX512DQ-BW-FCP-NEXT:    vporq %xmm18, %xmm20, %xmm18
9260; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm18, %ymm0, %ymm18
9261; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm14, %zmm18, %zmm14
9262; AVX512DQ-BW-FCP-NEXT:    movabsq $4398044413952, %rdi # imm = 0x3FFFFE00000
9263; AVX512DQ-BW-FCP-NEXT:    kmovq %rdi, %k1
9264; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm10 {%k1}
9265; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm7, %ymm4, %ymm7
9266; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm11, %ymm6 {%k2}
9267; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm11
9268; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm17, %xmm11, %xmm14
9269; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm19, %xmm6, %xmm17
9270; AVX512DQ-BW-FCP-NEXT:    vporq %xmm14, %xmm17, %xmm14
9271; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
9272; AVX512DQ-BW-FCP-NEXT:    movl $-2097152, %edi # imm = 0xFFE00000
9273; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k2
9274; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm14, %ymm7 {%k2}
9275; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm7
9276; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm7, %zmm10 {%k2}
9277; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15]
9278; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm7, %ymm5, %ymm5
9279; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm14 = [128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u]
9280; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm14, %xmm15, %xmm15
9281; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm17 = [5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u]
9282; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm17, %xmm3, %xmm3
9283; AVX512DQ-BW-FCP-NEXT:    vpor %xmm3, %xmm15, %xmm3
9284; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7]
9285; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7]
9286; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm14, %xmm8, %xmm5
9287; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm17, %xmm0, %xmm0
9288; AVX512DQ-BW-FCP-NEXT:    vpor %xmm5, %xmm0, %xmm0
9289; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,128,128,128,5,11,128,128,128,3,9,15]
9290; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm5, %xmm16, %xmm8
9291; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm14 = [u,u,u,u,u,3,9,15,128,128,1,7,13,128,128,128]
9292; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm14, %xmm1, %xmm1
9293; AVX512DQ-BW-FCP-NEXT:    vpor %xmm1, %xmm8, %xmm1
9294; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
9295; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
9296; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm3 {%k1}
9297; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm7, %ymm4, %ymm0
9298; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm5, %xmm11, %xmm1
9299; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm14, %xmm6, %xmm4
9300; AVX512DQ-BW-FCP-NEXT:    vpor %xmm1, %xmm4, %xmm1
9301; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
9302; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm1, %ymm0 {%k2}
9303; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
9304; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm0, %zmm3 {%k2}
9305; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, (%rsi)
9306; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, (%rdx)
9307; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm12, (%rcx)
9308; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm13, (%r8)
9309; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, (%r9)
9310; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, (%rax)
9311; AVX512DQ-BW-FCP-NEXT:    vzeroupper
9312; AVX512DQ-BW-FCP-NEXT:    retq
9313  %wide.vec = load <384 x i8>, ptr %in.vec, align 64
9314  %strided.vec0 = shufflevector <384 x i8> %wide.vec, <384 x i8> poison, <64 x i32> <i32 0, i32 6, i32 12, i32 18, i32 24, i32 30, i32 36, i32 42, i32 48, i32 54, i32 60, i32 66, i32 72, i32 78, i32 84, i32 90, i32 96, i32 102, i32 108, i32 114, i32 120, i32 126, i32 132, i32 138, i32 144, i32 150, i32 156, i32 162, i32 168, i32 174, i32 180, i32 186, i32 192, i32 198, i32 204, i32 210, i32 216, i32 222, i32 228, i32 234, i32 240, i32 246, i32 252, i32 258, i32 264, i32 270, i32 276, i32 282, i32 288, i32 294, i32 300, i32 306, i32 312, i32 318, i32 324, i32 330, i32 336, i32 342, i32 348, i32 354, i32 360, i32 366, i32 372, i32 378>
9315  %strided.vec1 = shufflevector <384 x i8> %wide.vec, <384 x i8> poison, <64 x i32> <i32 1, i32 7, i32 13, i32 19, i32 25, i32 31, i32 37, i32 43, i32 49, i32 55, i32 61, i32 67, i32 73, i32 79, i32 85, i32 91, i32 97, i32 103, i32 109, i32 115, i32 121, i32 127, i32 133, i32 139, i32 145, i32 151, i32 157, i32 163, i32 169, i32 175, i32 181, i32 187, i32 193, i32 199, i32 205, i32 211, i32 217, i32 223, i32 229, i32 235, i32 241, i32 247, i32 253, i32 259, i32 265, i32 271, i32 277, i32 283, i32 289, i32 295, i32 301, i32 307, i32 313, i32 319, i32 325, i32 331, i32 337, i32 343, i32 349, i32 355, i32 361, i32 367, i32 373, i32 379>
9316  %strided.vec2 = shufflevector <384 x i8> %wide.vec, <384 x i8> poison, <64 x i32> <i32 2, i32 8, i32 14, i32 20, i32 26, i32 32, i32 38, i32 44, i32 50, i32 56, i32 62, i32 68, i32 74, i32 80, i32 86, i32 92, i32 98, i32 104, i32 110, i32 116, i32 122, i32 128, i32 134, i32 140, i32 146, i32 152, i32 158, i32 164, i32 170, i32 176, i32 182, i32 188, i32 194, i32 200, i32 206, i32 212, i32 218, i32 224, i32 230, i32 236, i32 242, i32 248, i32 254, i32 260, i32 266, i32 272, i32 278, i32 284, i32 290, i32 296, i32 302, i32 308, i32 314, i32 320, i32 326, i32 332, i32 338, i32 344, i32 350, i32 356, i32 362, i32 368, i32 374, i32 380>
9317  %strided.vec3 = shufflevector <384 x i8> %wide.vec, <384 x i8> poison, <64 x i32> <i32 3, i32 9, i32 15, i32 21, i32 27, i32 33, i32 39, i32 45, i32 51, i32 57, i32 63, i32 69, i32 75, i32 81, i32 87, i32 93, i32 99, i32 105, i32 111, i32 117, i32 123, i32 129, i32 135, i32 141, i32 147, i32 153, i32 159, i32 165, i32 171, i32 177, i32 183, i32 189, i32 195, i32 201, i32 207, i32 213, i32 219, i32 225, i32 231, i32 237, i32 243, i32 249, i32 255, i32 261, i32 267, i32 273, i32 279, i32 285, i32 291, i32 297, i32 303, i32 309, i32 315, i32 321, i32 327, i32 333, i32 339, i32 345, i32 351, i32 357, i32 363, i32 369, i32 375, i32 381>
9318  %strided.vec4 = shufflevector <384 x i8> %wide.vec, <384 x i8> poison, <64 x i32> <i32 4, i32 10, i32 16, i32 22, i32 28, i32 34, i32 40, i32 46, i32 52, i32 58, i32 64, i32 70, i32 76, i32 82, i32 88, i32 94, i32 100, i32 106, i32 112, i32 118, i32 124, i32 130, i32 136, i32 142, i32 148, i32 154, i32 160, i32 166, i32 172, i32 178, i32 184, i32 190, i32 196, i32 202, i32 208, i32 214, i32 220, i32 226, i32 232, i32 238, i32 244, i32 250, i32 256, i32 262, i32 268, i32 274, i32 280, i32 286, i32 292, i32 298, i32 304, i32 310, i32 316, i32 322, i32 328, i32 334, i32 340, i32 346, i32 352, i32 358, i32 364, i32 370, i32 376, i32 382>
9319  %strided.vec5 = shufflevector <384 x i8> %wide.vec, <384 x i8> poison, <64 x i32> <i32 5, i32 11, i32 17, i32 23, i32 29, i32 35, i32 41, i32 47, i32 53, i32 59, i32 65, i32 71, i32 77, i32 83, i32 89, i32 95, i32 101, i32 107, i32 113, i32 119, i32 125, i32 131, i32 137, i32 143, i32 149, i32 155, i32 161, i32 167, i32 173, i32 179, i32 185, i32 191, i32 197, i32 203, i32 209, i32 215, i32 221, i32 227, i32 233, i32 239, i32 245, i32 251, i32 257, i32 263, i32 269, i32 275, i32 281, i32 287, i32 293, i32 299, i32 305, i32 311, i32 317, i32 323, i32 329, i32 335, i32 341, i32 347, i32 353, i32 359, i32 365, i32 371, i32 377, i32 383>
9320  store <64 x i8> %strided.vec0, ptr %out.vec0, align 64
9321  store <64 x i8> %strided.vec1, ptr %out.vec1, align 64
9322  store <64 x i8> %strided.vec2, ptr %out.vec2, align 64
9323  store <64 x i8> %strided.vec3, ptr %out.vec3, align 64
9324  store <64 x i8> %strided.vec4, ptr %out.vec4, align 64
9325  store <64 x i8> %strided.vec5, ptr %out.vec5, align 64
9326  ret void
9327}
9328