xref: /llvm-project/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll (revision 8ac00ca4867835cacaf013f5c442658b9b1bce38)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
3; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx  | FileCheck %s --check-prefixes=AVX
4; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
5; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
6; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
7; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
8; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
9; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
10; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
11; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
12; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
13; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
14; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP
15
16; These patterns are produced by LoopVectorizer for interleaved loads.
17
18define void @load_i8_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
19; SSE-LABEL: load_i8_stride7_vf2:
20; SSE:       # %bb.0:
21; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
22; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r10
23; SSE-NEXT:    movdqa (%rdi), %xmm3
24; SSE-NEXT:    pxor %xmm4, %xmm4
25; SSE-NEXT:    movdqa %xmm3, %xmm2
26; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
27; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,3,2,3]
28; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7]
29; SSE-NEXT:    packuswb %xmm1, %xmm1
30; SSE-NEXT:    movdqa %xmm2, %xmm0
31; SSE-NEXT:    psrld $16, %xmm0
32; SSE-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
33; SSE-NEXT:    movdqa %xmm0, %xmm4
34; SSE-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
35; SSE-NEXT:    packuswb %xmm4, %xmm4
36; SSE-NEXT:    movdqa %xmm2, %xmm6
37; SSE-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
38; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[2,1,2,3]
39; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7]
40; SSE-NEXT:    packuswb %xmm5, %xmm5
41; SSE-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,6,7]
42; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[2,2,2,2]
43; SSE-NEXT:    packuswb %xmm6, %xmm6
44; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[2,3,2,3]
45; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
46; SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
47; SSE-NEXT:    psrlq $48, %xmm3
48; SSE-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
49; SSE-NEXT:    packuswb %xmm7, %xmm7
50; SSE-NEXT:    packuswb %xmm0, %xmm0
51; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
52; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
53; SSE-NEXT:    packuswb %xmm2, %xmm2
54; SSE-NEXT:    movd %xmm1, %edi
55; SSE-NEXT:    movw %di, (%rsi)
56; SSE-NEXT:    movd %xmm4, %esi
57; SSE-NEXT:    movw %si, (%rdx)
58; SSE-NEXT:    movd %xmm5, %edx
59; SSE-NEXT:    movw %dx, (%rcx)
60; SSE-NEXT:    movd %xmm6, %ecx
61; SSE-NEXT:    movw %cx, (%r8)
62; SSE-NEXT:    movd %xmm7, %ecx
63; SSE-NEXT:    movw %cx, (%r9)
64; SSE-NEXT:    movd %xmm0, %ecx
65; SSE-NEXT:    movw %cx, (%r10)
66; SSE-NEXT:    movd %xmm2, %ecx
67; SSE-NEXT:    movw %cx, (%rax)
68; SSE-NEXT:    retq
69;
70; AVX-LABEL: load_i8_stride7_vf2:
71; AVX:       # %bb.0:
72; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
73; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %r10
74; AVX-NEXT:    vmovdqa (%rdi), %xmm0
75; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
76; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
77; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
78; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
79; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
80; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
81; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
82; AVX-NEXT:    vpextrw $0, %xmm1, (%rsi)
83; AVX-NEXT:    vpextrw $0, %xmm2, (%rdx)
84; AVX-NEXT:    vpextrw $0, %xmm3, (%rcx)
85; AVX-NEXT:    vpextrw $0, %xmm4, (%r8)
86; AVX-NEXT:    vpextrw $0, %xmm5, (%r9)
87; AVX-NEXT:    vpextrw $0, %xmm6, (%r10)
88; AVX-NEXT:    vpextrw $0, %xmm0, (%rax)
89; AVX-NEXT:    retq
90;
91; AVX2-LABEL: load_i8_stride7_vf2:
92; AVX2:       # %bb.0:
93; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
94; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
95; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
96; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
97; AVX2-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
98; AVX2-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
99; AVX2-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
100; AVX2-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
101; AVX2-NEXT:    vpshufb {{.*#+}} xmm6 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
102; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
103; AVX2-NEXT:    vpextrw $0, %xmm1, (%rsi)
104; AVX2-NEXT:    vpextrw $0, %xmm2, (%rdx)
105; AVX2-NEXT:    vpextrw $0, %xmm3, (%rcx)
106; AVX2-NEXT:    vpextrw $0, %xmm4, (%r8)
107; AVX2-NEXT:    vpextrw $0, %xmm5, (%r9)
108; AVX2-NEXT:    vpextrw $0, %xmm6, (%r10)
109; AVX2-NEXT:    vpextrw $0, %xmm0, (%rax)
110; AVX2-NEXT:    retq
111;
112; AVX2-FP-LABEL: load_i8_stride7_vf2:
113; AVX2-FP:       # %bb.0:
114; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
115; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
116; AVX2-FP-NEXT:    vmovdqa (%rdi), %xmm0
117; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
118; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
119; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
120; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
121; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
122; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
123; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
124; AVX2-FP-NEXT:    vpextrw $0, %xmm1, (%rsi)
125; AVX2-FP-NEXT:    vpextrw $0, %xmm2, (%rdx)
126; AVX2-FP-NEXT:    vpextrw $0, %xmm3, (%rcx)
127; AVX2-FP-NEXT:    vpextrw $0, %xmm4, (%r8)
128; AVX2-FP-NEXT:    vpextrw $0, %xmm5, (%r9)
129; AVX2-FP-NEXT:    vpextrw $0, %xmm6, (%r10)
130; AVX2-FP-NEXT:    vpextrw $0, %xmm0, (%rax)
131; AVX2-FP-NEXT:    retq
132;
133; AVX2-FCP-LABEL: load_i8_stride7_vf2:
134; AVX2-FCP:       # %bb.0:
135; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
136; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
137; AVX2-FCP-NEXT:    vmovdqa (%rdi), %xmm0
138; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
139; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
140; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
141; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
142; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
143; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
144; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
145; AVX2-FCP-NEXT:    vpextrw $0, %xmm1, (%rsi)
146; AVX2-FCP-NEXT:    vpextrw $0, %xmm2, (%rdx)
147; AVX2-FCP-NEXT:    vpextrw $0, %xmm3, (%rcx)
148; AVX2-FCP-NEXT:    vpextrw $0, %xmm4, (%r8)
149; AVX2-FCP-NEXT:    vpextrw $0, %xmm5, (%r9)
150; AVX2-FCP-NEXT:    vpextrw $0, %xmm6, (%r10)
151; AVX2-FCP-NEXT:    vpextrw $0, %xmm0, (%rax)
152; AVX2-FCP-NEXT:    retq
153;
154; AVX512-LABEL: load_i8_stride7_vf2:
155; AVX512:       # %bb.0:
156; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
157; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
158; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
159; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
160; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
161; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
162; AVX512-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
163; AVX512-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
164; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
165; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
166; AVX512-NEXT:    vpextrw $0, %xmm1, (%rsi)
167; AVX512-NEXT:    vpextrw $0, %xmm2, (%rdx)
168; AVX512-NEXT:    vpextrw $0, %xmm3, (%rcx)
169; AVX512-NEXT:    vpextrw $0, %xmm4, (%r8)
170; AVX512-NEXT:    vpextrw $0, %xmm5, (%r9)
171; AVX512-NEXT:    vpextrw $0, %xmm6, (%r10)
172; AVX512-NEXT:    vpextrw $0, %xmm0, (%rax)
173; AVX512-NEXT:    retq
174;
175; AVX512-FCP-LABEL: load_i8_stride7_vf2:
176; AVX512-FCP:       # %bb.0:
177; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
178; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
179; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm0
180; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
181; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
182; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
183; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
184; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
185; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
186; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
187; AVX512-FCP-NEXT:    vpextrw $0, %xmm1, (%rsi)
188; AVX512-FCP-NEXT:    vpextrw $0, %xmm2, (%rdx)
189; AVX512-FCP-NEXT:    vpextrw $0, %xmm3, (%rcx)
190; AVX512-FCP-NEXT:    vpextrw $0, %xmm4, (%r8)
191; AVX512-FCP-NEXT:    vpextrw $0, %xmm5, (%r9)
192; AVX512-FCP-NEXT:    vpextrw $0, %xmm6, (%r10)
193; AVX512-FCP-NEXT:    vpextrw $0, %xmm0, (%rax)
194; AVX512-FCP-NEXT:    retq
195;
196; AVX512DQ-LABEL: load_i8_stride7_vf2:
197; AVX512DQ:       # %bb.0:
198; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
199; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %r10
200; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
201; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
202; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
203; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
204; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
205; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
206; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm6 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
207; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
208; AVX512DQ-NEXT:    vpextrw $0, %xmm1, (%rsi)
209; AVX512DQ-NEXT:    vpextrw $0, %xmm2, (%rdx)
210; AVX512DQ-NEXT:    vpextrw $0, %xmm3, (%rcx)
211; AVX512DQ-NEXT:    vpextrw $0, %xmm4, (%r8)
212; AVX512DQ-NEXT:    vpextrw $0, %xmm5, (%r9)
213; AVX512DQ-NEXT:    vpextrw $0, %xmm6, (%r10)
214; AVX512DQ-NEXT:    vpextrw $0, %xmm0, (%rax)
215; AVX512DQ-NEXT:    retq
216;
217; AVX512DQ-FCP-LABEL: load_i8_stride7_vf2:
218; AVX512DQ-FCP:       # %bb.0:
219; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
220; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
221; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm0
222; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
223; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
224; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
225; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
226; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
227; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
228; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
229; AVX512DQ-FCP-NEXT:    vpextrw $0, %xmm1, (%rsi)
230; AVX512DQ-FCP-NEXT:    vpextrw $0, %xmm2, (%rdx)
231; AVX512DQ-FCP-NEXT:    vpextrw $0, %xmm3, (%rcx)
232; AVX512DQ-FCP-NEXT:    vpextrw $0, %xmm4, (%r8)
233; AVX512DQ-FCP-NEXT:    vpextrw $0, %xmm5, (%r9)
234; AVX512DQ-FCP-NEXT:    vpextrw $0, %xmm6, (%r10)
235; AVX512DQ-FCP-NEXT:    vpextrw $0, %xmm0, (%rax)
236; AVX512DQ-FCP-NEXT:    retq
237;
238; AVX512BW-LABEL: load_i8_stride7_vf2:
239; AVX512BW:       # %bb.0:
240; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
241; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
242; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
243; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
244; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
245; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
246; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
247; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
248; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm6 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
249; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
250; AVX512BW-NEXT:    vpextrw $0, %xmm1, (%rsi)
251; AVX512BW-NEXT:    vpextrw $0, %xmm2, (%rdx)
252; AVX512BW-NEXT:    vpextrw $0, %xmm3, (%rcx)
253; AVX512BW-NEXT:    vpextrw $0, %xmm4, (%r8)
254; AVX512BW-NEXT:    vpextrw $0, %xmm5, (%r9)
255; AVX512BW-NEXT:    vpextrw $0, %xmm6, (%r10)
256; AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rax)
257; AVX512BW-NEXT:    retq
258;
259; AVX512BW-FCP-LABEL: load_i8_stride7_vf2:
260; AVX512BW-FCP:       # %bb.0:
261; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
262; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
263; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
264; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
265; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
266; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
267; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
268; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
269; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
270; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
271; AVX512BW-FCP-NEXT:    vpextrw $0, %xmm1, (%rsi)
272; AVX512BW-FCP-NEXT:    vpextrw $0, %xmm2, (%rdx)
273; AVX512BW-FCP-NEXT:    vpextrw $0, %xmm3, (%rcx)
274; AVX512BW-FCP-NEXT:    vpextrw $0, %xmm4, (%r8)
275; AVX512BW-FCP-NEXT:    vpextrw $0, %xmm5, (%r9)
276; AVX512BW-FCP-NEXT:    vpextrw $0, %xmm6, (%r10)
277; AVX512BW-FCP-NEXT:    vpextrw $0, %xmm0, (%rax)
278; AVX512BW-FCP-NEXT:    retq
279;
280; AVX512DQ-BW-LABEL: load_i8_stride7_vf2:
281; AVX512DQ-BW:       # %bb.0:
282; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
283; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
284; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %xmm0
285; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
286; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
287; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
288; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
289; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
290; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm6 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
291; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
292; AVX512DQ-BW-NEXT:    vpextrw $0, %xmm1, (%rsi)
293; AVX512DQ-BW-NEXT:    vpextrw $0, %xmm2, (%rdx)
294; AVX512DQ-BW-NEXT:    vpextrw $0, %xmm3, (%rcx)
295; AVX512DQ-BW-NEXT:    vpextrw $0, %xmm4, (%r8)
296; AVX512DQ-BW-NEXT:    vpextrw $0, %xmm5, (%r9)
297; AVX512DQ-BW-NEXT:    vpextrw $0, %xmm6, (%r10)
298; AVX512DQ-BW-NEXT:    vpextrw $0, %xmm0, (%rax)
299; AVX512DQ-BW-NEXT:    retq
300;
301; AVX512DQ-BW-FCP-LABEL: load_i8_stride7_vf2:
302; AVX512DQ-BW-FCP:       # %bb.0:
303; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
304; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
305; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
306; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
307; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
308; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
309; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
310; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
311; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
312; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
313; AVX512DQ-BW-FCP-NEXT:    vpextrw $0, %xmm1, (%rsi)
314; AVX512DQ-BW-FCP-NEXT:    vpextrw $0, %xmm2, (%rdx)
315; AVX512DQ-BW-FCP-NEXT:    vpextrw $0, %xmm3, (%rcx)
316; AVX512DQ-BW-FCP-NEXT:    vpextrw $0, %xmm4, (%r8)
317; AVX512DQ-BW-FCP-NEXT:    vpextrw $0, %xmm5, (%r9)
318; AVX512DQ-BW-FCP-NEXT:    vpextrw $0, %xmm6, (%r10)
319; AVX512DQ-BW-FCP-NEXT:    vpextrw $0, %xmm0, (%rax)
320; AVX512DQ-BW-FCP-NEXT:    retq
321  %wide.vec = load <14 x i8>, ptr %in.vec, align 64
322  %strided.vec0 = shufflevector <14 x i8> %wide.vec, <14 x i8> poison, <2 x i32> <i32 0, i32 7>
323  %strided.vec1 = shufflevector <14 x i8> %wide.vec, <14 x i8> poison, <2 x i32> <i32 1, i32 8>
324  %strided.vec2 = shufflevector <14 x i8> %wide.vec, <14 x i8> poison, <2 x i32> <i32 2, i32 9>
325  %strided.vec3 = shufflevector <14 x i8> %wide.vec, <14 x i8> poison, <2 x i32> <i32 3, i32 10>
326  %strided.vec4 = shufflevector <14 x i8> %wide.vec, <14 x i8> poison, <2 x i32> <i32 4, i32 11>
327  %strided.vec5 = shufflevector <14 x i8> %wide.vec, <14 x i8> poison, <2 x i32> <i32 5, i32 12>
328  %strided.vec6 = shufflevector <14 x i8> %wide.vec, <14 x i8> poison, <2 x i32> <i32 6, i32 13>
329  store <2 x i8> %strided.vec0, ptr %out.vec0, align 64
330  store <2 x i8> %strided.vec1, ptr %out.vec1, align 64
331  store <2 x i8> %strided.vec2, ptr %out.vec2, align 64
332  store <2 x i8> %strided.vec3, ptr %out.vec3, align 64
333  store <2 x i8> %strided.vec4, ptr %out.vec4, align 64
334  store <2 x i8> %strided.vec5, ptr %out.vec5, align 64
335  store <2 x i8> %strided.vec6, ptr %out.vec6, align 64
336  ret void
337}
338
339define void @load_i8_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
340; SSE-LABEL: load_i8_stride7_vf4:
341; SSE:       # %bb.0:
342; SSE-NEXT:    movdqa (%rdi), %xmm4
343; SSE-NEXT:    movdqa 16(%rdi), %xmm0
344; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,65535,65535,65535]
345; SSE-NEXT:    movdqa %xmm4, %xmm1
346; SSE-NEXT:    pand %xmm3, %xmm1
347; SSE-NEXT:    pandn %xmm0, %xmm3
348; SSE-NEXT:    por %xmm1, %xmm3
349; SSE-NEXT:    pxor %xmm1, %xmm1
350; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
351; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,65535,0,65535]
352; SSE-NEXT:    pand %xmm2, %xmm3
353; SSE-NEXT:    movdqa %xmm0, %xmm5
354; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[1,0],xmm4[0,0]
355; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[2,3]
356; SSE-NEXT:    movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,0,65535,65535,65535]
357; SSE-NEXT:    movdqa %xmm0, %xmm8
358; SSE-NEXT:    pand %xmm7, %xmm8
359; SSE-NEXT:    pandn %xmm4, %xmm7
360; SSE-NEXT:    movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,65535,0,65535,65535]
361; SSE-NEXT:    movdqa %xmm0, %xmm9
362; SSE-NEXT:    pand %xmm6, %xmm9
363; SSE-NEXT:    pandn %xmm4, %xmm6
364; SSE-NEXT:    movdqa {{.*#+}} xmm13 = [65535,65535,0,65535,65535,0,65535,65535]
365; SSE-NEXT:    movdqa %xmm0, %xmm14
366; SSE-NEXT:    pand %xmm13, %xmm14
367; SSE-NEXT:    pandn %xmm4, %xmm13
368; SSE-NEXT:    pshufd {{.*#+}} xmm12 = xmm4[1,3,2,3]
369; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm0[0,2,2,3]
370; SSE-NEXT:    movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,65535,65535,0,65535]
371; SSE-NEXT:    pand %xmm11, %xmm0
372; SSE-NEXT:    pandn %xmm4, %xmm11
373; SSE-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
374; SSE-NEXT:    pandn %xmm4, %xmm2
375; SSE-NEXT:    por %xmm3, %xmm2
376; SSE-NEXT:    movdqa {{.*#+}} xmm15 = [0,65535,65535,65535,65535,65535,65535,0]
377; SSE-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
378; SSE-NEXT:    pand %xmm15, %xmm5
379; SSE-NEXT:    pandn %xmm4, %xmm15
380; SSE-NEXT:    por %xmm5, %xmm15
381; SSE-NEXT:    por %xmm8, %xmm7
382; SSE-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3],xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7]
383; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm7[0,2,2,3,4,5,6,7]
384; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
385; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm3[1,1,0,3,4,5,6,7]
386; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535]
387; SSE-NEXT:    pand %xmm3, %xmm5
388; SSE-NEXT:    pandn %xmm4, %xmm3
389; SSE-NEXT:    por %xmm5, %xmm3
390; SSE-NEXT:    por %xmm9, %xmm6
391; SSE-NEXT:    movdqa %xmm6, %xmm4
392; SSE-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
393; SSE-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3],xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7]
394; SSE-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
395; SSE-NEXT:    por %xmm14, %xmm13
396; SSE-NEXT:    movdqa %xmm13, %xmm4
397; SSE-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
398; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm4[3,1,2,3,4,5,6,7]
399; SSE-NEXT:    punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3],xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7]
400; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm13[2,1,2,3]
401; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
402; SSE-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
403; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
404; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
405; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3]
406; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
407; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
408; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7]
409; SSE-NEXT:    packuswb %xmm2, %xmm2
410; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm15[0,3,2,3]
411; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[1,0,3,2,4,5,6,7]
412; SSE-NEXT:    packuswb %xmm5, %xmm5
413; SSE-NEXT:    packuswb %xmm3, %xmm3
414; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7]
415; SSE-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,6,7]
416; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[2,0,2,3]
417; SSE-NEXT:    packuswb %xmm6, %xmm6
418; SSE-NEXT:    packuswb %xmm4, %xmm4
419; SSE-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1]
420; SSE-NEXT:    movdqa %xmm10, %xmm7
421; SSE-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15]
422; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
423; SSE-NEXT:    pshuflw {{.*#+}} xmm7 = xmm7[0,2,2,3,4,5,6,7]
424; SSE-NEXT:    punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7]
425; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm10[2,1,2,3]
426; SSE-NEXT:    pshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7]
427; SSE-NEXT:    punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
428; SSE-NEXT:    packuswb %xmm8, %xmm8
429; SSE-NEXT:    por %xmm0, %xmm11
430; SSE-NEXT:    movdqa %xmm11, %xmm0
431; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
432; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7]
433; SSE-NEXT:    punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm1[8],xmm11[9],xmm1[9],xmm11[10],xmm1[10],xmm11[11],xmm1[11],xmm11[12],xmm1[12],xmm11[13],xmm1[13],xmm11[14],xmm1[14],xmm11[15],xmm1[15]
434; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm11[0,1,2,1]
435; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7]
436; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
437; SSE-NEXT:    packuswb %xmm0, %xmm0
438; SSE-NEXT:    movd %xmm2, (%rsi)
439; SSE-NEXT:    movd %xmm5, (%rdx)
440; SSE-NEXT:    movd %xmm3, (%rcx)
441; SSE-NEXT:    movd %xmm6, (%r8)
442; SSE-NEXT:    movd %xmm4, (%r9)
443; SSE-NEXT:    movd %xmm8, (%rdi)
444; SSE-NEXT:    movd %xmm0, (%rax)
445; SSE-NEXT:    retq
446;
447; AVX-LABEL: load_i8_stride7_vf4:
448; AVX:       # %bb.0:
449; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
450; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %r10
451; AVX-NEXT:    vmovdqa (%rdi), %xmm0
452; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
453; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
454; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
455; AVX-NEXT:    vpor %xmm2, %xmm3, %xmm2
456; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u]
457; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
458; AVX-NEXT:    vpor %xmm3, %xmm4, %xmm3
459; AVX-NEXT:    vmovd {{.*#+}} xmm4 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
460; AVX-NEXT:    vpshufb %xmm4, %xmm0, %xmm5
461; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
462; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
463; AVX-NEXT:    vmovd {{.*#+}} xmm6 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
464; AVX-NEXT:    vpshufb %xmm6, %xmm0, %xmm7
465; AVX-NEXT:    vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
466; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
467; AVX-NEXT:    vpshufb %xmm4, %xmm1, %xmm4
468; AVX-NEXT:    vmovd {{.*#+}} xmm8 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
469; AVX-NEXT:    vpshufb %xmm8, %xmm0, %xmm9
470; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
471; AVX-NEXT:    vpshufb %xmm6, %xmm1, %xmm6
472; AVX-NEXT:    vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
473; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
474; AVX-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
475; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
476; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
477; AVX-NEXT:    vmovd %xmm2, (%rsi)
478; AVX-NEXT:    vmovd %xmm3, (%rdx)
479; AVX-NEXT:    vmovd %xmm5, (%rcx)
480; AVX-NEXT:    vmovd %xmm7, (%r8)
481; AVX-NEXT:    vmovd %xmm4, (%r9)
482; AVX-NEXT:    vmovd %xmm6, (%r10)
483; AVX-NEXT:    vmovd %xmm0, (%rax)
484; AVX-NEXT:    retq
485;
486; AVX2-LABEL: load_i8_stride7_vf4:
487; AVX2:       # %bb.0:
488; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
489; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
490; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
491; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
492; AVX2-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
493; AVX2-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
494; AVX2-NEXT:    vpor %xmm2, %xmm3, %xmm2
495; AVX2-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u]
496; AVX2-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
497; AVX2-NEXT:    vpor %xmm3, %xmm4, %xmm3
498; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm4 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9]
499; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm5
500; AVX2-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
501; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
502; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10]
503; AVX2-NEXT:    vpshufb %xmm6, %xmm0, %xmm7
504; AVX2-NEXT:    vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
505; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
506; AVX2-NEXT:    vpshufb %xmm4, %xmm1, %xmm4
507; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
508; AVX2-NEXT:    vpshufb %xmm8, %xmm0, %xmm9
509; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
510; AVX2-NEXT:    vpshufb %xmm6, %xmm1, %xmm6
511; AVX2-NEXT:    vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
512; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
513; AVX2-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
514; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
515; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
516; AVX2-NEXT:    vmovd %xmm2, (%rsi)
517; AVX2-NEXT:    vmovd %xmm3, (%rdx)
518; AVX2-NEXT:    vmovd %xmm5, (%rcx)
519; AVX2-NEXT:    vmovd %xmm7, (%r8)
520; AVX2-NEXT:    vmovd %xmm4, (%r9)
521; AVX2-NEXT:    vmovd %xmm6, (%r10)
522; AVX2-NEXT:    vmovd %xmm0, (%rax)
523; AVX2-NEXT:    retq
524;
525; AVX2-FP-LABEL: load_i8_stride7_vf4:
526; AVX2-FP:       # %bb.0:
527; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
528; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
529; AVX2-FP-NEXT:    vmovdqa (%rdi), %xmm0
530; AVX2-FP-NEXT:    vmovdqa 16(%rdi), %xmm1
531; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
532; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
533; AVX2-FP-NEXT:    vpor %xmm2, %xmm3, %xmm2
534; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u]
535; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
536; AVX2-FP-NEXT:    vpor %xmm3, %xmm4, %xmm3
537; AVX2-FP-NEXT:    vpbroadcastw {{.*#+}} xmm4 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9]
538; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm0, %xmm5
539; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
540; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
541; AVX2-FP-NEXT:    vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10]
542; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm0, %xmm7
543; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
544; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
545; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm1, %xmm4
546; AVX2-FP-NEXT:    vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
547; AVX2-FP-NEXT:    vpshufb %xmm8, %xmm0, %xmm9
548; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
549; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm1, %xmm6
550; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
551; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
552; AVX2-FP-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
553; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
554; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
555; AVX2-FP-NEXT:    vmovd %xmm2, (%rsi)
556; AVX2-FP-NEXT:    vmovd %xmm3, (%rdx)
557; AVX2-FP-NEXT:    vmovd %xmm5, (%rcx)
558; AVX2-FP-NEXT:    vmovd %xmm7, (%r8)
559; AVX2-FP-NEXT:    vmovd %xmm4, (%r9)
560; AVX2-FP-NEXT:    vmovd %xmm6, (%r10)
561; AVX2-FP-NEXT:    vmovd %xmm0, (%rax)
562; AVX2-FP-NEXT:    retq
563;
564; AVX2-FCP-LABEL: load_i8_stride7_vf4:
565; AVX2-FCP:       # %bb.0:
566; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
567; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
568; AVX2-FCP-NEXT:    vmovdqa (%rdi), %xmm0
569; AVX2-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
570; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
571; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
572; AVX2-FCP-NEXT:    vpor %xmm2, %xmm3, %xmm2
573; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u]
574; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
575; AVX2-FCP-NEXT:    vpor %xmm3, %xmm4, %xmm3
576; AVX2-FCP-NEXT:    vpbroadcastw {{.*#+}} xmm4 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9]
577; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm0, %xmm5
578; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
579; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
580; AVX2-FCP-NEXT:    vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10]
581; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm0, %xmm7
582; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
583; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
584; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm1, %xmm4
585; AVX2-FCP-NEXT:    vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
586; AVX2-FCP-NEXT:    vpshufb %xmm8, %xmm0, %xmm9
587; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
588; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm1, %xmm6
589; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
590; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
591; AVX2-FCP-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
592; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
593; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
594; AVX2-FCP-NEXT:    vmovd %xmm2, (%rsi)
595; AVX2-FCP-NEXT:    vmovd %xmm3, (%rdx)
596; AVX2-FCP-NEXT:    vmovd %xmm5, (%rcx)
597; AVX2-FCP-NEXT:    vmovd %xmm7, (%r8)
598; AVX2-FCP-NEXT:    vmovd %xmm4, (%r9)
599; AVX2-FCP-NEXT:    vmovd %xmm6, (%r10)
600; AVX2-FCP-NEXT:    vmovd %xmm0, (%rax)
601; AVX2-FCP-NEXT:    retq
602;
603; AVX512-LABEL: load_i8_stride7_vf4:
604; AVX512:       # %bb.0:
605; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
606; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
607; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
608; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
609; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
610; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
611; AVX512-NEXT:    vpor %xmm2, %xmm3, %xmm2
612; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u]
613; AVX512-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
614; AVX512-NEXT:    vpor %xmm3, %xmm4, %xmm3
615; AVX512-NEXT:    vmovd {{.*#+}} xmm4 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
616; AVX512-NEXT:    vpshufb %xmm4, %xmm0, %xmm5
617; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
618; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
619; AVX512-NEXT:    vmovd {{.*#+}} xmm6 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
620; AVX512-NEXT:    vpshufb %xmm6, %xmm0, %xmm7
621; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
622; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
623; AVX512-NEXT:    vpshufb %xmm4, %xmm1, %xmm4
624; AVX512-NEXT:    vmovd {{.*#+}} xmm8 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
625; AVX512-NEXT:    vpshufb %xmm8, %xmm0, %xmm9
626; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
627; AVX512-NEXT:    vpshufb %xmm6, %xmm1, %xmm6
628; AVX512-NEXT:    vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
629; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
630; AVX512-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
631; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
632; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
633; AVX512-NEXT:    vmovd %xmm2, (%rsi)
634; AVX512-NEXT:    vmovd %xmm3, (%rdx)
635; AVX512-NEXT:    vmovd %xmm5, (%rcx)
636; AVX512-NEXT:    vmovd %xmm7, (%r8)
637; AVX512-NEXT:    vmovd %xmm4, (%r9)
638; AVX512-NEXT:    vmovd %xmm6, (%r10)
639; AVX512-NEXT:    vmovd %xmm0, (%rax)
640; AVX512-NEXT:    retq
641;
642; AVX512-FCP-LABEL: load_i8_stride7_vf4:
643; AVX512-FCP:       # %bb.0:
644; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
645; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
646; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm0
647; AVX512-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
648; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
649; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
650; AVX512-FCP-NEXT:    vpor %xmm2, %xmm3, %xmm2
651; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u]
652; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
653; AVX512-FCP-NEXT:    vpor %xmm3, %xmm4, %xmm3
654; AVX512-FCP-NEXT:    vmovd {{.*#+}} xmm4 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
655; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm0, %xmm5
656; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
657; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
658; AVX512-FCP-NEXT:    vmovd {{.*#+}} xmm6 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
659; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm0, %xmm7
660; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
661; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
662; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm1, %xmm4
663; AVX512-FCP-NEXT:    vmovd {{.*#+}} xmm8 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
664; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm0, %xmm9
665; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
666; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm1, %xmm6
667; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
668; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
669; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
670; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
671; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
672; AVX512-FCP-NEXT:    vmovd %xmm2, (%rsi)
673; AVX512-FCP-NEXT:    vmovd %xmm3, (%rdx)
674; AVX512-FCP-NEXT:    vmovd %xmm5, (%rcx)
675; AVX512-FCP-NEXT:    vmovd %xmm7, (%r8)
676; AVX512-FCP-NEXT:    vmovd %xmm4, (%r9)
677; AVX512-FCP-NEXT:    vmovd %xmm6, (%r10)
678; AVX512-FCP-NEXT:    vmovd %xmm0, (%rax)
679; AVX512-FCP-NEXT:    retq
680;
681; AVX512DQ-LABEL: load_i8_stride7_vf4:
682; AVX512DQ:       # %bb.0:
683; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
684; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %r10
685; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
686; AVX512DQ-NEXT:    vmovdqa 16(%rdi), %xmm1
687; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
688; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
689; AVX512DQ-NEXT:    vpor %xmm2, %xmm3, %xmm2
690; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u]
691; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
692; AVX512DQ-NEXT:    vpor %xmm3, %xmm4, %xmm3
693; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm4 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
694; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm0, %xmm5
695; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
696; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
697; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm6 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
698; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm0, %xmm7
699; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
700; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
701; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm1, %xmm4
702; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm8 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
703; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm0, %xmm9
704; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
705; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm1, %xmm6
706; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
707; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
708; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
709; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
710; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
711; AVX512DQ-NEXT:    vmovd %xmm2, (%rsi)
712; AVX512DQ-NEXT:    vmovd %xmm3, (%rdx)
713; AVX512DQ-NEXT:    vmovd %xmm5, (%rcx)
714; AVX512DQ-NEXT:    vmovd %xmm7, (%r8)
715; AVX512DQ-NEXT:    vmovd %xmm4, (%r9)
716; AVX512DQ-NEXT:    vmovd %xmm6, (%r10)
717; AVX512DQ-NEXT:    vmovd %xmm0, (%rax)
718; AVX512DQ-NEXT:    retq
719;
720; AVX512DQ-FCP-LABEL: load_i8_stride7_vf4:
721; AVX512DQ-FCP:       # %bb.0:
722; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
723; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
724; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm0
725; AVX512DQ-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
726; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
727; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
728; AVX512DQ-FCP-NEXT:    vpor %xmm2, %xmm3, %xmm2
729; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u]
730; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
731; AVX512DQ-FCP-NEXT:    vpor %xmm3, %xmm4, %xmm3
732; AVX512DQ-FCP-NEXT:    vmovd {{.*#+}} xmm4 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
733; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm0, %xmm5
734; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
735; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
736; AVX512DQ-FCP-NEXT:    vmovd {{.*#+}} xmm6 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
737; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm0, %xmm7
738; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
739; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
740; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm1, %xmm4
741; AVX512DQ-FCP-NEXT:    vmovd {{.*#+}} xmm8 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
742; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm0, %xmm9
743; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
744; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm1, %xmm6
745; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
746; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
747; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
748; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
749; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
750; AVX512DQ-FCP-NEXT:    vmovd %xmm2, (%rsi)
751; AVX512DQ-FCP-NEXT:    vmovd %xmm3, (%rdx)
752; AVX512DQ-FCP-NEXT:    vmovd %xmm5, (%rcx)
753; AVX512DQ-FCP-NEXT:    vmovd %xmm7, (%r8)
754; AVX512DQ-FCP-NEXT:    vmovd %xmm4, (%r9)
755; AVX512DQ-FCP-NEXT:    vmovd %xmm6, (%r10)
756; AVX512DQ-FCP-NEXT:    vmovd %xmm0, (%rax)
757; AVX512DQ-FCP-NEXT:    retq
758;
759; AVX512BW-LABEL: load_i8_stride7_vf4:
760; AVX512BW:       # %bb.0:
761; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
762; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
763; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
764; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
765; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
766; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
767; AVX512BW-NEXT:    vpor %xmm2, %xmm3, %xmm2
768; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u]
769; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
770; AVX512BW-NEXT:    vpor %xmm3, %xmm4, %xmm3
771; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm4 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9]
772; AVX512BW-NEXT:    vpshufb %xmm4, %xmm0, %xmm5
773; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
774; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
775; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10]
776; AVX512BW-NEXT:    vpshufb %xmm6, %xmm0, %xmm7
777; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
778; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
779; AVX512BW-NEXT:    vpshufb %xmm4, %xmm1, %xmm4
780; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
781; AVX512BW-NEXT:    vpshufb %xmm8, %xmm0, %xmm9
782; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
783; AVX512BW-NEXT:    vpshufb %xmm6, %xmm1, %xmm6
784; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
785; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
786; AVX512BW-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
787; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
788; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
789; AVX512BW-NEXT:    vmovd %xmm2, (%rsi)
790; AVX512BW-NEXT:    vmovd %xmm3, (%rdx)
791; AVX512BW-NEXT:    vmovd %xmm5, (%rcx)
792; AVX512BW-NEXT:    vmovd %xmm7, (%r8)
793; AVX512BW-NEXT:    vmovd %xmm4, (%r9)
794; AVX512BW-NEXT:    vmovd %xmm6, (%r10)
795; AVX512BW-NEXT:    vmovd %xmm0, (%rax)
796; AVX512BW-NEXT:    retq
797;
798; AVX512BW-FCP-LABEL: load_i8_stride7_vf4:
799; AVX512BW-FCP:       # %bb.0:
800; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
801; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
802; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
803; AVX512BW-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
804; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
805; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
806; AVX512BW-FCP-NEXT:    vpor %xmm2, %xmm3, %xmm2
807; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u]
808; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
809; AVX512BW-FCP-NEXT:    vpor %xmm3, %xmm4, %xmm3
810; AVX512BW-FCP-NEXT:    vpbroadcastw {{.*#+}} xmm4 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9]
811; AVX512BW-FCP-NEXT:    vpshufb %xmm4, %xmm0, %xmm5
812; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
813; AVX512BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
814; AVX512BW-FCP-NEXT:    vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10]
815; AVX512BW-FCP-NEXT:    vpshufb %xmm6, %xmm0, %xmm7
816; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
817; AVX512BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
818; AVX512BW-FCP-NEXT:    vpshufb %xmm4, %xmm1, %xmm4
819; AVX512BW-FCP-NEXT:    vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
820; AVX512BW-FCP-NEXT:    vpshufb %xmm8, %xmm0, %xmm9
821; AVX512BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
822; AVX512BW-FCP-NEXT:    vpshufb %xmm6, %xmm1, %xmm6
823; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
824; AVX512BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
825; AVX512BW-FCP-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
826; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
827; AVX512BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
828; AVX512BW-FCP-NEXT:    vmovd %xmm2, (%rsi)
829; AVX512BW-FCP-NEXT:    vmovd %xmm3, (%rdx)
830; AVX512BW-FCP-NEXT:    vmovd %xmm5, (%rcx)
831; AVX512BW-FCP-NEXT:    vmovd %xmm7, (%r8)
832; AVX512BW-FCP-NEXT:    vmovd %xmm4, (%r9)
833; AVX512BW-FCP-NEXT:    vmovd %xmm6, (%r10)
834; AVX512BW-FCP-NEXT:    vmovd %xmm0, (%rax)
835; AVX512BW-FCP-NEXT:    retq
836;
837; AVX512DQ-BW-LABEL: load_i8_stride7_vf4:
838; AVX512DQ-BW:       # %bb.0:
839; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
840; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
841; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %xmm0
842; AVX512DQ-BW-NEXT:    vmovdqa 16(%rdi), %xmm1
843; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
844; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
845; AVX512DQ-BW-NEXT:    vpor %xmm2, %xmm3, %xmm2
846; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u]
847; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
848; AVX512DQ-BW-NEXT:    vpor %xmm3, %xmm4, %xmm3
849; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm4 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9]
850; AVX512DQ-BW-NEXT:    vpshufb %xmm4, %xmm0, %xmm5
851; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
852; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
853; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10]
854; AVX512DQ-BW-NEXT:    vpshufb %xmm6, %xmm0, %xmm7
855; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
856; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
857; AVX512DQ-BW-NEXT:    vpshufb %xmm4, %xmm1, %xmm4
858; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
859; AVX512DQ-BW-NEXT:    vpshufb %xmm8, %xmm0, %xmm9
860; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
861; AVX512DQ-BW-NEXT:    vpshufb %xmm6, %xmm1, %xmm6
862; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
863; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
864; AVX512DQ-BW-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
865; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
866; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
867; AVX512DQ-BW-NEXT:    vmovd %xmm2, (%rsi)
868; AVX512DQ-BW-NEXT:    vmovd %xmm3, (%rdx)
869; AVX512DQ-BW-NEXT:    vmovd %xmm5, (%rcx)
870; AVX512DQ-BW-NEXT:    vmovd %xmm7, (%r8)
871; AVX512DQ-BW-NEXT:    vmovd %xmm4, (%r9)
872; AVX512DQ-BW-NEXT:    vmovd %xmm6, (%r10)
873; AVX512DQ-BW-NEXT:    vmovd %xmm0, (%rax)
874; AVX512DQ-BW-NEXT:    retq
875;
876; AVX512DQ-BW-FCP-LABEL: load_i8_stride7_vf4:
877; AVX512DQ-BW-FCP:       # %bb.0:
878; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
879; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
880; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
881; AVX512DQ-BW-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
882; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
883; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
884; AVX512DQ-BW-FCP-NEXT:    vpor %xmm2, %xmm3, %xmm2
885; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u]
886; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
887; AVX512DQ-BW-FCP-NEXT:    vpor %xmm3, %xmm4, %xmm3
888; AVX512DQ-BW-FCP-NEXT:    vpbroadcastw {{.*#+}} xmm4 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9]
889; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm4, %xmm0, %xmm5
890; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
891; AVX512DQ-BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
892; AVX512DQ-BW-FCP-NEXT:    vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10]
893; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm6, %xmm0, %xmm7
894; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
895; AVX512DQ-BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
896; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm4, %xmm1, %xmm4
897; AVX512DQ-BW-FCP-NEXT:    vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
898; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm8, %xmm0, %xmm9
899; AVX512DQ-BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
900; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm6, %xmm1, %xmm6
901; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
902; AVX512DQ-BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
903; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
904; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
905; AVX512DQ-BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
906; AVX512DQ-BW-FCP-NEXT:    vmovd %xmm2, (%rsi)
907; AVX512DQ-BW-FCP-NEXT:    vmovd %xmm3, (%rdx)
908; AVX512DQ-BW-FCP-NEXT:    vmovd %xmm5, (%rcx)
909; AVX512DQ-BW-FCP-NEXT:    vmovd %xmm7, (%r8)
910; AVX512DQ-BW-FCP-NEXT:    vmovd %xmm4, (%r9)
911; AVX512DQ-BW-FCP-NEXT:    vmovd %xmm6, (%r10)
912; AVX512DQ-BW-FCP-NEXT:    vmovd %xmm0, (%rax)
913; AVX512DQ-BW-FCP-NEXT:    retq
914  %wide.vec = load <28 x i8>, ptr %in.vec, align 64
915  %strided.vec0 = shufflevector <28 x i8> %wide.vec, <28 x i8> poison, <4 x i32> <i32 0, i32 7, i32 14, i32 21>
916  %strided.vec1 = shufflevector <28 x i8> %wide.vec, <28 x i8> poison, <4 x i32> <i32 1, i32 8, i32 15, i32 22>
917  %strided.vec2 = shufflevector <28 x i8> %wide.vec, <28 x i8> poison, <4 x i32> <i32 2, i32 9, i32 16, i32 23>
918  %strided.vec3 = shufflevector <28 x i8> %wide.vec, <28 x i8> poison, <4 x i32> <i32 3, i32 10, i32 17, i32 24>
919  %strided.vec4 = shufflevector <28 x i8> %wide.vec, <28 x i8> poison, <4 x i32> <i32 4, i32 11, i32 18, i32 25>
920  %strided.vec5 = shufflevector <28 x i8> %wide.vec, <28 x i8> poison, <4 x i32> <i32 5, i32 12, i32 19, i32 26>
921  %strided.vec6 = shufflevector <28 x i8> %wide.vec, <28 x i8> poison, <4 x i32> <i32 6, i32 13, i32 20, i32 27>
922  store <4 x i8> %strided.vec0, ptr %out.vec0, align 64
923  store <4 x i8> %strided.vec1, ptr %out.vec1, align 64
924  store <4 x i8> %strided.vec2, ptr %out.vec2, align 64
925  store <4 x i8> %strided.vec3, ptr %out.vec3, align 64
926  store <4 x i8> %strided.vec4, ptr %out.vec4, align 64
927  store <4 x i8> %strided.vec5, ptr %out.vec5, align 64
928  store <4 x i8> %strided.vec6, ptr %out.vec6, align 64
929  ret void
930}
931
932define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
933; SSE-LABEL: load_i8_stride7_vf8:
934; SSE:       # %bb.0:
935; SSE-NEXT:    movdqa (%rdi), %xmm3
936; SSE-NEXT:    movdqa 16(%rdi), %xmm11
937; SSE-NEXT:    movdqa 32(%rdi), %xmm6
938; SSE-NEXT:    movdqa 48(%rdi), %xmm0
939; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,65535,0,65535]
940; SSE-NEXT:    movdqa %xmm3, %xmm1
941; SSE-NEXT:    pand %xmm2, %xmm1
942; SSE-NEXT:    pandn %xmm11, %xmm2
943; SSE-NEXT:    por %xmm1, %xmm2
944; SSE-NEXT:    pxor %xmm1, %xmm1
945; SSE-NEXT:    movdqa %xmm2, %xmm5
946; SSE-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
947; SSE-NEXT:    movdqa {{.*#+}} xmm7 = [65535,65535,65535,65535,0,65535,0,65535]
948; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
949; SSE-NEXT:    pxor %xmm4, %xmm4
950; SSE-NEXT:    pand %xmm7, %xmm2
951; SSE-NEXT:    pandn %xmm5, %xmm7
952; SSE-NEXT:    por %xmm2, %xmm7
953; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm7[0,2,1,3]
954; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
955; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,3,1,1]
956; SSE-NEXT:    pshuflw {{.*#+}} xmm7 = xmm2[0,3,2,1,4,5,6,7]
957; SSE-NEXT:    packuswb %xmm7, %xmm7
958; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255]
959; SSE-NEXT:    pand %xmm2, %xmm7
960; SSE-NEXT:    movdqa {{.*#+}} xmm9 = [0,65535,65535,65535,65535,65535,65535,65535]
961; SSE-NEXT:    movdqa %xmm6, %xmm5
962; SSE-NEXT:    pand %xmm9, %xmm5
963; SSE-NEXT:    pandn %xmm0, %xmm9
964; SSE-NEXT:    por %xmm5, %xmm9
965; SSE-NEXT:    punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3],xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
966; SSE-NEXT:    movdqa %xmm6, %xmm8
967; SSE-NEXT:    movss {{.*#+}} xmm8 = xmm0[0],xmm8[1,2,3]
968; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,65535]
969; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
970; SSE-NEXT:    movdqa %xmm11, %xmm10
971; SSE-NEXT:    movdqa %xmm11, %xmm1
972; SSE-NEXT:    pand %xmm5, %xmm10
973; SSE-NEXT:    movdqa {{.*#+}} xmm12 = [65535,65535,0,65535,65535,65535,65535,65535]
974; SSE-NEXT:    movdqa %xmm6, %xmm4
975; SSE-NEXT:    pand %xmm12, %xmm4
976; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
977; SSE-NEXT:    pandn %xmm0, %xmm12
978; SSE-NEXT:    movaps %xmm0, %xmm14
979; SSE-NEXT:    shufps {{.*#+}} xmm14 = xmm14[1,0],xmm6[0,0]
980; SSE-NEXT:    shufps {{.*#+}} xmm14 = xmm14[2,0],xmm6[2,3]
981; SSE-NEXT:    pand %xmm5, %xmm0
982; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
983; SSE-NEXT:    pandn %xmm6, %xmm5
984; SSE-NEXT:    movdqa %xmm6, %xmm15
985; SSE-NEXT:    pxor %xmm0, %xmm0
986; SSE-NEXT:    punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15]
987; SSE-NEXT:    punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3]
988; SSE-NEXT:    pshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,6,5,6,7]
989; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,1,2,1]
990; SSE-NEXT:    pshufhw {{.*#+}} xmm9 = xmm6[0,1,2,3,4,4,5,6]
991; SSE-NEXT:    packuswb %xmm9, %xmm9
992; SSE-NEXT:    movdqa %xmm2, %xmm11
993; SSE-NEXT:    movdqa %xmm2, %xmm13
994; SSE-NEXT:    pandn %xmm9, %xmm13
995; SSE-NEXT:    por %xmm7, %xmm13
996; SSE-NEXT:    movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,0,65535]
997; SSE-NEXT:    movdqa %xmm7, %xmm9
998; SSE-NEXT:    movdqa %xmm1, %xmm4
999; SSE-NEXT:    pandn %xmm1, %xmm9
1000; SSE-NEXT:    movdqa %xmm3, %xmm2
1001; SSE-NEXT:    pand %xmm7, %xmm3
1002; SSE-NEXT:    por %xmm9, %xmm3
1003; SSE-NEXT:    movdqa %xmm3, %xmm9
1004; SSE-NEXT:    punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3],xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
1005; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,0,65535]
1006; SSE-NEXT:    movdqa %xmm0, %xmm1
1007; SSE-NEXT:    pandn %xmm9, %xmm1
1008; SSE-NEXT:    pxor %xmm6, %xmm6
1009; SSE-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15]
1010; SSE-NEXT:    pand %xmm0, %xmm3
1011; SSE-NEXT:    por %xmm1, %xmm3
1012; SSE-NEXT:    punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
1013; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535]
1014; SSE-NEXT:    movdqa %xmm8, %xmm9
1015; SSE-NEXT:    pand %xmm1, %xmm9
1016; SSE-NEXT:    pandn %xmm15, %xmm1
1017; SSE-NEXT:    por %xmm9, %xmm1
1018; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
1019; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6]
1020; SSE-NEXT:    packuswb %xmm1, %xmm1
1021; SSE-NEXT:    movdqa %xmm11, %xmm9
1022; SSE-NEXT:    pandn %xmm1, %xmm9
1023; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[0,3,2,3]
1024; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
1025; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
1026; SSE-NEXT:    packuswb %xmm1, %xmm1
1027; SSE-NEXT:    pand %xmm11, %xmm1
1028; SSE-NEXT:    por %xmm1, %xmm9
1029; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1030; SSE-NEXT:    pandn %xmm2, %xmm1
1031; SSE-NEXT:    por %xmm1, %xmm10
1032; SSE-NEXT:    movdqa %xmm10, %xmm1
1033; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15]
1034; SSE-NEXT:    punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3],xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7]
1035; SSE-NEXT:    pand %xmm0, %xmm10
1036; SSE-NEXT:    pandn %xmm1, %xmm0
1037; SSE-NEXT:    por %xmm10, %xmm0
1038; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1039; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
1040; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,3,3]
1041; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
1042; SSE-NEXT:    packuswb %xmm0, %xmm0
1043; SSE-NEXT:    pand %xmm11, %xmm0
1044; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,65535]
1045; SSE-NEXT:    pand %xmm1, %xmm8
1046; SSE-NEXT:    pandn %xmm15, %xmm1
1047; SSE-NEXT:    por %xmm8, %xmm1
1048; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
1049; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
1050; SSE-NEXT:    packuswb %xmm1, %xmm1
1051; SSE-NEXT:    movdqa %xmm11, %xmm8
1052; SSE-NEXT:    pandn %xmm1, %xmm8
1053; SSE-NEXT:    por %xmm0, %xmm8
1054; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,0,65535,65535]
1055; SSE-NEXT:    movdqa %xmm4, %xmm1
1056; SSE-NEXT:    pand %xmm0, %xmm1
1057; SSE-NEXT:    pandn %xmm2, %xmm0
1058; SSE-NEXT:    movdqa %xmm2, %xmm10
1059; SSE-NEXT:    por %xmm1, %xmm0
1060; SSE-NEXT:    movdqa %xmm0, %xmm1
1061; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
1062; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15]
1063; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [65535,0,65535,0,65535,65535,65535,65535]
1064; SSE-NEXT:    pand %xmm3, %xmm0
1065; SSE-NEXT:    pandn %xmm1, %xmm3
1066; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1067; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
1068; SSE-NEXT:    por %xmm0, %xmm3
1069; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm3[3,2,1,0,4,5,6,7]
1070; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
1071; SSE-NEXT:    packuswb %xmm0, %xmm0
1072; SSE-NEXT:    pand %xmm11, %xmm0
1073; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
1074; SSE-NEXT:    punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
1075; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,7,5,6,7]
1076; SSE-NEXT:    punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7]
1077; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm12[0,3,2,3,4,5,6,7]
1078; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0]
1079; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,7,6]
1080; SSE-NEXT:    packuswb %xmm3, %xmm3
1081; SSE-NEXT:    pandn %xmm3, %xmm11
1082; SSE-NEXT:    por %xmm0, %xmm11
1083; SSE-NEXT:    movdqa %xmm11, %xmm6
1084; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,65535,0,65535,65535]
1085; SSE-NEXT:    movdqa %xmm4, %xmm2
1086; SSE-NEXT:    movdqa %xmm4, %xmm3
1087; SSE-NEXT:    pand %xmm0, %xmm3
1088; SSE-NEXT:    movdqa %xmm10, %xmm11
1089; SSE-NEXT:    pandn %xmm10, %xmm0
1090; SSE-NEXT:    por %xmm3, %xmm0
1091; SSE-NEXT:    movdqa %xmm0, %xmm3
1092; SSE-NEXT:    pxor %xmm4, %xmm4
1093; SSE-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
1094; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
1095; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
1096; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
1097; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
1098; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
1099; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
1100; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,5]
1101; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,0,65535]
1102; SSE-NEXT:    pand %xmm3, %xmm1
1103; SSE-NEXT:    pandn %xmm15, %xmm3
1104; SSE-NEXT:    por %xmm1, %xmm3
1105; SSE-NEXT:    packuswb %xmm3, %xmm0
1106; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm0[0,3,2,3]
1107; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm11[1,3,2,3]
1108; SSE-NEXT:    movdqa %xmm11, %xmm3
1109; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
1110; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1111; SSE-NEXT:    movdqa %xmm1, %xmm0
1112; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
1113; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
1114; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
1115; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
1116; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
1117; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
1118; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1119; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,65535,0]
1120; SSE-NEXT:    punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3],xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7]
1121; SSE-NEXT:    pand %xmm0, %xmm14
1122; SSE-NEXT:    pandn %xmm15, %xmm0
1123; SSE-NEXT:    por %xmm14, %xmm0
1124; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
1125; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
1126; SSE-NEXT:    packuswb %xmm0, %xmm1
1127; SSE-NEXT:    pshufd {{.*#+}} xmm11 = xmm1[0,3,2,3]
1128; SSE-NEXT:    movdqa %xmm2, %xmm0
1129; SSE-NEXT:    pand %xmm7, %xmm0
1130; SSE-NEXT:    pandn %xmm3, %xmm7
1131; SSE-NEXT:    por %xmm0, %xmm7
1132; SSE-NEXT:    movdqa %xmm7, %xmm0
1133; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
1134; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7]
1135; SSE-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
1136; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm7[0,1,2,1]
1137; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7]
1138; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1139; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
1140; SSE-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
1141; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535]
1142; SSE-NEXT:    pand %xmm1, %xmm5
1143; SSE-NEXT:    pandn %xmm15, %xmm1
1144; SSE-NEXT:    por %xmm5, %xmm1
1145; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
1146; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
1147; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
1148; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
1149; SSE-NEXT:    packuswb %xmm1, %xmm0
1150; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
1151; SSE-NEXT:    movq %xmm13, (%rsi)
1152; SSE-NEXT:    movq %xmm9, (%rdx)
1153; SSE-NEXT:    movq %xmm8, (%rcx)
1154; SSE-NEXT:    movq %xmm6, (%r8)
1155; SSE-NEXT:    movq %xmm10, (%r9)
1156; SSE-NEXT:    movq %xmm11, (%rdi)
1157; SSE-NEXT:    movq %xmm0, (%rax)
1158; SSE-NEXT:    retq
1159;
1160; AVX-LABEL: load_i8_stride7_vf8:
1161; AVX:       # %bb.0:
1162; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1163; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %r10
1164; AVX-NEXT:    vmovdqa (%rdi), %xmm0
1165; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
1166; AVX-NEXT:    vmovdqa 32(%rdi), %xmm2
1167; AVX-NEXT:    vmovdqa 48(%rdi), %xmm3
1168; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[5,12,u,u,u,u,u,u,u,u,u,u,u]
1169; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
1170; AVX-NEXT:    vpor %xmm4, %xmm5, %xmm4
1171; AVX-NEXT:    vpalignr {{.*#+}} xmm5 = xmm2[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2]
1172; AVX-NEXT:    vbroadcastss {{.*#+}} xmm6 = [0,0,7,14,0,0,7,14,0,0,7,14,0,0,7,14]
1173; AVX-NEXT:    vpshufb %xmm6, %xmm5, %xmm5
1174; AVX-NEXT:    vmovddup {{.*#+}} xmm7 = [255,255,255,255,255,0,0,0,255,255,255,255,255,0,0,0]
1175; AVX-NEXT:    # xmm7 = mem[0,0]
1176; AVX-NEXT:    vpblendvb %xmm7, %xmm4, %xmm5, %xmm4
1177; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[6,13,u,u,u,u,u,u,u,u,u,u,u]
1178; AVX-NEXT:    vpshufb {{.*#+}} xmm8 = xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
1179; AVX-NEXT:    vpor %xmm5, %xmm8, %xmm5
1180; AVX-NEXT:    vpalignr {{.*#+}} xmm8 = xmm2[4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3]
1181; AVX-NEXT:    vpshufb %xmm6, %xmm8, %xmm8
1182; AVX-NEXT:    vpblendvb %xmm7, %xmm5, %xmm8, %xmm5
1183; AVX-NEXT:    vpshufb {{.*#+}} xmm8 = xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
1184; AVX-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
1185; AVX-NEXT:    vpor %xmm8, %xmm9, %xmm8
1186; AVX-NEXT:    vpalignr {{.*#+}} xmm9 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
1187; AVX-NEXT:    vpshufb %xmm6, %xmm9, %xmm9
1188; AVX-NEXT:    vpblendvb %xmm7, %xmm8, %xmm9, %xmm8
1189; AVX-NEXT:    vpshufb {{.*#+}} xmm9 = xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
1190; AVX-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,xmm1[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
1191; AVX-NEXT:    vpor %xmm9, %xmm10, %xmm9
1192; AVX-NEXT:    vpalignr {{.*#+}} xmm10 = xmm2[6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5]
1193; AVX-NEXT:    vpshufb %xmm6, %xmm10, %xmm6
1194; AVX-NEXT:    vpblendvb %xmm7, %xmm9, %xmm6, %xmm6
1195; AVX-NEXT:    vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u]
1196; AVX-NEXT:    vpshufb {{.*#+}} xmm9 = xmm2[u,u,u,u,0,7,14],zero,xmm2[u,u,u,u,u,u,u,u]
1197; AVX-NEXT:    vpor %xmm7, %xmm9, %xmm7
1198; AVX-NEXT:    vmovd {{.*#+}} xmm9 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
1199; AVX-NEXT:    vpshufb %xmm9, %xmm0, %xmm10
1200; AVX-NEXT:    vpshufb {{.*#+}} xmm11 = xmm1[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
1201; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
1202; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3],xmm10[4,5,6,7]
1203; AVX-NEXT:    vpshufb {{.*#+}} xmm10 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u]
1204; AVX-NEXT:    vpshufb {{.*#+}} xmm11 = xmm2[u,u,u,u,1,8,15],zero,xmm2[u,u,u,u,u,u,u,u]
1205; AVX-NEXT:    vpor %xmm10, %xmm11, %xmm10
1206; AVX-NEXT:    vpshufb {{.*#+}} xmm11 = xmm1[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
1207; AVX-NEXT:    vpshufb {{.*#+}} xmm12 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
1208; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
1209; AVX-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3],xmm11[4,5,6,7]
1210; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
1211; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,2,9,u,u,u,u,u,u,u,u,u,u,u,u]
1212; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
1213; AVX-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
1214; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
1215; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1216; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7]
1217; AVX-NEXT:    vmovq %xmm4, (%rsi)
1218; AVX-NEXT:    vmovq %xmm5, (%rdx)
1219; AVX-NEXT:    vmovq %xmm8, (%rcx)
1220; AVX-NEXT:    vmovq %xmm6, (%r8)
1221; AVX-NEXT:    vmovq %xmm7, (%r9)
1222; AVX-NEXT:    vmovq %xmm10, (%r10)
1223; AVX-NEXT:    vmovq %xmm0, (%rax)
1224; AVX-NEXT:    retq
1225;
1226; AVX2-LABEL: load_i8_stride7_vf8:
1227; AVX2:       # %bb.0:
1228; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1229; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
1230; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
1231; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
1232; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
1233; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm2
1234; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
1235; AVX2-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u]
1236; AVX2-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u]
1237; AVX2-NEXT:    vpor %xmm3, %xmm2, %xmm2
1238; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15]
1239; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4
1240; AVX2-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u]
1241; AVX2-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u]
1242; AVX2-NEXT:    vpor %xmm4, %xmm3, %xmm3
1243; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm4 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
1244; AVX2-NEXT:    vpblendvb %ymm4, %ymm0, %ymm1, %ymm4
1245; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm5
1246; AVX2-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u]
1247; AVX2-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u]
1248; AVX2-NEXT:    vpor %xmm5, %xmm4, %xmm4
1249; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7]
1250; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm6
1251; AVX2-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u]
1252; AVX2-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u]
1253; AVX2-NEXT:    vpor %xmm6, %xmm5, %xmm5
1254; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0,0]
1255; AVX2-NEXT:    vpblendvb %ymm6, %ymm1, %ymm0, %ymm6
1256; AVX2-NEXT:    vextracti128 $1, %ymm6, %xmm7
1257; AVX2-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u]
1258; AVX2-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u]
1259; AVX2-NEXT:    vpor %xmm7, %xmm6, %xmm6
1260; AVX2-NEXT:    vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15]
1261; AVX2-NEXT:    vextracti128 $1, %ymm7, %xmm8
1262; AVX2-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u]
1263; AVX2-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u]
1264; AVX2-NEXT:    vpor %xmm7, %xmm8, %xmm7
1265; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0]
1266; AVX2-NEXT:    vpblendvb %ymm8, %ymm1, %ymm0, %ymm0
1267; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1268; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
1269; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u]
1270; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1271; AVX2-NEXT:    vmovq %xmm2, (%rsi)
1272; AVX2-NEXT:    vmovq %xmm3, (%rdx)
1273; AVX2-NEXT:    vmovq %xmm4, (%rcx)
1274; AVX2-NEXT:    vmovq %xmm5, (%r8)
1275; AVX2-NEXT:    vmovq %xmm6, (%r9)
1276; AVX2-NEXT:    vmovq %xmm7, (%r10)
1277; AVX2-NEXT:    vmovq %xmm0, (%rax)
1278; AVX2-NEXT:    vzeroupper
1279; AVX2-NEXT:    retq
1280;
1281; AVX2-FP-LABEL: load_i8_stride7_vf8:
1282; AVX2-FP:       # %bb.0:
1283; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1284; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
1285; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm0
1286; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm1
1287; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
1288; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm2
1289; AVX2-FP-NEXT:    vextracti128 $1, %ymm2, %xmm3
1290; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u]
1291; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u]
1292; AVX2-FP-NEXT:    vpor %xmm3, %xmm2, %xmm2
1293; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15]
1294; AVX2-FP-NEXT:    vextracti128 $1, %ymm3, %xmm4
1295; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u]
1296; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u]
1297; AVX2-FP-NEXT:    vpor %xmm4, %xmm3, %xmm3
1298; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm4 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
1299; AVX2-FP-NEXT:    vpblendvb %ymm4, %ymm0, %ymm1, %ymm4
1300; AVX2-FP-NEXT:    vextracti128 $1, %ymm4, %xmm5
1301; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u]
1302; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u]
1303; AVX2-FP-NEXT:    vpor %xmm5, %xmm4, %xmm4
1304; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7]
1305; AVX2-FP-NEXT:    vextracti128 $1, %ymm5, %xmm6
1306; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u]
1307; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u]
1308; AVX2-FP-NEXT:    vpor %xmm6, %xmm5, %xmm5
1309; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0,0]
1310; AVX2-FP-NEXT:    vpblendvb %ymm6, %ymm1, %ymm0, %ymm6
1311; AVX2-FP-NEXT:    vextracti128 $1, %ymm6, %xmm7
1312; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u]
1313; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u]
1314; AVX2-FP-NEXT:    vpor %xmm7, %xmm6, %xmm6
1315; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15]
1316; AVX2-FP-NEXT:    vextracti128 $1, %ymm7, %xmm8
1317; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u]
1318; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u]
1319; AVX2-FP-NEXT:    vpor %xmm7, %xmm8, %xmm7
1320; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0]
1321; AVX2-FP-NEXT:    vpblendvb %ymm8, %ymm1, %ymm0, %ymm0
1322; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm1
1323; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
1324; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u]
1325; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1326; AVX2-FP-NEXT:    vmovq %xmm2, (%rsi)
1327; AVX2-FP-NEXT:    vmovq %xmm3, (%rdx)
1328; AVX2-FP-NEXT:    vmovq %xmm4, (%rcx)
1329; AVX2-FP-NEXT:    vmovq %xmm5, (%r8)
1330; AVX2-FP-NEXT:    vmovq %xmm6, (%r9)
1331; AVX2-FP-NEXT:    vmovq %xmm7, (%r10)
1332; AVX2-FP-NEXT:    vmovq %xmm0, (%rax)
1333; AVX2-FP-NEXT:    vzeroupper
1334; AVX2-FP-NEXT:    retq
1335;
1336; AVX2-FCP-LABEL: load_i8_stride7_vf8:
1337; AVX2-FCP:       # %bb.0:
1338; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1339; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
1340; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm0
1341; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
1342; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
1343; AVX2-FCP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm2
1344; AVX2-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
1345; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u]
1346; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u]
1347; AVX2-FCP-NEXT:    vpor %xmm3, %xmm2, %xmm2
1348; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15]
1349; AVX2-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm4
1350; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u]
1351; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u]
1352; AVX2-FCP-NEXT:    vpor %xmm4, %xmm3, %xmm3
1353; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm4 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
1354; AVX2-FCP-NEXT:    vpblendvb %ymm4, %ymm0, %ymm1, %ymm4
1355; AVX2-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm5
1356; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u]
1357; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u]
1358; AVX2-FCP-NEXT:    vpor %xmm5, %xmm4, %xmm4
1359; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7]
1360; AVX2-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm6
1361; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u]
1362; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u]
1363; AVX2-FCP-NEXT:    vpor %xmm6, %xmm5, %xmm5
1364; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0,0]
1365; AVX2-FCP-NEXT:    vpblendvb %ymm6, %ymm1, %ymm0, %ymm6
1366; AVX2-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm7
1367; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u]
1368; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u]
1369; AVX2-FCP-NEXT:    vpor %xmm7, %xmm6, %xmm6
1370; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15]
1371; AVX2-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm8
1372; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u]
1373; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u]
1374; AVX2-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm7
1375; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0]
1376; AVX2-FCP-NEXT:    vpblendvb %ymm8, %ymm1, %ymm0, %ymm0
1377; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
1378; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
1379; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u]
1380; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1381; AVX2-FCP-NEXT:    vmovq %xmm2, (%rsi)
1382; AVX2-FCP-NEXT:    vmovq %xmm3, (%rdx)
1383; AVX2-FCP-NEXT:    vmovq %xmm4, (%rcx)
1384; AVX2-FCP-NEXT:    vmovq %xmm5, (%r8)
1385; AVX2-FCP-NEXT:    vmovq %xmm6, (%r9)
1386; AVX2-FCP-NEXT:    vmovq %xmm7, (%r10)
1387; AVX2-FCP-NEXT:    vmovq %xmm0, (%rax)
1388; AVX2-FCP-NEXT:    vzeroupper
1389; AVX2-FCP-NEXT:    retq
1390;
1391; AVX512-LABEL: load_i8_stride7_vf8:
1392; AVX512:       # %bb.0:
1393; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1394; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
1395; AVX512-NEXT:    vmovdqa (%rdi), %ymm0
1396; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm1
1397; AVX512-NEXT:    vmovdqa {{.*#+}} ymm2 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
1398; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm1 ^ (ymm2 & (ymm0 ^ ymm1))
1399; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
1400; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u]
1401; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u]
1402; AVX512-NEXT:    vpor %xmm3, %xmm2, %xmm2
1403; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15]
1404; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm4
1405; AVX512-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u]
1406; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u]
1407; AVX512-NEXT:    vpor %xmm4, %xmm3, %xmm3
1408; AVX512-NEXT:    vmovdqa {{.*#+}} ymm4 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535]
1409; AVX512-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1))
1410; AVX512-NEXT:    vextracti128 $1, %ymm4, %xmm5
1411; AVX512-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u]
1412; AVX512-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u]
1413; AVX512-NEXT:    vpor %xmm5, %xmm4, %xmm4
1414; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7]
1415; AVX512-NEXT:    vextracti128 $1, %ymm5, %xmm6
1416; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u]
1417; AVX512-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u]
1418; AVX512-NEXT:    vpor %xmm6, %xmm5, %xmm5
1419; AVX512-NEXT:    vmovdqa {{.*#+}} ymm6 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
1420; AVX512-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm0 ^ (ymm6 & (ymm1 ^ ymm0))
1421; AVX512-NEXT:    vextracti128 $1, %ymm6, %xmm7
1422; AVX512-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u]
1423; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u]
1424; AVX512-NEXT:    vpor %xmm7, %xmm6, %xmm6
1425; AVX512-NEXT:    vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15]
1426; AVX512-NEXT:    vextracti128 $1, %ymm7, %xmm8
1427; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u]
1428; AVX512-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u]
1429; AVX512-NEXT:    vpor %xmm7, %xmm8, %xmm7
1430; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
1431; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
1432; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
1433; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u]
1434; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1435; AVX512-NEXT:    vmovq %xmm2, (%rsi)
1436; AVX512-NEXT:    vmovq %xmm3, (%rdx)
1437; AVX512-NEXT:    vmovq %xmm4, (%rcx)
1438; AVX512-NEXT:    vmovq %xmm5, (%r8)
1439; AVX512-NEXT:    vmovq %xmm6, (%r9)
1440; AVX512-NEXT:    vmovq %xmm7, (%r10)
1441; AVX512-NEXT:    vmovq %xmm0, (%rax)
1442; AVX512-NEXT:    vzeroupper
1443; AVX512-NEXT:    retq
1444;
1445; AVX512-FCP-LABEL: load_i8_stride7_vf8:
1446; AVX512-FCP:       # %bb.0:
1447; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1448; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
1449; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm0
1450; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
1451; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
1452; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm1 ^ (ymm2 & (ymm0 ^ ymm1))
1453; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
1454; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u]
1455; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u]
1456; AVX512-FCP-NEXT:    vpor %xmm3, %xmm2, %xmm2
1457; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15]
1458; AVX512-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm4
1459; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u]
1460; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u]
1461; AVX512-FCP-NEXT:    vpor %xmm4, %xmm3, %xmm3
1462; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535]
1463; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1))
1464; AVX512-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm5
1465; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u]
1466; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u]
1467; AVX512-FCP-NEXT:    vpor %xmm5, %xmm4, %xmm4
1468; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7]
1469; AVX512-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm6
1470; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u]
1471; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u]
1472; AVX512-FCP-NEXT:    vpor %xmm6, %xmm5, %xmm5
1473; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
1474; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm0 ^ (ymm6 & (ymm1 ^ ymm0))
1475; AVX512-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm7
1476; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u]
1477; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u]
1478; AVX512-FCP-NEXT:    vpor %xmm7, %xmm6, %xmm6
1479; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15]
1480; AVX512-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm8
1481; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u]
1482; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u]
1483; AVX512-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm7
1484; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
1485; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
1486; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
1487; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u]
1488; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1489; AVX512-FCP-NEXT:    vmovq %xmm2, (%rsi)
1490; AVX512-FCP-NEXT:    vmovq %xmm3, (%rdx)
1491; AVX512-FCP-NEXT:    vmovq %xmm4, (%rcx)
1492; AVX512-FCP-NEXT:    vmovq %xmm5, (%r8)
1493; AVX512-FCP-NEXT:    vmovq %xmm6, (%r9)
1494; AVX512-FCP-NEXT:    vmovq %xmm7, (%r10)
1495; AVX512-FCP-NEXT:    vmovq %xmm0, (%rax)
1496; AVX512-FCP-NEXT:    vzeroupper
1497; AVX512-FCP-NEXT:    retq
1498;
1499; AVX512DQ-LABEL: load_i8_stride7_vf8:
1500; AVX512DQ:       # %bb.0:
1501; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1502; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %r10
1503; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
1504; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm1
1505; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
1506; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm1 ^ (ymm2 & (ymm0 ^ ymm1))
1507; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
1508; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u]
1509; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u]
1510; AVX512DQ-NEXT:    vpor %xmm3, %xmm2, %xmm2
1511; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15]
1512; AVX512DQ-NEXT:    vextracti128 $1, %ymm3, %xmm4
1513; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u]
1514; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u]
1515; AVX512DQ-NEXT:    vpor %xmm4, %xmm3, %xmm3
1516; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535]
1517; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1))
1518; AVX512DQ-NEXT:    vextracti128 $1, %ymm4, %xmm5
1519; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u]
1520; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u]
1521; AVX512DQ-NEXT:    vpor %xmm5, %xmm4, %xmm4
1522; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7]
1523; AVX512DQ-NEXT:    vextracti128 $1, %ymm5, %xmm6
1524; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u]
1525; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u]
1526; AVX512DQ-NEXT:    vpor %xmm6, %xmm5, %xmm5
1527; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm6 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
1528; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm0 ^ (ymm6 & (ymm1 ^ ymm0))
1529; AVX512DQ-NEXT:    vextracti128 $1, %ymm6, %xmm7
1530; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u]
1531; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u]
1532; AVX512DQ-NEXT:    vpor %xmm7, %xmm6, %xmm6
1533; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15]
1534; AVX512DQ-NEXT:    vextracti128 $1, %ymm7, %xmm8
1535; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u]
1536; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u]
1537; AVX512DQ-NEXT:    vpor %xmm7, %xmm8, %xmm7
1538; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
1539; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
1540; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
1541; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u]
1542; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1543; AVX512DQ-NEXT:    vmovq %xmm2, (%rsi)
1544; AVX512DQ-NEXT:    vmovq %xmm3, (%rdx)
1545; AVX512DQ-NEXT:    vmovq %xmm4, (%rcx)
1546; AVX512DQ-NEXT:    vmovq %xmm5, (%r8)
1547; AVX512DQ-NEXT:    vmovq %xmm6, (%r9)
1548; AVX512DQ-NEXT:    vmovq %xmm7, (%r10)
1549; AVX512DQ-NEXT:    vmovq %xmm0, (%rax)
1550; AVX512DQ-NEXT:    vzeroupper
1551; AVX512DQ-NEXT:    retq
1552;
1553; AVX512DQ-FCP-LABEL: load_i8_stride7_vf8:
1554; AVX512DQ-FCP:       # %bb.0:
1555; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1556; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
1557; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm0
1558; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
1559; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
1560; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm1 ^ (ymm2 & (ymm0 ^ ymm1))
1561; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
1562; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u]
1563; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u]
1564; AVX512DQ-FCP-NEXT:    vpor %xmm3, %xmm2, %xmm2
1565; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15]
1566; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm4
1567; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u]
1568; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u]
1569; AVX512DQ-FCP-NEXT:    vpor %xmm4, %xmm3, %xmm3
1570; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535]
1571; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1))
1572; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm5
1573; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u]
1574; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u]
1575; AVX512DQ-FCP-NEXT:    vpor %xmm5, %xmm4, %xmm4
1576; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7]
1577; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm6
1578; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u]
1579; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u]
1580; AVX512DQ-FCP-NEXT:    vpor %xmm6, %xmm5, %xmm5
1581; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
1582; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm0 ^ (ymm6 & (ymm1 ^ ymm0))
1583; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm7
1584; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u]
1585; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u]
1586; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm6, %xmm6
1587; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15]
1588; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm8
1589; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u]
1590; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u]
1591; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm7
1592; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
1593; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
1594; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
1595; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u]
1596; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1597; AVX512DQ-FCP-NEXT:    vmovq %xmm2, (%rsi)
1598; AVX512DQ-FCP-NEXT:    vmovq %xmm3, (%rdx)
1599; AVX512DQ-FCP-NEXT:    vmovq %xmm4, (%rcx)
1600; AVX512DQ-FCP-NEXT:    vmovq %xmm5, (%r8)
1601; AVX512DQ-FCP-NEXT:    vmovq %xmm6, (%r9)
1602; AVX512DQ-FCP-NEXT:    vmovq %xmm7, (%r10)
1603; AVX512DQ-FCP-NEXT:    vmovq %xmm0, (%rax)
1604; AVX512DQ-FCP-NEXT:    vzeroupper
1605; AVX512DQ-FCP-NEXT:    retq
1606;
1607; AVX512BW-LABEL: load_i8_stride7_vf8:
1608; AVX512BW:       # %bb.0:
1609; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1610; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
1611; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm1
1612; AVX512BW-NEXT:    vmovdqa 32(%rdi), %ymm0
1613; AVX512BW-NEXT:    movw $290, %di # imm = 0x122
1614; AVX512BW-NEXT:    kmovd %edi, %k1
1615; AVX512BW-NEXT:    vpblendmw %ymm0, %ymm1, %ymm2 {%k1}
1616; AVX512BW-NEXT:    vextracti128 $1, %ymm2, %xmm3
1617; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u]
1618; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u]
1619; AVX512BW-NEXT:    vpor %xmm3, %xmm2, %xmm2
1620; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15]
1621; AVX512BW-NEXT:    vextracti128 $1, %ymm3, %xmm4
1622; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u]
1623; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u]
1624; AVX512BW-NEXT:    vpor %xmm4, %xmm3, %xmm3
1625; AVX512BW-NEXT:    movw $580, %di # imm = 0x244
1626; AVX512BW-NEXT:    kmovd %edi, %k1
1627; AVX512BW-NEXT:    vpblendmw %ymm0, %ymm1, %ymm4 {%k1}
1628; AVX512BW-NEXT:    vextracti128 $1, %ymm4, %xmm5
1629; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u]
1630; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u]
1631; AVX512BW-NEXT:    vpor %xmm5, %xmm4, %xmm4
1632; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7]
1633; AVX512BW-NEXT:    vextracti128 $1, %ymm5, %xmm6
1634; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u]
1635; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u]
1636; AVX512BW-NEXT:    vpor %xmm6, %xmm5, %xmm5
1637; AVX512BW-NEXT:    movw $4644, %di # imm = 0x1224
1638; AVX512BW-NEXT:    kmovd %edi, %k1
1639; AVX512BW-NEXT:    vpblendmw %ymm1, %ymm0, %ymm6 {%k1}
1640; AVX512BW-NEXT:    vextracti128 $1, %ymm6, %xmm7
1641; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u]
1642; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u]
1643; AVX512BW-NEXT:    vpor %xmm7, %xmm6, %xmm6
1644; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15]
1645; AVX512BW-NEXT:    vextracti128 $1, %ymm7, %xmm8
1646; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u]
1647; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u]
1648; AVX512BW-NEXT:    vpor %xmm7, %xmm8, %xmm7
1649; AVX512BW-NEXT:    movw $9288, %di # imm = 0x2448
1650; AVX512BW-NEXT:    kmovd %edi, %k1
1651; AVX512BW-NEXT:    vmovdqu16 %ymm1, %ymm0 {%k1}
1652; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
1653; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
1654; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u]
1655; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1656; AVX512BW-NEXT:    vmovq %xmm2, (%rsi)
1657; AVX512BW-NEXT:    vmovq %xmm3, (%rdx)
1658; AVX512BW-NEXT:    vmovq %xmm4, (%rcx)
1659; AVX512BW-NEXT:    vmovq %xmm5, (%r8)
1660; AVX512BW-NEXT:    vmovq %xmm6, (%r9)
1661; AVX512BW-NEXT:    vmovq %xmm7, (%r10)
1662; AVX512BW-NEXT:    vmovq %xmm0, (%rax)
1663; AVX512BW-NEXT:    vzeroupper
1664; AVX512BW-NEXT:    retq
1665;
1666; AVX512BW-FCP-LABEL: load_i8_stride7_vf8:
1667; AVX512BW-FCP:       # %bb.0:
1668; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1669; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
1670; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %ymm1
1671; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm0
1672; AVX512BW-FCP-NEXT:    movw $290, %di # imm = 0x122
1673; AVX512BW-FCP-NEXT:    kmovd %edi, %k1
1674; AVX512BW-FCP-NEXT:    vpblendmw %ymm0, %ymm1, %ymm2 {%k1}
1675; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
1676; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u]
1677; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u]
1678; AVX512BW-FCP-NEXT:    vpor %xmm3, %xmm2, %xmm2
1679; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15]
1680; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm4
1681; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u]
1682; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u]
1683; AVX512BW-FCP-NEXT:    vpor %xmm4, %xmm3, %xmm3
1684; AVX512BW-FCP-NEXT:    movw $580, %di # imm = 0x244
1685; AVX512BW-FCP-NEXT:    kmovd %edi, %k1
1686; AVX512BW-FCP-NEXT:    vpblendmw %ymm0, %ymm1, %ymm4 {%k1}
1687; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm5
1688; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u]
1689; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u]
1690; AVX512BW-FCP-NEXT:    vpor %xmm5, %xmm4, %xmm4
1691; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7]
1692; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm6
1693; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u]
1694; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u]
1695; AVX512BW-FCP-NEXT:    vpor %xmm6, %xmm5, %xmm5
1696; AVX512BW-FCP-NEXT:    movw $4644, %di # imm = 0x1224
1697; AVX512BW-FCP-NEXT:    kmovd %edi, %k1
1698; AVX512BW-FCP-NEXT:    vpblendmw %ymm1, %ymm0, %ymm6 {%k1}
1699; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm7
1700; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u]
1701; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u]
1702; AVX512BW-FCP-NEXT:    vpor %xmm7, %xmm6, %xmm6
1703; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15]
1704; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm8
1705; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u]
1706; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u]
1707; AVX512BW-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm7
1708; AVX512BW-FCP-NEXT:    movw $9288, %di # imm = 0x2448
1709; AVX512BW-FCP-NEXT:    kmovd %edi, %k1
1710; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm1, %ymm0 {%k1}
1711; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
1712; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
1713; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u]
1714; AVX512BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1715; AVX512BW-FCP-NEXT:    vmovq %xmm2, (%rsi)
1716; AVX512BW-FCP-NEXT:    vmovq %xmm3, (%rdx)
1717; AVX512BW-FCP-NEXT:    vmovq %xmm4, (%rcx)
1718; AVX512BW-FCP-NEXT:    vmovq %xmm5, (%r8)
1719; AVX512BW-FCP-NEXT:    vmovq %xmm6, (%r9)
1720; AVX512BW-FCP-NEXT:    vmovq %xmm7, (%r10)
1721; AVX512BW-FCP-NEXT:    vmovq %xmm0, (%rax)
1722; AVX512BW-FCP-NEXT:    vzeroupper
1723; AVX512BW-FCP-NEXT:    retq
1724;
1725; AVX512DQ-BW-LABEL: load_i8_stride7_vf8:
1726; AVX512DQ-BW:       # %bb.0:
1727; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1728; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
1729; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %ymm1
1730; AVX512DQ-BW-NEXT:    vmovdqa 32(%rdi), %ymm0
1731; AVX512DQ-BW-NEXT:    movw $290, %di # imm = 0x122
1732; AVX512DQ-BW-NEXT:    kmovd %edi, %k1
1733; AVX512DQ-BW-NEXT:    vpblendmw %ymm0, %ymm1, %ymm2 {%k1}
1734; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm2, %xmm3
1735; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u]
1736; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u]
1737; AVX512DQ-BW-NEXT:    vpor %xmm3, %xmm2, %xmm2
1738; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15]
1739; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm3, %xmm4
1740; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u]
1741; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u]
1742; AVX512DQ-BW-NEXT:    vpor %xmm4, %xmm3, %xmm3
1743; AVX512DQ-BW-NEXT:    movw $580, %di # imm = 0x244
1744; AVX512DQ-BW-NEXT:    kmovd %edi, %k1
1745; AVX512DQ-BW-NEXT:    vpblendmw %ymm0, %ymm1, %ymm4 {%k1}
1746; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm4, %xmm5
1747; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u]
1748; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u]
1749; AVX512DQ-BW-NEXT:    vpor %xmm5, %xmm4, %xmm4
1750; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7]
1751; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm5, %xmm6
1752; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u]
1753; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u]
1754; AVX512DQ-BW-NEXT:    vpor %xmm6, %xmm5, %xmm5
1755; AVX512DQ-BW-NEXT:    movw $4644, %di # imm = 0x1224
1756; AVX512DQ-BW-NEXT:    kmovd %edi, %k1
1757; AVX512DQ-BW-NEXT:    vpblendmw %ymm1, %ymm0, %ymm6 {%k1}
1758; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm6, %xmm7
1759; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u]
1760; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u]
1761; AVX512DQ-BW-NEXT:    vpor %xmm7, %xmm6, %xmm6
1762; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15]
1763; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm7, %xmm8
1764; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u]
1765; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u]
1766; AVX512DQ-BW-NEXT:    vpor %xmm7, %xmm8, %xmm7
1767; AVX512DQ-BW-NEXT:    movw $9288, %di # imm = 0x2448
1768; AVX512DQ-BW-NEXT:    kmovd %edi, %k1
1769; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm1, %ymm0 {%k1}
1770; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
1771; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
1772; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u]
1773; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1774; AVX512DQ-BW-NEXT:    vmovq %xmm2, (%rsi)
1775; AVX512DQ-BW-NEXT:    vmovq %xmm3, (%rdx)
1776; AVX512DQ-BW-NEXT:    vmovq %xmm4, (%rcx)
1777; AVX512DQ-BW-NEXT:    vmovq %xmm5, (%r8)
1778; AVX512DQ-BW-NEXT:    vmovq %xmm6, (%r9)
1779; AVX512DQ-BW-NEXT:    vmovq %xmm7, (%r10)
1780; AVX512DQ-BW-NEXT:    vmovq %xmm0, (%rax)
1781; AVX512DQ-BW-NEXT:    vzeroupper
1782; AVX512DQ-BW-NEXT:    retq
1783;
1784; AVX512DQ-BW-FCP-LABEL: load_i8_stride7_vf8:
1785; AVX512DQ-BW-FCP:       # %bb.0:
1786; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1787; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
1788; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %ymm1
1789; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm0
1790; AVX512DQ-BW-FCP-NEXT:    movw $290, %di # imm = 0x122
1791; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k1
1792; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm0, %ymm1, %ymm2 {%k1}
1793; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
1794; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u]
1795; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u]
1796; AVX512DQ-BW-FCP-NEXT:    vpor %xmm3, %xmm2, %xmm2
1797; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15]
1798; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm4
1799; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u]
1800; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u]
1801; AVX512DQ-BW-FCP-NEXT:    vpor %xmm4, %xmm3, %xmm3
1802; AVX512DQ-BW-FCP-NEXT:    movw $580, %di # imm = 0x244
1803; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k1
1804; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm0, %ymm1, %ymm4 {%k1}
1805; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm5
1806; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u]
1807; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u]
1808; AVX512DQ-BW-FCP-NEXT:    vpor %xmm5, %xmm4, %xmm4
1809; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7]
1810; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm6
1811; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u]
1812; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u]
1813; AVX512DQ-BW-FCP-NEXT:    vpor %xmm6, %xmm5, %xmm5
1814; AVX512DQ-BW-FCP-NEXT:    movw $4644, %di # imm = 0x1224
1815; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k1
1816; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm1, %ymm0, %ymm6 {%k1}
1817; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm7
1818; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u]
1819; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u]
1820; AVX512DQ-BW-FCP-NEXT:    vpor %xmm7, %xmm6, %xmm6
1821; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15]
1822; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm8
1823; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u]
1824; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u]
1825; AVX512DQ-BW-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm7
1826; AVX512DQ-BW-FCP-NEXT:    movw $9288, %di # imm = 0x2448
1827; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k1
1828; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm1, %ymm0 {%k1}
1829; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
1830; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
1831; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u]
1832; AVX512DQ-BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1833; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm2, (%rsi)
1834; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm3, (%rdx)
1835; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm4, (%rcx)
1836; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm5, (%r8)
1837; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm6, (%r9)
1838; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm7, (%r10)
1839; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm0, (%rax)
1840; AVX512DQ-BW-FCP-NEXT:    vzeroupper
1841; AVX512DQ-BW-FCP-NEXT:    retq
1842  %wide.vec = load <56 x i8>, ptr %in.vec, align 64
1843  %strided.vec0 = shufflevector <56 x i8> %wide.vec, <56 x i8> poison, <8 x i32> <i32 0, i32 7, i32 14, i32 21, i32 28, i32 35, i32 42, i32 49>
1844  %strided.vec1 = shufflevector <56 x i8> %wide.vec, <56 x i8> poison, <8 x i32> <i32 1, i32 8, i32 15, i32 22, i32 29, i32 36, i32 43, i32 50>
1845  %strided.vec2 = shufflevector <56 x i8> %wide.vec, <56 x i8> poison, <8 x i32> <i32 2, i32 9, i32 16, i32 23, i32 30, i32 37, i32 44, i32 51>
1846  %strided.vec3 = shufflevector <56 x i8> %wide.vec, <56 x i8> poison, <8 x i32> <i32 3, i32 10, i32 17, i32 24, i32 31, i32 38, i32 45, i32 52>
1847  %strided.vec4 = shufflevector <56 x i8> %wide.vec, <56 x i8> poison, <8 x i32> <i32 4, i32 11, i32 18, i32 25, i32 32, i32 39, i32 46, i32 53>
1848  %strided.vec5 = shufflevector <56 x i8> %wide.vec, <56 x i8> poison, <8 x i32> <i32 5, i32 12, i32 19, i32 26, i32 33, i32 40, i32 47, i32 54>
1849  %strided.vec6 = shufflevector <56 x i8> %wide.vec, <56 x i8> poison, <8 x i32> <i32 6, i32 13, i32 20, i32 27, i32 34, i32 41, i32 48, i32 55>
1850  store <8 x i8> %strided.vec0, ptr %out.vec0, align 64
1851  store <8 x i8> %strided.vec1, ptr %out.vec1, align 64
1852  store <8 x i8> %strided.vec2, ptr %out.vec2, align 64
1853  store <8 x i8> %strided.vec3, ptr %out.vec3, align 64
1854  store <8 x i8> %strided.vec4, ptr %out.vec4, align 64
1855  store <8 x i8> %strided.vec5, ptr %out.vec5, align 64
1856  store <8 x i8> %strided.vec6, ptr %out.vec6, align 64
1857  ret void
1858}
1859
1860define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
1861; SSE-LABEL: load_i8_stride7_vf16:
1862; SSE:       # %bb.0:
1863; SSE-NEXT:    subq $168, %rsp
1864; SSE-NEXT:    movdqa 96(%rdi), %xmm15
1865; SSE-NEXT:    movdqa 80(%rdi), %xmm4
1866; SSE-NEXT:    movdqa 64(%rdi), %xmm7
1867; SSE-NEXT:    movdqa (%rdi), %xmm6
1868; SSE-NEXT:    movdqa 16(%rdi), %xmm9
1869; SSE-NEXT:    movdqa 32(%rdi), %xmm12
1870; SSE-NEXT:    movdqa 48(%rdi), %xmm8
1871; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,0,65535,65535]
1872; SSE-NEXT:    movdqa %xmm2, %xmm0
1873; SSE-NEXT:    pandn %xmm12, %xmm0
1874; SSE-NEXT:    movdqa %xmm8, %xmm1
1875; SSE-NEXT:    pand %xmm2, %xmm1
1876; SSE-NEXT:    por %xmm0, %xmm1
1877; SSE-NEXT:    pxor %xmm13, %xmm13
1878; SSE-NEXT:    movdqa %xmm1, %xmm0
1879; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15]
1880; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3],xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7]
1881; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1882; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
1883; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
1884; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,6]
1885; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
1886; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
1887; SSE-NEXT:    packuswb %xmm0, %xmm1
1888; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255]
1889; SSE-NEXT:    movdqa {{.*#+}} xmm10 = [65535,65535,0,65535,65535,65535,0,65535]
1890; SSE-NEXT:    movdqa %xmm10, %xmm0
1891; SSE-NEXT:    pandn %xmm9, %xmm0
1892; SSE-NEXT:    movdqa %xmm6, %xmm3
1893; SSE-NEXT:    movdqa %xmm6, %xmm11
1894; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1895; SSE-NEXT:    pand %xmm10, %xmm3
1896; SSE-NEXT:    por %xmm0, %xmm3
1897; SSE-NEXT:    movdqa %xmm3, %xmm0
1898; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15]
1899; SSE-NEXT:    movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,0,65535,0,65535]
1900; SSE-NEXT:    movdqa %xmm14, %xmm6
1901; SSE-NEXT:    pandn %xmm0, %xmm6
1902; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3],xmm3[4],xmm13[4],xmm3[5],xmm13[5],xmm3[6],xmm13[6],xmm3[7],xmm13[7]
1903; SSE-NEXT:    pand %xmm14, %xmm3
1904; SSE-NEXT:    por %xmm6, %xmm3
1905; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,1,3]
1906; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
1907; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,1,1]
1908; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
1909; SSE-NEXT:    packuswb %xmm0, %xmm0
1910; SSE-NEXT:    pand %xmm2, %xmm0
1911; SSE-NEXT:    pandn %xmm1, %xmm2
1912; SSE-NEXT:    por %xmm2, %xmm0
1913; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,65535,65535,0,65535]
1914; SSE-NEXT:    movdqa %xmm3, %xmm1
1915; SSE-NEXT:    pandn %xmm7, %xmm1
1916; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1917; SSE-NEXT:    movdqa %xmm4, %xmm2
1918; SSE-NEXT:    movdqa %xmm4, %xmm5
1919; SSE-NEXT:    pand %xmm3, %xmm2
1920; SSE-NEXT:    movdqa %xmm3, %xmm13
1921; SSE-NEXT:    por %xmm1, %xmm2
1922; SSE-NEXT:    movdqa %xmm2, %xmm1
1923; SSE-NEXT:    pxor %xmm6, %xmm6
1924; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
1925; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7]
1926; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15]
1927; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
1928; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7]
1929; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
1930; SSE-NEXT:    movdqa %xmm15, %xmm2
1931; SSE-NEXT:    movdqa %xmm15, %xmm3
1932; SSE-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15]
1933; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1934; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
1935; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1936; SSE-NEXT:    pxor %xmm15, %xmm15
1937; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
1938; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
1939; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
1940; SSE-NEXT:    packuswb %xmm2, %xmm2
1941; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,65535,65535,0]
1942; SSE-NEXT:    movdqa %xmm4, %xmm3
1943; SSE-NEXT:    pandn %xmm2, %xmm3
1944; SSE-NEXT:    packuswb %xmm1, %xmm1
1945; SSE-NEXT:    pand %xmm4, %xmm1
1946; SSE-NEXT:    por %xmm1, %xmm3
1947; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,0,0]
1948; SSE-NEXT:    pand %xmm1, %xmm0
1949; SSE-NEXT:    pandn %xmm3, %xmm1
1950; SSE-NEXT:    por %xmm0, %xmm1
1951; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1952; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,0,65535,65535]
1953; SSE-NEXT:    movdqa %xmm2, %xmm0
1954; SSE-NEXT:    pandn %xmm12, %xmm0
1955; SSE-NEXT:    movdqa %xmm8, %xmm1
1956; SSE-NEXT:    pand %xmm2, %xmm1
1957; SSE-NEXT:    por %xmm0, %xmm1
1958; SSE-NEXT:    movdqa %xmm1, %xmm0
1959; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15]
1960; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7]
1961; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535]
1962; SSE-NEXT:    pand %xmm2, %xmm1
1963; SSE-NEXT:    pandn %xmm0, %xmm2
1964; SSE-NEXT:    por %xmm1, %xmm2
1965; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,1,2,1]
1966; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6]
1967; SSE-NEXT:    psrld $16, %xmm0
1968; SSE-NEXT:    packuswb %xmm0, %xmm1
1969; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255]
1970; SSE-NEXT:    movdqa %xmm4, %xmm0
1971; SSE-NEXT:    pandn %xmm1, %xmm0
1972; SSE-NEXT:    movdqa %xmm13, %xmm1
1973; SSE-NEXT:    pandn %xmm9, %xmm1
1974; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1975; SSE-NEXT:    movdqa %xmm11, %xmm2
1976; SSE-NEXT:    pand %xmm13, %xmm2
1977; SSE-NEXT:    movdqa %xmm13, %xmm11
1978; SSE-NEXT:    por %xmm1, %xmm2
1979; SSE-NEXT:    movdqa %xmm2, %xmm1
1980; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7]
1981; SSE-NEXT:    movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,65535,65535,0,65535]
1982; SSE-NEXT:    movdqa %xmm6, %xmm3
1983; SSE-NEXT:    pandn %xmm1, %xmm3
1984; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15]
1985; SSE-NEXT:    pand %xmm6, %xmm2
1986; SSE-NEXT:    por %xmm3, %xmm2
1987; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,3,2,3]
1988; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
1989; SSE-NEXT:    pshufhw {{.*#+}} xmm13 = xmm1[0,1,2,3,5,5,5,5]
1990; SSE-NEXT:    packuswb %xmm13, %xmm13
1991; SSE-NEXT:    pand %xmm4, %xmm13
1992; SSE-NEXT:    por %xmm0, %xmm13
1993; SSE-NEXT:    movdqa %xmm10, %xmm0
1994; SSE-NEXT:    pandn %xmm5, %xmm0
1995; SSE-NEXT:    movdqa %xmm5, %xmm6
1996; SSE-NEXT:    movdqa %xmm7, %xmm1
1997; SSE-NEXT:    pand %xmm10, %xmm1
1998; SSE-NEXT:    por %xmm0, %xmm1
1999; SSE-NEXT:    movdqa %xmm1, %xmm0
2000; SSE-NEXT:    pxor %xmm2, %xmm2
2001; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
2002; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
2003; SSE-NEXT:    pand %xmm14, %xmm1
2004; SSE-NEXT:    pandn %xmm0, %xmm14
2005; SSE-NEXT:    por %xmm1, %xmm14
2006; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2007; SSE-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2008; SSE-NEXT:    movdqa %xmm12, %xmm0
2009; SSE-NEXT:    pand %xmm10, %xmm0
2010; SSE-NEXT:    pandn %xmm8, %xmm10
2011; SSE-NEXT:    por %xmm0, %xmm10
2012; SSE-NEXT:    movdqa {{.*#+}} xmm14 = [65535,0,65535,65535,0,65535,65535,65535]
2013; SSE-NEXT:    movdqa %xmm9, %xmm7
2014; SSE-NEXT:    pand %xmm14, %xmm7
2015; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2016; SSE-NEXT:    movdqa %xmm5, %xmm15
2017; SSE-NEXT:    pand %xmm14, %xmm15
2018; SSE-NEXT:    movdqa %xmm11, %xmm3
2019; SSE-NEXT:    pandn %xmm8, %xmm3
2020; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2021; SSE-NEXT:    pshufd {{.*#+}} xmm11 = xmm8[0,2,2,3]
2022; SSE-NEXT:    movdqa %xmm8, %xmm4
2023; SSE-NEXT:    pand %xmm14, %xmm8
2024; SSE-NEXT:    movdqa %xmm14, %xmm9
2025; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2026; SSE-NEXT:    pandn %xmm12, %xmm14
2027; SSE-NEXT:    por %xmm8, %xmm14
2028; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2029; SSE-NEXT:    movdqa %xmm0, %xmm5
2030; SSE-NEXT:    pslld $16, %xmm5
2031; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
2032; SSE-NEXT:    movdqa %xmm8, %xmm3
2033; SSE-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
2034; SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
2035; SSE-NEXT:    movdqa %xmm8, %xmm1
2036; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2037; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2038; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2039; SSE-NEXT:    pxor %xmm1, %xmm1
2040; SSE-NEXT:    punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7]
2041; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,7,5,6,7]
2042; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0]
2043; SSE-NEXT:    pshufhw {{.*#+}} xmm10 = xmm2[0,1,2,3,6,4,6,5]
2044; SSE-NEXT:    movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,65535,0,65535]
2045; SSE-NEXT:    pand %xmm12, %xmm10
2046; SSE-NEXT:    movdqa %xmm8, %xmm2
2047; SSE-NEXT:    pand %xmm12, %xmm2
2048; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2049; SSE-NEXT:    movdqa %xmm12, (%rsp) # 16-byte Spill
2050; SSE-NEXT:    pandn %xmm0, %xmm12
2051; SSE-NEXT:    movdqa %xmm0, %xmm2
2052; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2053; SSE-NEXT:    punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3],xmm14[4],xmm1[4],xmm14[5],xmm1[5],xmm14[6],xmm1[6],xmm14[7],xmm1[7]
2054; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535]
2055; SSE-NEXT:    pand %xmm0, %xmm14
2056; SSE-NEXT:    pand %xmm0, %xmm2
2057; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2058; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2059; SSE-NEXT:    pandn %xmm8, %xmm0
2060; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2061; SSE-NEXT:    punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
2062; SSE-NEXT:    packuswb %xmm8, %xmm5
2063; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,65535,0]
2064; SSE-NEXT:    movdqa %xmm0, %xmm8
2065; SSE-NEXT:    pandn %xmm5, %xmm8
2066; SSE-NEXT:    pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
2067; SSE-NEXT:    # xmm5 = mem[0,3,2,3]
2068; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[0,0,3,2,4,5,6,7]
2069; SSE-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,4,6,7]
2070; SSE-NEXT:    packuswb %xmm5, %xmm5
2071; SSE-NEXT:    pand %xmm0, %xmm5
2072; SSE-NEXT:    por %xmm5, %xmm8
2073; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0]
2074; SSE-NEXT:    movdqa %xmm5, %xmm0
2075; SSE-NEXT:    pandn %xmm8, %xmm0
2076; SSE-NEXT:    pand %xmm5, %xmm13
2077; SSE-NEXT:    por %xmm13, %xmm0
2078; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2079; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,0,65535]
2080; SSE-NEXT:    movdqa %xmm2, %xmm8
2081; SSE-NEXT:    pandn %xmm6, %xmm8
2082; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2083; SSE-NEXT:    pand %xmm2, %xmm0
2084; SSE-NEXT:    por %xmm8, %xmm0
2085; SSE-NEXT:    movdqa %xmm0, %xmm8
2086; SSE-NEXT:    pxor %xmm6, %xmm6
2087; SSE-NEXT:    punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
2088; SSE-NEXT:    movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,65535,65535,0,65535]
2089; SSE-NEXT:    movdqa %xmm13, %xmm1
2090; SSE-NEXT:    pandn %xmm8, %xmm1
2091; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15]
2092; SSE-NEXT:    pxor %xmm8, %xmm8
2093; SSE-NEXT:    pand %xmm13, %xmm0
2094; SSE-NEXT:    por %xmm1, %xmm0
2095; SSE-NEXT:    packuswb %xmm3, %xmm1
2096; SSE-NEXT:    movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,65535,65535,0]
2097; SSE-NEXT:    movdqa %xmm6, %xmm3
2098; SSE-NEXT:    pandn %xmm1, %xmm3
2099; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
2100; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7]
2101; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
2102; SSE-NEXT:    packuswb %xmm0, %xmm0
2103; SSE-NEXT:    pand %xmm6, %xmm0
2104; SSE-NEXT:    por %xmm0, %xmm3
2105; SSE-NEXT:    movdqa %xmm5, %xmm0
2106; SSE-NEXT:    pandn %xmm3, %xmm0
2107; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
2108; SSE-NEXT:    pandn %xmm6, %xmm9
2109; SSE-NEXT:    por %xmm9, %xmm7
2110; SSE-NEXT:    movdqa %xmm7, %xmm1
2111; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15]
2112; SSE-NEXT:    movdqa %xmm13, %xmm3
2113; SSE-NEXT:    pandn %xmm1, %xmm3
2114; SSE-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
2115; SSE-NEXT:    pand %xmm13, %xmm7
2116; SSE-NEXT:    por %xmm3, %xmm7
2117; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
2118; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm9[1,3,2,3]
2119; SSE-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1]
2120; SSE-NEXT:    movdqa %xmm11, %xmm1
2121; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15]
2122; SSE-NEXT:    punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7]
2123; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,0,65535,65535,65535]
2124; SSE-NEXT:    pand %xmm3, %xmm11
2125; SSE-NEXT:    pandn %xmm1, %xmm3
2126; SSE-NEXT:    por %xmm11, %xmm3
2127; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
2128; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
2129; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
2130; SSE-NEXT:    packuswb %xmm1, %xmm3
2131; SSE-NEXT:    movdqa {{.*#+}} xmm13 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255]
2132; SSE-NEXT:    movdqa %xmm13, %xmm8
2133; SSE-NEXT:    pandn %xmm3, %xmm8
2134; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm7[0,2,1,3]
2135; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
2136; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,3,3]
2137; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7]
2138; SSE-NEXT:    packuswb %xmm1, %xmm1
2139; SSE-NEXT:    pand %xmm13, %xmm1
2140; SSE-NEXT:    por %xmm1, %xmm8
2141; SSE-NEXT:    pand %xmm5, %xmm8
2142; SSE-NEXT:    por %xmm0, %xmm8
2143; SSE-NEXT:    movdqa %xmm2, %xmm0
2144; SSE-NEXT:    pandn %xmm9, %xmm0
2145; SSE-NEXT:    pand %xmm2, %xmm4
2146; SSE-NEXT:    por %xmm0, %xmm4
2147; SSE-NEXT:    movdqa %xmm4, %xmm0
2148; SSE-NEXT:    pxor %xmm1, %xmm1
2149; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
2150; SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
2151; SSE-NEXT:    pxor %xmm2, %xmm2
2152; SSE-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
2153; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm4[0,3,2,3,4,5,6,7]
2154; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
2155; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,7,6]
2156; SSE-NEXT:    psrlq $48, %xmm0
2157; SSE-NEXT:    packuswb %xmm0, %xmm3
2158; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,0,65535,65535]
2159; SSE-NEXT:    movdqa %xmm1, %xmm0
2160; SSE-NEXT:    movdqa %xmm6, %xmm7
2161; SSE-NEXT:    pandn %xmm6, %xmm0
2162; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
2163; SSE-NEXT:    movdqa %xmm9, %xmm4
2164; SSE-NEXT:    pand %xmm1, %xmm4
2165; SSE-NEXT:    por %xmm0, %xmm4
2166; SSE-NEXT:    movdqa %xmm4, %xmm0
2167; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
2168; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,65535,65535,65535]
2169; SSE-NEXT:    movdqa %xmm1, %xmm6
2170; SSE-NEXT:    pandn %xmm0, %xmm6
2171; SSE-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
2172; SSE-NEXT:    pand %xmm1, %xmm4
2173; SSE-NEXT:    por %xmm6, %xmm4
2174; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm4[3,2,1,0,4,5,6,7]
2175; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,7,7,7,7]
2176; SSE-NEXT:    packuswb %xmm4, %xmm4
2177; SSE-NEXT:    pand %xmm13, %xmm4
2178; SSE-NEXT:    pandn %xmm3, %xmm13
2179; SSE-NEXT:    por %xmm13, %xmm4
2180; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
2181; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2182; SSE-NEXT:    pandn %xmm6, %xmm0
2183; SSE-NEXT:    por %xmm0, %xmm15
2184; SSE-NEXT:    movdqa %xmm15, %xmm0
2185; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
2186; SSE-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1],xmm15[2],xmm2[2],xmm15[3],xmm2[3],xmm15[4],xmm2[4],xmm15[5],xmm2[5],xmm15[6],xmm2[6],xmm15[7],xmm2[7]
2187; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,0,65535]
2188; SSE-NEXT:    pand %xmm3, %xmm15
2189; SSE-NEXT:    pandn %xmm0, %xmm3
2190; SSE-NEXT:    por %xmm15, %xmm3
2191; SSE-NEXT:    movdqa %xmm3, %xmm11
2192; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
2193; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm13[2,1,2,3,4,5,6,7]
2194; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2195; SSE-NEXT:    packuswb %xmm0, %xmm0
2196; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,65535,65535,0]
2197; SSE-NEXT:    movdqa %xmm2, %xmm3
2198; SSE-NEXT:    pandn %xmm0, %xmm3
2199; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm11[0,2,1,0,4,5,6,7]
2200; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
2201; SSE-NEXT:    packuswb %xmm0, %xmm0
2202; SSE-NEXT:    pand %xmm2, %xmm0
2203; SSE-NEXT:    por %xmm0, %xmm3
2204; SSE-NEXT:    movdqa %xmm5, %xmm15
2205; SSE-NEXT:    pandn %xmm3, %xmm15
2206; SSE-NEXT:    pand %xmm5, %xmm4
2207; SSE-NEXT:    por %xmm4, %xmm15
2208; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,65535,0,65535,65535]
2209; SSE-NEXT:    movdqa %xmm0, %xmm3
2210; SSE-NEXT:    pandn %xmm7, %xmm3
2211; SSE-NEXT:    movdqa %xmm9, %xmm4
2212; SSE-NEXT:    pand %xmm0, %xmm4
2213; SSE-NEXT:    por %xmm3, %xmm4
2214; SSE-NEXT:    movdqa %xmm4, %xmm3
2215; SSE-NEXT:    pxor %xmm0, %xmm0
2216; SSE-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
2217; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
2218; SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
2219; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
2220; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
2221; SSE-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
2222; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2223; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
2224; SSE-NEXT:    movdqa (%rsp), %xmm3 # 16-byte Reload
2225; SSE-NEXT:    pandn %xmm2, %xmm3
2226; SSE-NEXT:    por %xmm3, %xmm10
2227; SSE-NEXT:    packuswb %xmm2, %xmm10
2228; SSE-NEXT:    packuswb %xmm4, %xmm4
2229; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm10[0,1,3,3]
2230; SSE-NEXT:    movss {{.*#+}} xmm2 = xmm4[0],xmm2[1,2,3]
2231; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
2232; SSE-NEXT:    movdqa %xmm7, %xmm3
2233; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,65535,0,65535,65535]
2234; SSE-NEXT:    pand %xmm4, %xmm3
2235; SSE-NEXT:    pandn %xmm6, %xmm4
2236; SSE-NEXT:    movdqa %xmm6, %xmm11
2237; SSE-NEXT:    por %xmm3, %xmm4
2238; SSE-NEXT:    movdqa %xmm4, %xmm3
2239; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
2240; SSE-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
2241; SSE-NEXT:    pxor %xmm10, %xmm10
2242; SSE-NEXT:    pand %xmm1, %xmm4
2243; SSE-NEXT:    pandn %xmm3, %xmm1
2244; SSE-NEXT:    por %xmm4, %xmm1
2245; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
2246; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
2247; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
2248; SSE-NEXT:    packuswb %xmm1, %xmm1
2249; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,65535,0]
2250; SSE-NEXT:    pand %xmm0, %xmm1
2251; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm13[0,1,2,1]
2252; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
2253; SSE-NEXT:    packuswb %xmm3, %xmm3
2254; SSE-NEXT:    pandn %xmm3, %xmm0
2255; SSE-NEXT:    por %xmm1, %xmm0
2256; SSE-NEXT:    movdqa %xmm5, %xmm1
2257; SSE-NEXT:    pandn %xmm0, %xmm1
2258; SSE-NEXT:    andps %xmm5, %xmm2
2259; SSE-NEXT:    por %xmm2, %xmm1
2260; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2261; SSE-NEXT:    movdqa {{.*#+}} xmm13 = [65535,65535,65535,0,65535,65535,0,65535]
2262; SSE-NEXT:    pand %xmm13, %xmm2
2263; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
2264; SSE-NEXT:    movdqa %xmm2, %xmm3
2265; SSE-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15]
2266; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
2267; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [0,65535,65535,65535,65535,65535,65535,0]
2268; SSE-NEXT:    pand %xmm4, %xmm2
2269; SSE-NEXT:    pandn %xmm3, %xmm4
2270; SSE-NEXT:    por %xmm2, %xmm4
2271; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[0,1,0,3]
2272; SSE-NEXT:    pshufhw {{.*#+}} xmm6 = xmm2[0,1,2,3,5,4,7,6]
2273; SSE-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2274; SSE-NEXT:    packuswb %xmm3, %xmm6
2275; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm9[0,2,2,3]
2276; SSE-NEXT:    pand %xmm13, %xmm9
2277; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
2278; SSE-NEXT:    pandn %xmm3, %xmm13
2279; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
2280; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
2281; SSE-NEXT:    movdqa %xmm2, %xmm3
2282; SSE-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15]
2283; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
2284; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
2285; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
2286; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
2287; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
2288; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
2289; SSE-NEXT:    packuswb %xmm2, %xmm2
2290; SSE-NEXT:    movss {{.*#+}} xmm6 = xmm2[0],xmm6[1,2,3]
2291; SSE-NEXT:    movdqa %xmm7, %xmm2
2292; SSE-NEXT:    movdqa %xmm7, %xmm0
2293; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,0,65535,65535]
2294; SSE-NEXT:    pand %xmm3, %xmm2
2295; SSE-NEXT:    pandn %xmm11, %xmm3
2296; SSE-NEXT:    por %xmm2, %xmm3
2297; SSE-NEXT:    movdqa %xmm3, %xmm2
2298; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
2299; SSE-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15]
2300; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,0,65535,65535,65535]
2301; SSE-NEXT:    pand %xmm4, %xmm3
2302; SSE-NEXT:    pandn %xmm2, %xmm4
2303; SSE-NEXT:    por %xmm3, %xmm4
2304; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
2305; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm12[0,1,0,3]
2306; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6]
2307; SSE-NEXT:    packuswb %xmm2, %xmm2
2308; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0]
2309; SSE-NEXT:    movdqa %xmm3, %xmm7
2310; SSE-NEXT:    pandn %xmm2, %xmm7
2311; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[2,1,0,3]
2312; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,3,2,4,5,6,7]
2313; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
2314; SSE-NEXT:    packuswb %xmm2, %xmm2
2315; SSE-NEXT:    pand %xmm3, %xmm2
2316; SSE-NEXT:    por %xmm2, %xmm7
2317; SSE-NEXT:    movdqa %xmm5, %xmm2
2318; SSE-NEXT:    pandn %xmm7, %xmm2
2319; SSE-NEXT:    andps %xmm5, %xmm6
2320; SSE-NEXT:    por %xmm6, %xmm2
2321; SSE-NEXT:    movdqa %xmm13, %xmm7
2322; SSE-NEXT:    por %xmm9, %xmm7
2323; SSE-NEXT:    movdqa %xmm7, %xmm4
2324; SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7]
2325; SSE-NEXT:    pshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,6,4,6,7]
2326; SSE-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm10[8],xmm7[9],xmm10[9],xmm7[10],xmm10[10],xmm7[11],xmm10[11],xmm7[12],xmm10[12],xmm7[13],xmm10[13],xmm7[14],xmm10[14],xmm7[15],xmm10[15]
2327; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[0,1,2,1]
2328; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7]
2329; SSE-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
2330; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2331; SSE-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15]
2332; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
2333; SSE-NEXT:    pandn %xmm4, %xmm9
2334; SSE-NEXT:    movdqa %xmm4, %xmm7
2335; SSE-NEXT:    por %xmm9, %xmm14
2336; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm14[0,2,1,3]
2337; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7]
2338; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0]
2339; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5]
2340; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[3,3,3,3]
2341; SSE-NEXT:    packuswb %xmm7, %xmm4
2342; SSE-NEXT:    packuswb %xmm6, %xmm6
2343; SSE-NEXT:    movss {{.*#+}} xmm4 = xmm6[0],xmm4[1,2,3]
2344; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
2345; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
2346; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm11[1,3,2,3]
2347; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[0,2,2,3]
2348; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
2349; SSE-NEXT:    movdqa %xmm7, %xmm6
2350; SSE-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7]
2351; SSE-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm10[8],xmm7[9],xmm10[9],xmm7[10],xmm10[10],xmm7[11],xmm10[11],xmm7[12],xmm10[12],xmm7[13],xmm10[13],xmm7[14],xmm10[14],xmm7[15],xmm10[15]
2352; SSE-NEXT:    movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,65535,0,65535,65535]
2353; SSE-NEXT:    pand %xmm9, %xmm7
2354; SSE-NEXT:    pandn %xmm6, %xmm9
2355; SSE-NEXT:    por %xmm7, %xmm9
2356; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm9[2,1,1,1]
2357; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7]
2358; SSE-NEXT:    packuswb %xmm6, %xmm6
2359; SSE-NEXT:    pand %xmm3, %xmm6
2360; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm12[0,1,0,3]
2361; SSE-NEXT:    pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7]
2362; SSE-NEXT:    packuswb %xmm7, %xmm7
2363; SSE-NEXT:    pandn %xmm7, %xmm3
2364; SSE-NEXT:    por %xmm3, %xmm6
2365; SSE-NEXT:    andps %xmm5, %xmm4
2366; SSE-NEXT:    pandn %xmm6, %xmm5
2367; SSE-NEXT:    por %xmm4, %xmm5
2368; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
2369; SSE-NEXT:    movaps %xmm3, (%rsi)
2370; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2371; SSE-NEXT:    movaps %xmm0, (%rdx)
2372; SSE-NEXT:    movdqa %xmm8, (%rcx)
2373; SSE-NEXT:    movdqa %xmm15, (%r8)
2374; SSE-NEXT:    movdqa %xmm1, (%r9)
2375; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2376; SSE-NEXT:    movdqa %xmm2, (%rax)
2377; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2378; SSE-NEXT:    movdqa %xmm5, (%rax)
2379; SSE-NEXT:    addq $168, %rsp
2380; SSE-NEXT:    retq
2381;
2382; AVX-LABEL: load_i8_stride7_vf16:
2383; AVX:       # %bb.0:
2384; AVX-NEXT:    vmovdqa (%rdi), %xmm2
2385; AVX-NEXT:    vmovdqa 16(%rdi), %xmm7
2386; AVX-NEXT:    vmovdqa 32(%rdi), %xmm3
2387; AVX-NEXT:    vmovdqa 48(%rdi), %xmm4
2388; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm7[5,12,u,u,u,u,u,u,u,u,u,u,u]
2389; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u,u,u,u,u]
2390; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
2391; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm3[u,u,u,u,u,3,10],zero,zero,zero,xmm3[u,u,u,u,u,u]
2392; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u],zero,zero,xmm4[1,8,15,u,u,u,u,u,u]
2393; AVX-NEXT:    vpor %xmm1, %xmm5, %xmm1
2394; AVX-NEXT:    vmovq {{.*#+}} xmm5 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
2395; AVX-NEXT:    vpblendvb %xmm5, %xmm0, %xmm1, %xmm0
2396; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2397; AVX-NEXT:    vmovdqa 80(%rdi), %xmm1
2398; AVX-NEXT:    vmovdqa 64(%rdi), %xmm5
2399; AVX-NEXT:    vmovdqa 96(%rdi), %xmm6
2400; AVX-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm7[6,13,u,u,u,u,u,u,u,u,u,u,u]
2401; AVX-NEXT:    vpshufb {{.*#+}} xmm9 = xmm2[1,8,15],zero,zero,xmm2[u,u,u,u,u,u,u,u,u,u,u]
2402; AVX-NEXT:    vpor %xmm8, %xmm9, %xmm8
2403; AVX-NEXT:    vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u],zero,zero,xmm4[2,9,u,u,u,u,u,u,u]
2404; AVX-NEXT:    vpshufb {{.*#+}} xmm10 = xmm3[u,u,u,u,u,4,11],zero,zero,xmm3[u,u,u,u,u,u,u]
2405; AVX-NEXT:    vpor %xmm9, %xmm10, %xmm9
2406; AVX-NEXT:    vmovq {{.*#+}} xmm11 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
2407; AVX-NEXT:    vpblendvb %xmm11, %xmm8, %xmm9, %xmm8
2408; AVX-NEXT:    vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,5,12,u,u,u,u,u,u,u,u,u,u]
2409; AVX-NEXT:    vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,u,u,0,7,14,u,u,u,u,u,u,u,u]
2410; AVX-NEXT:    vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
2411; AVX-NEXT:    vpxor %xmm12, %xmm12, %xmm12
2412; AVX-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,6],xmm12[7]
2413; AVX-NEXT:    vpshufb {{.*#+}} xmm10 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm6[3,10]
2414; AVX-NEXT:    vpor %xmm10, %xmm9, %xmm10
2415; AVX-NEXT:    vpmovsxwq {{.*#+}} xmm9 = [18446744073709551615,255]
2416; AVX-NEXT:    vpblendvb %xmm9, %xmm8, %xmm10, %xmm0
2417; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2418; AVX-NEXT:    vpshufb {{.*#+}} xmm10 = xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u,u,u,u,u]
2419; AVX-NEXT:    vpshufb {{.*#+}} xmm13 = zero,zero,xmm7[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
2420; AVX-NEXT:    vpor %xmm10, %xmm13, %xmm10
2421; AVX-NEXT:    vpshufb {{.*#+}} xmm13 = xmm4[u,u,u,u,u],zero,zero,xmm4[3,10,u,u,u,u,u,u,u]
2422; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,5,12],zero,zero,xmm3[u,u,u,u,u,u,u]
2423; AVX-NEXT:    vpor %xmm13, %xmm14, %xmm13
2424; AVX-NEXT:    vpblendvb %xmm11, %xmm10, %xmm13, %xmm10
2425; AVX-NEXT:    vpshufb {{.*#+}} xmm13 = xmm1[u,u,u,u,u,u,u,u,u,u,u,u,6,13,u,u]
2426; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15]
2427; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm13 = xmm14[2],xmm13[2],xmm14[3],xmm13[3]
2428; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,6],xmm12[7]
2429; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm6[4,11]
2430; AVX-NEXT:    vpor %xmm14, %xmm13, %xmm13
2431; AVX-NEXT:    vpblendvb %xmm9, %xmm10, %xmm13, %xmm10
2432; AVX-NEXT:    vpshufb {{.*#+}} xmm13 = xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u,u,u,u,u,u,u,u]
2433; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,xmm7[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
2434; AVX-NEXT:    vpor %xmm13, %xmm14, %xmm13
2435; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u],zero,zero,xmm4[4,11,u,u,u,u,u,u,u]
2436; AVX-NEXT:    vpshufb {{.*#+}} xmm15 = xmm3[u,u,u,u,u,6,13],zero,zero,xmm3[u,u,u,u,u,u,u]
2437; AVX-NEXT:    vpor %xmm14, %xmm15, %xmm14
2438; AVX-NEXT:    vpblendvb %xmm11, %xmm13, %xmm14, %xmm11
2439; AVX-NEXT:    vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm5[u,u]
2440; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = xmm1[u,u,u,u,u,u,u,u,u],zero,zero,xmm1[0,7,14,u,u]
2441; AVX-NEXT:    vpor %xmm13, %xmm14, %xmm13
2442; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,6],xmm12[7]
2443; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm6[5,12]
2444; AVX-NEXT:    vpor %xmm14, %xmm13, %xmm13
2445; AVX-NEXT:    vpblendvb %xmm9, %xmm11, %xmm13, %xmm11
2446; AVX-NEXT:    vmovd {{.*#+}} xmm13 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
2447; AVX-NEXT:    vpshufb %xmm13, %xmm2, %xmm14
2448; AVX-NEXT:    vpshufb {{.*#+}} xmm15 = xmm7[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
2449; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
2450; AVX-NEXT:    vpshufb {{.*#+}} xmm15 = xmm4[u,u,u,u],zero,zero,zero,xmm4[5,12,u,u,u,u,u,u,u]
2451; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm3[u,u,u,u,0,7,14],zero,zero,xmm3[u,u,u,u,u,u,u]
2452; AVX-NEXT:    vpor %xmm0, %xmm15, %xmm0
2453; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3,4,5,6,7]
2454; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm5[u,u]
2455; AVX-NEXT:    vpshufb {{.*#+}} xmm15 = xmm1[u,u,u,u,u,u,u,u,u],zero,zero,xmm1[1,8,15,u,u]
2456; AVX-NEXT:    vpor %xmm14, %xmm15, %xmm14
2457; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm14[0,1,2,3,4,5,6],xmm12[7]
2458; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm6[6,13]
2459; AVX-NEXT:    vpor %xmm14, %xmm12, %xmm12
2460; AVX-NEXT:    vpblendvb %xmm9, %xmm0, %xmm12, %xmm12
2461; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm7[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
2462; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = xmm2[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
2463; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
2464; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u],zero,zero,zero,xmm4[6,13,u,u,u,u,u,u,u]
2465; AVX-NEXT:    vpshufb {{.*#+}} xmm15 = xmm3[u,u,u,u,1,8,15],zero,zero,xmm3[u,u,u,u,u,u,u]
2466; AVX-NEXT:    vpor %xmm14, %xmm15, %xmm14
2467; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm14[2,3,4,5,6,7]
2468; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = xmm1[u,u,u,u,u,u,u,u,u],zero,zero,xmm1[2,9,u,u,u]
2469; AVX-NEXT:    vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm5[u,u,u]
2470; AVX-NEXT:    vpor %xmm14, %xmm15, %xmm14
2471; AVX-NEXT:    vmovddup {{.*#+}} xmm15 = [0,9,10,11,12,128,128,128,0,9,10,11,12,128,128,128]
2472; AVX-NEXT:    # xmm15 = mem[0,0]
2473; AVX-NEXT:    vpshufb %xmm15, %xmm14, %xmm14
2474; AVX-NEXT:    vpshufb {{.*#+}} xmm8 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm6[0,7,14]
2475; AVX-NEXT:    vpor %xmm8, %xmm14, %xmm8
2476; AVX-NEXT:    vpblendvb %xmm9, %xmm0, %xmm8, %xmm0
2477; AVX-NEXT:    vpshufb %xmm13, %xmm7, %xmm7
2478; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
2479; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3]
2480; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u]
2481; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u]
2482; AVX-NEXT:    vpor %xmm3, %xmm4, %xmm3
2483; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3,4,5,6,7]
2484; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,u,u],zero,zero,xmm1[3,10,u,u,u]
2485; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm5[u,u,u]
2486; AVX-NEXT:    vpor %xmm3, %xmm4, %xmm3
2487; AVX-NEXT:    vpshufb %xmm15, %xmm3, %xmm3
2488; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm6[1,8,15]
2489; AVX-NEXT:    vpor %xmm4, %xmm3, %xmm3
2490; AVX-NEXT:    vpblendvb %xmm9, %xmm2, %xmm3, %xmm2
2491; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,4,11,u,u,u,u,u,u,u,u,u,u]
2492; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
2493; AVX-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
2494; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7,8,9]
2495; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,0,1,2,3,8,15]
2496; AVX-NEXT:    vpblendw $31, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
2497; AVX-NEXT:    # xmm1 = mem[0,1,2,3,4],xmm1[5,6,7]
2498; AVX-NEXT:    vmovdqa %xmm1, (%rsi)
2499; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2500; AVX-NEXT:    vmovaps %xmm1, (%rdx)
2501; AVX-NEXT:    vmovdqa %xmm10, (%rcx)
2502; AVX-NEXT:    vmovdqa %xmm11, (%r8)
2503; AVX-NEXT:    vmovdqa %xmm12, (%r9)
2504; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2505; AVX-NEXT:    vmovdqa %xmm0, (%rax)
2506; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2507; AVX-NEXT:    vmovdqa %xmm2, (%rax)
2508; AVX-NEXT:    retq
2509;
2510; AVX2-LABEL: load_i8_stride7_vf16:
2511; AVX2:       # %bb.0:
2512; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2513; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
2514; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
2515; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
2516; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
2517; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm2
2518; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
2519; AVX2-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u,u,u,u,u]
2520; AVX2-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u,u,u]
2521; AVX2-NEXT:    vpor %xmm3, %xmm2, %xmm3
2522; AVX2-NEXT:    vmovdqa 96(%rdi), %xmm9
2523; AVX2-NEXT:    vmovdqa 64(%rdi), %xmm10
2524; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm9[0],xmm10[1],xmm9[2],xmm10[3]
2525; AVX2-NEXT:    vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9]
2526; AVX2-NEXT:    vmovdqa 80(%rdi), %xmm2
2527; AVX2-NEXT:    vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero
2528; AVX2-NEXT:    vpor %xmm5, %xmm4, %xmm4
2529; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7]
2530; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm4 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
2531; AVX2-NEXT:    vpblendvb %ymm4, %ymm0, %ymm1, %ymm4
2532; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm5
2533; AVX2-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u,u,u]
2534; AVX2-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[1,8,15],zero,zero,xmm4[4,11],zero,zero,xmm4[u,u,u,u,u,u,u]
2535; AVX2-NEXT:    vpor %xmm5, %xmm4, %xmm6
2536; AVX2-NEXT:    vmovdqa 96(%rdi), %xmm4
2537; AVX2-NEXT:    vmovdqa 64(%rdi), %xmm5
2538; AVX2-NEXT:    vpblendw {{.*#+}} xmm7 = xmm5[0],xmm4[1],xmm5[2,3,4],xmm4[5],xmm5[6,7]
2539; AVX2-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10]
2540; AVX2-NEXT:    vpshufb {{.*#+}} xmm8 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,12],zero,zero
2541; AVX2-NEXT:    vpor %xmm7, %xmm8, %xmm8
2542; AVX2-NEXT:    vpmovsxwq {{.*#+}} xmm7 = [18446744073709551615,255]
2543; AVX2-NEXT:    vpblendvb %xmm7, %xmm6, %xmm8, %xmm6
2544; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
2545; AVX2-NEXT:    vpblendvb %ymm8, %ymm0, %ymm1, %ymm8
2546; AVX2-NEXT:    vpshufb {{.*#+}} xmm11 = xmm8[2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u,u,u]
2547; AVX2-NEXT:    vextracti128 $1, %ymm8, %xmm8
2548; AVX2-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u,u,u,u,u]
2549; AVX2-NEXT:    vpor %xmm11, %xmm8, %xmm8
2550; AVX2-NEXT:    vpblendw {{.*#+}} xmm11 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
2551; AVX2-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm11[4,11]
2552; AVX2-NEXT:    vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[6,13],zero,zero
2553; AVX2-NEXT:    vpor %xmm12, %xmm11, %xmm11
2554; AVX2-NEXT:    vpblendvb %xmm7, %xmm8, %xmm11, %xmm8
2555; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535]
2556; AVX2-NEXT:    vpblendvb %ymm11, %ymm0, %ymm1, %ymm11
2557; AVX2-NEXT:    vpshufb {{.*#+}} xmm12 = xmm11[3,10],zero,zero,zero,xmm11[6,13],zero,zero,xmm11[u,u,u,u,u,u,u]
2558; AVX2-NEXT:    vextracti128 $1, %ymm11, %xmm11
2559; AVX2-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[1,8,15],zero,zero,xmm11[4,11,u,u,u,u,u,u,u]
2560; AVX2-NEXT:    vpor %xmm12, %xmm11, %xmm11
2561; AVX2-NEXT:    vpblendd {{.*#+}} xmm10 = xmm10[0],xmm9[1],xmm10[2],xmm9[3]
2562; AVX2-NEXT:    vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm10[5,12]
2563; AVX2-NEXT:    vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,7,14],zero,zero
2564; AVX2-NEXT:    vpor %xmm12, %xmm9, %xmm9
2565; AVX2-NEXT:    vpblendvb %xmm7, %xmm11, %xmm9, %xmm9
2566; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
2567; AVX2-NEXT:    vpblendvb %ymm11, %ymm1, %ymm0, %ymm11
2568; AVX2-NEXT:    vextracti128 $1, %ymm11, %xmm12
2569; AVX2-NEXT:    vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u]
2570; AVX2-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[4,11],zero,zero,xmm11[0,7,14],zero,zero,xmm11[u,u,u,u,u,u,u]
2571; AVX2-NEXT:    vpor %xmm12, %xmm11, %xmm11
2572; AVX2-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm10[6,13]
2573; AVX2-NEXT:    vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,8,15],zero,zero
2574; AVX2-NEXT:    vpor %xmm12, %xmm10, %xmm10
2575; AVX2-NEXT:    vpblendvb %xmm7, %xmm11, %xmm10, %xmm10
2576; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
2577; AVX2-NEXT:    vpblendvb %ymm11, %ymm1, %ymm0, %ymm11
2578; AVX2-NEXT:    vextracti128 $1, %ymm11, %xmm12
2579; AVX2-NEXT:    vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u]
2580; AVX2-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[5,12],zero,zero,xmm11[1,8,15],zero,zero,xmm11[u,u,u,u,u,u,u]
2581; AVX2-NEXT:    vpor %xmm12, %xmm11, %xmm11
2582; AVX2-NEXT:    vpblendw {{.*#+}} xmm12 = xmm4[0],xmm5[1,2],xmm4[3],xmm5[4,5,6],xmm4[7]
2583; AVX2-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm12[0,7,14]
2584; AVX2-NEXT:    vpshufb {{.*#+}} xmm13 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[2,9],zero,zero,zero
2585; AVX2-NEXT:    vpor %xmm13, %xmm12, %xmm12
2586; AVX2-NEXT:    vpblendvb %xmm7, %xmm11, %xmm12, %xmm11
2587; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
2588; AVX2-NEXT:    vpblendvb %ymm12, %ymm1, %ymm0, %ymm0
2589; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u]
2590; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
2591; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u]
2592; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
2593; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm4[0],xmm5[1,2,3],xmm4[4],xmm5[5,6],xmm4[7]
2594; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm1[1,8,15]
2595; AVX2-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[3,10],zero,zero,zero
2596; AVX2-NEXT:    vpor %xmm2, %xmm1, %xmm1
2597; AVX2-NEXT:    vpblendvb %xmm7, %xmm0, %xmm1, %xmm0
2598; AVX2-NEXT:    vmovdqa %xmm3, (%rsi)
2599; AVX2-NEXT:    vmovdqa %xmm6, (%rdx)
2600; AVX2-NEXT:    vmovdqa %xmm8, (%rcx)
2601; AVX2-NEXT:    vmovdqa %xmm9, (%r8)
2602; AVX2-NEXT:    vmovdqa %xmm10, (%r9)
2603; AVX2-NEXT:    vmovdqa %xmm11, (%r10)
2604; AVX2-NEXT:    vmovdqa %xmm0, (%rax)
2605; AVX2-NEXT:    vzeroupper
2606; AVX2-NEXT:    retq
2607;
2608; AVX2-FP-LABEL: load_i8_stride7_vf16:
2609; AVX2-FP:       # %bb.0:
2610; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2611; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
2612; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm0
2613; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm1
2614; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
2615; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm2
2616; AVX2-FP-NEXT:    vextracti128 $1, %ymm2, %xmm3
2617; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u,u,u,u,u]
2618; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u,u,u]
2619; AVX2-FP-NEXT:    vpor %xmm3, %xmm2, %xmm3
2620; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %xmm9
2621; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %xmm10
2622; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm9[0],xmm10[1],xmm9[2],xmm10[3]
2623; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9]
2624; AVX2-FP-NEXT:    vmovdqa 80(%rdi), %xmm2
2625; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero
2626; AVX2-FP-NEXT:    vpor %xmm5, %xmm4, %xmm4
2627; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7]
2628; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm4 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
2629; AVX2-FP-NEXT:    vpblendvb %ymm4, %ymm0, %ymm1, %ymm4
2630; AVX2-FP-NEXT:    vextracti128 $1, %ymm4, %xmm5
2631; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u,u,u]
2632; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[1,8,15],zero,zero,xmm4[4,11],zero,zero,xmm4[u,u,u,u,u,u,u]
2633; AVX2-FP-NEXT:    vpor %xmm5, %xmm4, %xmm6
2634; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %xmm4
2635; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %xmm5
2636; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm5[0],xmm4[1],xmm5[2,3,4],xmm4[5],xmm5[6,7]
2637; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10]
2638; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,12],zero,zero
2639; AVX2-FP-NEXT:    vpor %xmm7, %xmm8, %xmm8
2640; AVX2-FP-NEXT:    vpmovsxwq {{.*#+}} xmm7 = [18446744073709551615,255]
2641; AVX2-FP-NEXT:    vpblendvb %xmm7, %xmm6, %xmm8, %xmm6
2642; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
2643; AVX2-FP-NEXT:    vpblendvb %ymm8, %ymm0, %ymm1, %ymm8
2644; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm8[2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u,u,u]
2645; AVX2-FP-NEXT:    vextracti128 $1, %ymm8, %xmm8
2646; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u,u,u,u,u]
2647; AVX2-FP-NEXT:    vpor %xmm11, %xmm8, %xmm8
2648; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
2649; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm11[4,11]
2650; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[6,13],zero,zero
2651; AVX2-FP-NEXT:    vpor %xmm12, %xmm11, %xmm11
2652; AVX2-FP-NEXT:    vpblendvb %xmm7, %xmm8, %xmm11, %xmm8
2653; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535]
2654; AVX2-FP-NEXT:    vpblendvb %ymm11, %ymm0, %ymm1, %ymm11
2655; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm11[3,10],zero,zero,zero,xmm11[6,13],zero,zero,xmm11[u,u,u,u,u,u,u]
2656; AVX2-FP-NEXT:    vextracti128 $1, %ymm11, %xmm11
2657; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[1,8,15],zero,zero,xmm11[4,11,u,u,u,u,u,u,u]
2658; AVX2-FP-NEXT:    vpor %xmm12, %xmm11, %xmm11
2659; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm10 = xmm10[0],xmm9[1],xmm10[2],xmm9[3]
2660; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm10[5,12]
2661; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,7,14],zero,zero
2662; AVX2-FP-NEXT:    vpor %xmm12, %xmm9, %xmm9
2663; AVX2-FP-NEXT:    vpblendvb %xmm7, %xmm11, %xmm9, %xmm9
2664; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
2665; AVX2-FP-NEXT:    vpblendvb %ymm11, %ymm1, %ymm0, %ymm11
2666; AVX2-FP-NEXT:    vextracti128 $1, %ymm11, %xmm12
2667; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u]
2668; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[4,11],zero,zero,xmm11[0,7,14],zero,zero,xmm11[u,u,u,u,u,u,u]
2669; AVX2-FP-NEXT:    vpor %xmm12, %xmm11, %xmm11
2670; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm10[6,13]
2671; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,8,15],zero,zero
2672; AVX2-FP-NEXT:    vpor %xmm12, %xmm10, %xmm10
2673; AVX2-FP-NEXT:    vpblendvb %xmm7, %xmm11, %xmm10, %xmm10
2674; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
2675; AVX2-FP-NEXT:    vpblendvb %ymm11, %ymm1, %ymm0, %ymm11
2676; AVX2-FP-NEXT:    vextracti128 $1, %ymm11, %xmm12
2677; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u]
2678; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[5,12],zero,zero,xmm11[1,8,15],zero,zero,xmm11[u,u,u,u,u,u,u]
2679; AVX2-FP-NEXT:    vpor %xmm12, %xmm11, %xmm11
2680; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm4[0],xmm5[1,2],xmm4[3],xmm5[4,5,6],xmm4[7]
2681; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm12[0,7,14]
2682; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[2,9],zero,zero,zero
2683; AVX2-FP-NEXT:    vpor %xmm13, %xmm12, %xmm12
2684; AVX2-FP-NEXT:    vpblendvb %xmm7, %xmm11, %xmm12, %xmm11
2685; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
2686; AVX2-FP-NEXT:    vpblendvb %ymm12, %ymm1, %ymm0, %ymm0
2687; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u]
2688; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm0
2689; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u]
2690; AVX2-FP-NEXT:    vpor %xmm1, %xmm0, %xmm0
2691; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm4[0],xmm5[1,2,3],xmm4[4],xmm5[5,6],xmm4[7]
2692; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm1[1,8,15]
2693; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[3,10],zero,zero,zero
2694; AVX2-FP-NEXT:    vpor %xmm2, %xmm1, %xmm1
2695; AVX2-FP-NEXT:    vpblendvb %xmm7, %xmm0, %xmm1, %xmm0
2696; AVX2-FP-NEXT:    vmovdqa %xmm3, (%rsi)
2697; AVX2-FP-NEXT:    vmovdqa %xmm6, (%rdx)
2698; AVX2-FP-NEXT:    vmovdqa %xmm8, (%rcx)
2699; AVX2-FP-NEXT:    vmovdqa %xmm9, (%r8)
2700; AVX2-FP-NEXT:    vmovdqa %xmm10, (%r9)
2701; AVX2-FP-NEXT:    vmovdqa %xmm11, (%r10)
2702; AVX2-FP-NEXT:    vmovdqa %xmm0, (%rax)
2703; AVX2-FP-NEXT:    vzeroupper
2704; AVX2-FP-NEXT:    retq
2705;
2706; AVX2-FCP-LABEL: load_i8_stride7_vf16:
2707; AVX2-FCP:       # %bb.0:
2708; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2709; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
2710; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm0
2711; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
2712; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
2713; AVX2-FCP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm2
2714; AVX2-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
2715; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u,u,u,u,u]
2716; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u,u,u]
2717; AVX2-FCP-NEXT:    vpor %xmm3, %xmm2, %xmm3
2718; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %xmm9
2719; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %xmm10
2720; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm2 = xmm9[0],xmm10[1],xmm9[2],xmm10[3]
2721; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9]
2722; AVX2-FCP-NEXT:    vmovdqa 80(%rdi), %xmm2
2723; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero
2724; AVX2-FCP-NEXT:    vpor %xmm5, %xmm4, %xmm4
2725; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7]
2726; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm4 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
2727; AVX2-FCP-NEXT:    vpblendvb %ymm4, %ymm0, %ymm1, %ymm4
2728; AVX2-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm5
2729; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u,u,u]
2730; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[1,8,15],zero,zero,xmm4[4,11],zero,zero,xmm4[u,u,u,u,u,u,u]
2731; AVX2-FCP-NEXT:    vpor %xmm5, %xmm4, %xmm6
2732; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %xmm4
2733; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %xmm5
2734; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm5[0],xmm4[1],xmm5[2,3,4],xmm4[5],xmm5[6,7]
2735; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10]
2736; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,12],zero,zero
2737; AVX2-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm8
2738; AVX2-FCP-NEXT:    vpmovsxwq {{.*#+}} xmm7 = [18446744073709551615,255]
2739; AVX2-FCP-NEXT:    vpblendvb %xmm7, %xmm6, %xmm8, %xmm6
2740; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
2741; AVX2-FCP-NEXT:    vpblendvb %ymm8, %ymm0, %ymm1, %ymm8
2742; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm8[2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u,u,u]
2743; AVX2-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm8
2744; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u,u,u,u,u]
2745; AVX2-FCP-NEXT:    vpor %xmm11, %xmm8, %xmm8
2746; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
2747; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm11[4,11]
2748; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[6,13],zero,zero
2749; AVX2-FCP-NEXT:    vpor %xmm12, %xmm11, %xmm11
2750; AVX2-FCP-NEXT:    vpblendvb %xmm7, %xmm8, %xmm11, %xmm8
2751; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535]
2752; AVX2-FCP-NEXT:    vpblendvb %ymm11, %ymm0, %ymm1, %ymm11
2753; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm11[3,10],zero,zero,zero,xmm11[6,13],zero,zero,xmm11[u,u,u,u,u,u,u]
2754; AVX2-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm11
2755; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[1,8,15],zero,zero,xmm11[4,11,u,u,u,u,u,u,u]
2756; AVX2-FCP-NEXT:    vpor %xmm12, %xmm11, %xmm11
2757; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm10 = xmm10[0],xmm9[1],xmm10[2],xmm9[3]
2758; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm10[5,12]
2759; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,7,14],zero,zero
2760; AVX2-FCP-NEXT:    vpor %xmm12, %xmm9, %xmm9
2761; AVX2-FCP-NEXT:    vpblendvb %xmm7, %xmm11, %xmm9, %xmm9
2762; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
2763; AVX2-FCP-NEXT:    vpblendvb %ymm11, %ymm1, %ymm0, %ymm11
2764; AVX2-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm12
2765; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u]
2766; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[4,11],zero,zero,xmm11[0,7,14],zero,zero,xmm11[u,u,u,u,u,u,u]
2767; AVX2-FCP-NEXT:    vpor %xmm12, %xmm11, %xmm11
2768; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm10[6,13]
2769; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,8,15],zero,zero
2770; AVX2-FCP-NEXT:    vpor %xmm12, %xmm10, %xmm10
2771; AVX2-FCP-NEXT:    vpblendvb %xmm7, %xmm11, %xmm10, %xmm10
2772; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
2773; AVX2-FCP-NEXT:    vpblendvb %ymm11, %ymm1, %ymm0, %ymm11
2774; AVX2-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm12
2775; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u]
2776; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[5,12],zero,zero,xmm11[1,8,15],zero,zero,xmm11[u,u,u,u,u,u,u]
2777; AVX2-FCP-NEXT:    vpor %xmm12, %xmm11, %xmm11
2778; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm12 = xmm4[0],xmm5[1,2],xmm4[3],xmm5[4,5,6],xmm4[7]
2779; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm12[0,7,14]
2780; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[2,9],zero,zero,zero
2781; AVX2-FCP-NEXT:    vpor %xmm13, %xmm12, %xmm12
2782; AVX2-FCP-NEXT:    vpblendvb %xmm7, %xmm11, %xmm12, %xmm11
2783; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
2784; AVX2-FCP-NEXT:    vpblendvb %ymm12, %ymm1, %ymm0, %ymm0
2785; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u]
2786; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm0
2787; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u]
2788; AVX2-FCP-NEXT:    vpor %xmm1, %xmm0, %xmm0
2789; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm4[0],xmm5[1,2,3],xmm4[4],xmm5[5,6],xmm4[7]
2790; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm1[1,8,15]
2791; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[3,10],zero,zero,zero
2792; AVX2-FCP-NEXT:    vpor %xmm2, %xmm1, %xmm1
2793; AVX2-FCP-NEXT:    vpblendvb %xmm7, %xmm0, %xmm1, %xmm0
2794; AVX2-FCP-NEXT:    vmovdqa %xmm3, (%rsi)
2795; AVX2-FCP-NEXT:    vmovdqa %xmm6, (%rdx)
2796; AVX2-FCP-NEXT:    vmovdqa %xmm8, (%rcx)
2797; AVX2-FCP-NEXT:    vmovdqa %xmm9, (%r8)
2798; AVX2-FCP-NEXT:    vmovdqa %xmm10, (%r9)
2799; AVX2-FCP-NEXT:    vmovdqa %xmm11, (%r10)
2800; AVX2-FCP-NEXT:    vmovdqa %xmm0, (%rax)
2801; AVX2-FCP-NEXT:    vzeroupper
2802; AVX2-FCP-NEXT:    retq
2803;
2804; AVX512-LABEL: load_i8_stride7_vf16:
2805; AVX512:       # %bb.0:
2806; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2807; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
2808; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128]
2809; AVX512-NEXT:    vmovdqa 80(%rdi), %xmm0
2810; AVX512-NEXT:    vpshufb %xmm3, %xmm0, %xmm4
2811; AVX512-NEXT:    vmovdqa (%rdi), %ymm1
2812; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm2
2813; AVX512-NEXT:    vmovdqa {{.*#+}} ymm5 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
2814; AVX512-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm2 ^ (ymm5 & (ymm1 ^ ymm2))
2815; AVX512-NEXT:    vextracti128 $1, %ymm5, %xmm6
2816; AVX512-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
2817; AVX512-NEXT:    vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4],xmm4[5,6,7]
2818; AVX512-NEXT:    vmovdqa 96(%rdi), %xmm3
2819; AVX512-NEXT:    vmovdqa 64(%rdi), %xmm4
2820; AVX512-NEXT:    vpblendd {{.*#+}} xmm7 = xmm3[0],xmm4[1],xmm3[2],xmm4[3]
2821; AVX512-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,7,14,128,128,3,10,128,128,128,6,13,128,128,2,9]
2822; AVX512-NEXT:    vpshufb %xmm8, %xmm7, %xmm7
2823; AVX512-NEXT:    vpshufb %xmm8, %xmm5, %xmm5
2824; AVX512-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7]
2825; AVX512-NEXT:    vpor %xmm6, %xmm5, %xmm5
2826; AVX512-NEXT:    vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
2827; AVX512-NEXT:    vmovdqa %ymm8, %ymm6
2828; AVX512-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm2 ^ (ymm6 & (ymm1 ^ ymm2))
2829; AVX512-NEXT:    vextracti128 $1, %ymm6, %xmm7
2830; AVX512-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u]
2831; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u]
2832; AVX512-NEXT:    vpor %xmm7, %xmm6, %xmm9
2833; AVX512-NEXT:    vpblendw {{.*#+}} xmm6 = xmm4[0],xmm3[1],xmm4[2,3,4],xmm3[5],xmm4[6,7]
2834; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10]
2835; AVX512-NEXT:    vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero
2836; AVX512-NEXT:    vpor %xmm7, %xmm6, %xmm6
2837; AVX512-NEXT:    vpmovsxwq {{.*#+}} xmm7 = [18446744073709551615,255]
2838; AVX512-NEXT:    vpternlogq {{.*#+}} xmm6 = xmm6 ^ (xmm7 & (xmm6 ^ xmm9))
2839; AVX512-NEXT:    vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
2840; AVX512-NEXT:    vmovdqa %ymm9, %ymm10
2841; AVX512-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm2 ^ (ymm10 & (ymm1 ^ ymm2))
2842; AVX512-NEXT:    vpshufb {{.*#+}} xmm11 = xmm10[2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u,u,u,u,u]
2843; AVX512-NEXT:    vextracti128 $1, %ymm10, %xmm10
2844; AVX512-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[0,7,14],zero,zero,xmm10[3,10,u,u,u,u,u,u,u]
2845; AVX512-NEXT:    vpor %xmm11, %xmm10, %xmm10
2846; AVX512-NEXT:    vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
2847; AVX512-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm11[4,11]
2848; AVX512-NEXT:    vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero
2849; AVX512-NEXT:    vpor %xmm12, %xmm11, %xmm11
2850; AVX512-NEXT:    vpternlogq {{.*#+}} xmm11 = xmm11 ^ (xmm7 & (xmm11 ^ xmm10))
2851; AVX512-NEXT:    vmovdqa {{.*#+}} ymm10 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
2852; AVX512-NEXT:    vmovdqa %ymm10, %ymm12
2853; AVX512-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm2 ^ (ymm12 & (ymm1 ^ ymm2))
2854; AVX512-NEXT:    vpshufb {{.*#+}} xmm13 = xmm12[3,10],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[u,u,u,u,u,u,u]
2855; AVX512-NEXT:    vextracti128 $1, %ymm12, %xmm12
2856; AVX512-NEXT:    vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[1,8,15],zero,zero,xmm12[4,11,u,u,u,u,u,u,u]
2857; AVX512-NEXT:    vpor %xmm13, %xmm12, %xmm12
2858; AVX512-NEXT:    vpblendd {{.*#+}} xmm13 = xmm4[0],xmm3[1],xmm4[2],xmm3[3]
2859; AVX512-NEXT:    vpshufb {{.*#+}} xmm14 = xmm13[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm13[5,12]
2860; AVX512-NEXT:    vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero
2861; AVX512-NEXT:    vpor %xmm15, %xmm14, %xmm14
2862; AVX512-NEXT:    vpternlogq {{.*#+}} xmm14 = xmm14 ^ (xmm7 & (xmm14 ^ xmm12))
2863; AVX512-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm1 ^ (ymm8 & (ymm2 ^ ymm1))
2864; AVX512-NEXT:    vextracti128 $1, %ymm8, %xmm12
2865; AVX512-NEXT:    vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u]
2866; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u]
2867; AVX512-NEXT:    vpor %xmm12, %xmm8, %xmm8
2868; AVX512-NEXT:    vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm13[6,13]
2869; AVX512-NEXT:    vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero
2870; AVX512-NEXT:    vpor %xmm13, %xmm12, %xmm12
2871; AVX512-NEXT:    vpternlogq {{.*#+}} xmm12 = xmm12 ^ (xmm7 & (xmm12 ^ xmm8))
2872; AVX512-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm1 ^ (ymm9 & (ymm2 ^ ymm1))
2873; AVX512-NEXT:    vextracti128 $1, %ymm9, %xmm8
2874; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u]
2875; AVX512-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u]
2876; AVX512-NEXT:    vpor %xmm8, %xmm9, %xmm8
2877; AVX512-NEXT:    vpblendw {{.*#+}} xmm9 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6],xmm3[7]
2878; AVX512-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm9[0,7,14]
2879; AVX512-NEXT:    vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero
2880; AVX512-NEXT:    vpor %xmm13, %xmm9, %xmm9
2881; AVX512-NEXT:    vpternlogq {{.*#+}} xmm9 = xmm9 ^ (xmm7 & (xmm9 ^ xmm8))
2882; AVX512-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm1 ^ (ymm10 & (ymm2 ^ ymm1))
2883; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm10[6,13],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u]
2884; AVX512-NEXT:    vextracti128 $1, %ymm10, %xmm2
2885; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u]
2886; AVX512-NEXT:    vpor %xmm1, %xmm2, %xmm1
2887; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6],xmm3[7]
2888; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15]
2889; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[3,10],zero,zero,zero
2890; AVX512-NEXT:    vpor %xmm0, %xmm2, %xmm0
2891; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = xmm0 ^ (xmm7 & (xmm0 ^ xmm1))
2892; AVX512-NEXT:    vmovdqa %xmm5, (%rsi)
2893; AVX512-NEXT:    vmovdqa %xmm6, (%rdx)
2894; AVX512-NEXT:    vmovdqa %xmm11, (%rcx)
2895; AVX512-NEXT:    vmovdqa %xmm14, (%r8)
2896; AVX512-NEXT:    vmovdqa %xmm12, (%r9)
2897; AVX512-NEXT:    vmovdqa %xmm9, (%r10)
2898; AVX512-NEXT:    vmovdqa %xmm0, (%rax)
2899; AVX512-NEXT:    vzeroupper
2900; AVX512-NEXT:    retq
2901;
2902; AVX512-FCP-LABEL: load_i8_stride7_vf16:
2903; AVX512-FCP:       # %bb.0:
2904; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2905; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
2906; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128]
2907; AVX512-FCP-NEXT:    vmovdqa 80(%rdi), %xmm0
2908; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm0, %xmm4
2909; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm1
2910; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
2911; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
2912; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm2 ^ (ymm5 & (ymm1 ^ ymm2))
2913; AVX512-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm6
2914; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
2915; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4],xmm4[5,6,7]
2916; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %xmm3
2917; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %xmm4
2918; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm3[0],xmm4[1],xmm3[2],xmm4[3]
2919; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,7,14,128,128,3,10,128,128,128,6,13,128,128,2,9]
2920; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm7, %xmm7
2921; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm5, %xmm5
2922; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7]
2923; AVX512-FCP-NEXT:    vpor %xmm6, %xmm5, %xmm5
2924; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
2925; AVX512-FCP-NEXT:    vmovdqa %ymm8, %ymm6
2926; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm2 ^ (ymm6 & (ymm1 ^ ymm2))
2927; AVX512-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm7
2928; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u]
2929; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u]
2930; AVX512-FCP-NEXT:    vpor %xmm7, %xmm6, %xmm9
2931; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm4[0],xmm3[1],xmm4[2,3,4],xmm3[5],xmm4[6,7]
2932; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10]
2933; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero
2934; AVX512-FCP-NEXT:    vpor %xmm7, %xmm6, %xmm6
2935; AVX512-FCP-NEXT:    vpmovsxwq {{.*#+}} xmm7 = [18446744073709551615,255]
2936; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} xmm6 = xmm6 ^ (xmm7 & (xmm6 ^ xmm9))
2937; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
2938; AVX512-FCP-NEXT:    vmovdqa %ymm9, %ymm10
2939; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm2 ^ (ymm10 & (ymm1 ^ ymm2))
2940; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm10[2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u,u,u,u,u]
2941; AVX512-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm10
2942; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[0,7,14],zero,zero,xmm10[3,10,u,u,u,u,u,u,u]
2943; AVX512-FCP-NEXT:    vpor %xmm11, %xmm10, %xmm10
2944; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
2945; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm11[4,11]
2946; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero
2947; AVX512-FCP-NEXT:    vpor %xmm12, %xmm11, %xmm11
2948; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} xmm11 = xmm11 ^ (xmm7 & (xmm11 ^ xmm10))
2949; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
2950; AVX512-FCP-NEXT:    vmovdqa %ymm10, %ymm12
2951; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm2 ^ (ymm12 & (ymm1 ^ ymm2))
2952; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm12[3,10],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[u,u,u,u,u,u,u]
2953; AVX512-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm12
2954; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[1,8,15],zero,zero,xmm12[4,11,u,u,u,u,u,u,u]
2955; AVX512-FCP-NEXT:    vpor %xmm13, %xmm12, %xmm12
2956; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm13 = xmm4[0],xmm3[1],xmm4[2],xmm3[3]
2957; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm13[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm13[5,12]
2958; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero
2959; AVX512-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
2960; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} xmm14 = xmm14 ^ (xmm7 & (xmm14 ^ xmm12))
2961; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm1 ^ (ymm8 & (ymm2 ^ ymm1))
2962; AVX512-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm12
2963; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u]
2964; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u]
2965; AVX512-FCP-NEXT:    vpor %xmm12, %xmm8, %xmm8
2966; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm13[6,13]
2967; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero
2968; AVX512-FCP-NEXT:    vpor %xmm13, %xmm12, %xmm12
2969; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} xmm12 = xmm12 ^ (xmm7 & (xmm12 ^ xmm8))
2970; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm1 ^ (ymm9 & (ymm2 ^ ymm1))
2971; AVX512-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm8
2972; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u]
2973; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u]
2974; AVX512-FCP-NEXT:    vpor %xmm8, %xmm9, %xmm8
2975; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6],xmm3[7]
2976; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm9[0,7,14]
2977; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero
2978; AVX512-FCP-NEXT:    vpor %xmm13, %xmm9, %xmm9
2979; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} xmm9 = xmm9 ^ (xmm7 & (xmm9 ^ xmm8))
2980; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm1 ^ (ymm10 & (ymm2 ^ ymm1))
2981; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm10[6,13],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u]
2982; AVX512-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm2
2983; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u]
2984; AVX512-FCP-NEXT:    vpor %xmm1, %xmm2, %xmm1
2985; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6],xmm3[7]
2986; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15]
2987; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[3,10],zero,zero,zero
2988; AVX512-FCP-NEXT:    vpor %xmm0, %xmm2, %xmm0
2989; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} xmm0 = xmm0 ^ (xmm7 & (xmm0 ^ xmm1))
2990; AVX512-FCP-NEXT:    vmovdqa %xmm5, (%rsi)
2991; AVX512-FCP-NEXT:    vmovdqa %xmm6, (%rdx)
2992; AVX512-FCP-NEXT:    vmovdqa %xmm11, (%rcx)
2993; AVX512-FCP-NEXT:    vmovdqa %xmm14, (%r8)
2994; AVX512-FCP-NEXT:    vmovdqa %xmm12, (%r9)
2995; AVX512-FCP-NEXT:    vmovdqa %xmm9, (%r10)
2996; AVX512-FCP-NEXT:    vmovdqa %xmm0, (%rax)
2997; AVX512-FCP-NEXT:    vzeroupper
2998; AVX512-FCP-NEXT:    retq
2999;
3000; AVX512DQ-LABEL: load_i8_stride7_vf16:
3001; AVX512DQ:       # %bb.0:
3002; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3003; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %r10
3004; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm3 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128]
3005; AVX512DQ-NEXT:    vmovdqa 80(%rdi), %xmm0
3006; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm0, %xmm4
3007; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm1
3008; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm2
3009; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm5 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
3010; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm2 ^ (ymm5 & (ymm1 ^ ymm2))
3011; AVX512DQ-NEXT:    vextracti128 $1, %ymm5, %xmm6
3012; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
3013; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4],xmm4[5,6,7]
3014; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %xmm3
3015; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %xmm4
3016; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm7 = xmm3[0],xmm4[1],xmm3[2],xmm4[3]
3017; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,7,14,128,128,3,10,128,128,128,6,13,128,128,2,9]
3018; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm7, %xmm7
3019; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm5, %xmm5
3020; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7]
3021; AVX512DQ-NEXT:    vpor %xmm6, %xmm5, %xmm5
3022; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
3023; AVX512DQ-NEXT:    vmovdqa %ymm8, %ymm6
3024; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm2 ^ (ymm6 & (ymm1 ^ ymm2))
3025; AVX512DQ-NEXT:    vextracti128 $1, %ymm6, %xmm7
3026; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u]
3027; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u]
3028; AVX512DQ-NEXT:    vpor %xmm7, %xmm6, %xmm9
3029; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm6 = xmm4[0],xmm3[1],xmm4[2,3,4],xmm3[5],xmm4[6,7]
3030; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10]
3031; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero
3032; AVX512DQ-NEXT:    vpor %xmm7, %xmm6, %xmm6
3033; AVX512DQ-NEXT:    vpmovsxwq {{.*#+}} xmm7 = [18446744073709551615,255]
3034; AVX512DQ-NEXT:    vpternlogq {{.*#+}} xmm6 = xmm6 ^ (xmm7 & (xmm6 ^ xmm9))
3035; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
3036; AVX512DQ-NEXT:    vmovdqa %ymm9, %ymm10
3037; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm2 ^ (ymm10 & (ymm1 ^ ymm2))
3038; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm11 = xmm10[2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u,u,u,u,u]
3039; AVX512DQ-NEXT:    vextracti128 $1, %ymm10, %xmm10
3040; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[0,7,14],zero,zero,xmm10[3,10,u,u,u,u,u,u,u]
3041; AVX512DQ-NEXT:    vpor %xmm11, %xmm10, %xmm10
3042; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
3043; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm11[4,11]
3044; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero
3045; AVX512DQ-NEXT:    vpor %xmm12, %xmm11, %xmm11
3046; AVX512DQ-NEXT:    vpternlogq {{.*#+}} xmm11 = xmm11 ^ (xmm7 & (xmm11 ^ xmm10))
3047; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm10 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
3048; AVX512DQ-NEXT:    vmovdqa %ymm10, %ymm12
3049; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm2 ^ (ymm12 & (ymm1 ^ ymm2))
3050; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm13 = xmm12[3,10],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[u,u,u,u,u,u,u]
3051; AVX512DQ-NEXT:    vextracti128 $1, %ymm12, %xmm12
3052; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[1,8,15],zero,zero,xmm12[4,11,u,u,u,u,u,u,u]
3053; AVX512DQ-NEXT:    vpor %xmm13, %xmm12, %xmm12
3054; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm13 = xmm4[0],xmm3[1],xmm4[2],xmm3[3]
3055; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm14 = xmm13[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm13[5,12]
3056; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero
3057; AVX512DQ-NEXT:    vpor %xmm15, %xmm14, %xmm14
3058; AVX512DQ-NEXT:    vpternlogq {{.*#+}} xmm14 = xmm14 ^ (xmm7 & (xmm14 ^ xmm12))
3059; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm1 ^ (ymm8 & (ymm2 ^ ymm1))
3060; AVX512DQ-NEXT:    vextracti128 $1, %ymm8, %xmm12
3061; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u]
3062; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u]
3063; AVX512DQ-NEXT:    vpor %xmm12, %xmm8, %xmm8
3064; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm13[6,13]
3065; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero
3066; AVX512DQ-NEXT:    vpor %xmm13, %xmm12, %xmm12
3067; AVX512DQ-NEXT:    vpternlogq {{.*#+}} xmm12 = xmm12 ^ (xmm7 & (xmm12 ^ xmm8))
3068; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm1 ^ (ymm9 & (ymm2 ^ ymm1))
3069; AVX512DQ-NEXT:    vextracti128 $1, %ymm9, %xmm8
3070; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u]
3071; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u]
3072; AVX512DQ-NEXT:    vpor %xmm8, %xmm9, %xmm8
3073; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm9 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6],xmm3[7]
3074; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm9[0,7,14]
3075; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero
3076; AVX512DQ-NEXT:    vpor %xmm13, %xmm9, %xmm9
3077; AVX512DQ-NEXT:    vpternlogq {{.*#+}} xmm9 = xmm9 ^ (xmm7 & (xmm9 ^ xmm8))
3078; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm1 ^ (ymm10 & (ymm2 ^ ymm1))
3079; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm10[6,13],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u]
3080; AVX512DQ-NEXT:    vextracti128 $1, %ymm10, %xmm2
3081; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u]
3082; AVX512DQ-NEXT:    vpor %xmm1, %xmm2, %xmm1
3083; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6],xmm3[7]
3084; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15]
3085; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[3,10],zero,zero,zero
3086; AVX512DQ-NEXT:    vpor %xmm0, %xmm2, %xmm0
3087; AVX512DQ-NEXT:    vpternlogq {{.*#+}} xmm0 = xmm0 ^ (xmm7 & (xmm0 ^ xmm1))
3088; AVX512DQ-NEXT:    vmovdqa %xmm5, (%rsi)
3089; AVX512DQ-NEXT:    vmovdqa %xmm6, (%rdx)
3090; AVX512DQ-NEXT:    vmovdqa %xmm11, (%rcx)
3091; AVX512DQ-NEXT:    vmovdqa %xmm14, (%r8)
3092; AVX512DQ-NEXT:    vmovdqa %xmm12, (%r9)
3093; AVX512DQ-NEXT:    vmovdqa %xmm9, (%r10)
3094; AVX512DQ-NEXT:    vmovdqa %xmm0, (%rax)
3095; AVX512DQ-NEXT:    vzeroupper
3096; AVX512DQ-NEXT:    retq
3097;
3098; AVX512DQ-FCP-LABEL: load_i8_stride7_vf16:
3099; AVX512DQ-FCP:       # %bb.0:
3100; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3101; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
3102; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128]
3103; AVX512DQ-FCP-NEXT:    vmovdqa 80(%rdi), %xmm0
3104; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm0, %xmm4
3105; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm1
3106; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
3107; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
3108; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm2 ^ (ymm5 & (ymm1 ^ ymm2))
3109; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm6
3110; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
3111; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4],xmm4[5,6,7]
3112; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %xmm3
3113; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %xmm4
3114; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm3[0],xmm4[1],xmm3[2],xmm4[3]
3115; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,7,14,128,128,3,10,128,128,128,6,13,128,128,2,9]
3116; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm7, %xmm7
3117; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm5, %xmm5
3118; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7]
3119; AVX512DQ-FCP-NEXT:    vpor %xmm6, %xmm5, %xmm5
3120; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
3121; AVX512DQ-FCP-NEXT:    vmovdqa %ymm8, %ymm6
3122; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm2 ^ (ymm6 & (ymm1 ^ ymm2))
3123; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm7
3124; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u]
3125; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u]
3126; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm6, %xmm9
3127; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm4[0],xmm3[1],xmm4[2,3,4],xmm3[5],xmm4[6,7]
3128; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10]
3129; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero
3130; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm6, %xmm6
3131; AVX512DQ-FCP-NEXT:    vpmovsxwq {{.*#+}} xmm7 = [18446744073709551615,255]
3132; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} xmm6 = xmm6 ^ (xmm7 & (xmm6 ^ xmm9))
3133; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
3134; AVX512DQ-FCP-NEXT:    vmovdqa %ymm9, %ymm10
3135; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm2 ^ (ymm10 & (ymm1 ^ ymm2))
3136; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm10[2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u,u,u,u,u]
3137; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm10
3138; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[0,7,14],zero,zero,xmm10[3,10,u,u,u,u,u,u,u]
3139; AVX512DQ-FCP-NEXT:    vpor %xmm11, %xmm10, %xmm10
3140; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
3141; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm11[4,11]
3142; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero
3143; AVX512DQ-FCP-NEXT:    vpor %xmm12, %xmm11, %xmm11
3144; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} xmm11 = xmm11 ^ (xmm7 & (xmm11 ^ xmm10))
3145; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
3146; AVX512DQ-FCP-NEXT:    vmovdqa %ymm10, %ymm12
3147; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm2 ^ (ymm12 & (ymm1 ^ ymm2))
3148; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm12[3,10],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[u,u,u,u,u,u,u]
3149; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm12
3150; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[1,8,15],zero,zero,xmm12[4,11,u,u,u,u,u,u,u]
3151; AVX512DQ-FCP-NEXT:    vpor %xmm13, %xmm12, %xmm12
3152; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm13 = xmm4[0],xmm3[1],xmm4[2],xmm3[3]
3153; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm13[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm13[5,12]
3154; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero
3155; AVX512DQ-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
3156; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} xmm14 = xmm14 ^ (xmm7 & (xmm14 ^ xmm12))
3157; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm1 ^ (ymm8 & (ymm2 ^ ymm1))
3158; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm12
3159; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u]
3160; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u]
3161; AVX512DQ-FCP-NEXT:    vpor %xmm12, %xmm8, %xmm8
3162; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm13[6,13]
3163; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero
3164; AVX512DQ-FCP-NEXT:    vpor %xmm13, %xmm12, %xmm12
3165; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} xmm12 = xmm12 ^ (xmm7 & (xmm12 ^ xmm8))
3166; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm1 ^ (ymm9 & (ymm2 ^ ymm1))
3167; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm8
3168; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u]
3169; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u]
3170; AVX512DQ-FCP-NEXT:    vpor %xmm8, %xmm9, %xmm8
3171; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6],xmm3[7]
3172; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm9[0,7,14]
3173; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero
3174; AVX512DQ-FCP-NEXT:    vpor %xmm13, %xmm9, %xmm9
3175; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} xmm9 = xmm9 ^ (xmm7 & (xmm9 ^ xmm8))
3176; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm1 ^ (ymm10 & (ymm2 ^ ymm1))
3177; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm10[6,13],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u]
3178; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm2
3179; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u]
3180; AVX512DQ-FCP-NEXT:    vpor %xmm1, %xmm2, %xmm1
3181; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6],xmm3[7]
3182; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15]
3183; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[3,10],zero,zero,zero
3184; AVX512DQ-FCP-NEXT:    vpor %xmm0, %xmm2, %xmm0
3185; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} xmm0 = xmm0 ^ (xmm7 & (xmm0 ^ xmm1))
3186; AVX512DQ-FCP-NEXT:    vmovdqa %xmm5, (%rsi)
3187; AVX512DQ-FCP-NEXT:    vmovdqa %xmm6, (%rdx)
3188; AVX512DQ-FCP-NEXT:    vmovdqa %xmm11, (%rcx)
3189; AVX512DQ-FCP-NEXT:    vmovdqa %xmm14, (%r8)
3190; AVX512DQ-FCP-NEXT:    vmovdqa %xmm12, (%r9)
3191; AVX512DQ-FCP-NEXT:    vmovdqa %xmm9, (%r10)
3192; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, (%rax)
3193; AVX512DQ-FCP-NEXT:    vzeroupper
3194; AVX512DQ-FCP-NEXT:    retq
3195;
3196; AVX512BW-LABEL: load_i8_stride7_vf16:
3197; AVX512BW:       # %bb.0:
3198; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3199; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
3200; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128]
3201; AVX512BW-NEXT:    vmovdqa 80(%rdi), %xmm0
3202; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm4
3203; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm2
3204; AVX512BW-NEXT:    vmovdqa 32(%rdi), %ymm1
3205; AVX512BW-NEXT:    movw $-28382, %r11w # imm = 0x9122
3206; AVX512BW-NEXT:    kmovd %r11d, %k1
3207; AVX512BW-NEXT:    vpblendmw %ymm1, %ymm2, %ymm5 {%k1}
3208; AVX512BW-NEXT:    vextracti128 $1, %ymm5, %xmm6
3209; AVX512BW-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
3210; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4],xmm4[5,6,7]
3211; AVX512BW-NEXT:    vmovdqa 96(%rdi), %xmm3
3212; AVX512BW-NEXT:    vmovdqa 64(%rdi), %xmm4
3213; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm7 = xmm3[0],xmm4[1],xmm3[2],xmm4[3]
3214; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,7,14,128,128,3,10,128,128,128,6,13,128,128,2,9]
3215; AVX512BW-NEXT:    vpshufb %xmm8, %xmm7, %xmm7
3216; AVX512BW-NEXT:    vpshufb %xmm8, %xmm5, %xmm5
3217; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7]
3218; AVX512BW-NEXT:    vpor %xmm6, %xmm5, %xmm5
3219; AVX512BW-NEXT:    movw $4644, %di # imm = 0x1224
3220; AVX512BW-NEXT:    kmovd %edi, %k2
3221; AVX512BW-NEXT:    vpblendmw %ymm1, %ymm2, %ymm6 {%k2}
3222; AVX512BW-NEXT:    vextracti128 $1, %ymm6, %xmm7
3223; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u]
3224; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u]
3225; AVX512BW-NEXT:    vpor %xmm7, %xmm6, %xmm6
3226; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm7 = xmm4[0],xmm3[1],xmm4[2,3,4],xmm3[5],xmm4[6,7]
3227; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10]
3228; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero
3229; AVX512BW-NEXT:    vpor %xmm7, %xmm8, %xmm7
3230; AVX512BW-NEXT:    movw $-512, %di # imm = 0xFE00
3231; AVX512BW-NEXT:    kmovd %edi, %k1
3232; AVX512BW-NEXT:    vmovdqu8 %xmm7, %xmm6 {%k1}
3233; AVX512BW-NEXT:    movw $8772, %di # imm = 0x2244
3234; AVX512BW-NEXT:    kmovd %edi, %k3
3235; AVX512BW-NEXT:    vpblendmw %ymm1, %ymm2, %ymm7 {%k3}
3236; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm8 = xmm7[2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u,u,u]
3237; AVX512BW-NEXT:    vextracti128 $1, %ymm7, %xmm7
3238; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u,u,u]
3239; AVX512BW-NEXT:    vpor %xmm7, %xmm8, %xmm7
3240; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm8 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
3241; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm8[4,11]
3242; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm9 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero
3243; AVX512BW-NEXT:    vpor %xmm9, %xmm8, %xmm8
3244; AVX512BW-NEXT:    vmovdqu8 %xmm8, %xmm7 {%k1}
3245; AVX512BW-NEXT:    movw $9288, %di # imm = 0x2448
3246; AVX512BW-NEXT:    kmovd %edi, %k4
3247; AVX512BW-NEXT:    vpblendmw %ymm1, %ymm2, %ymm8 {%k4}
3248; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm9 = xmm8[3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u,u,u]
3249; AVX512BW-NEXT:    vextracti128 $1, %ymm8, %xmm8
3250; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u,u,u]
3251; AVX512BW-NEXT:    vpor %xmm9, %xmm8, %xmm8
3252; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm9 = xmm4[0],xmm3[1],xmm4[2],xmm3[3]
3253; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm9[5,12]
3254; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero
3255; AVX512BW-NEXT:    vpor %xmm11, %xmm10, %xmm10
3256; AVX512BW-NEXT:    vmovdqu8 %xmm10, %xmm8 {%k1}
3257; AVX512BW-NEXT:    vpblendmw %ymm2, %ymm1, %ymm10 {%k2}
3258; AVX512BW-NEXT:    vextracti128 $1, %ymm10, %xmm11
3259; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u]
3260; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u,u,u]
3261; AVX512BW-NEXT:    vpor %xmm11, %xmm10, %xmm10
3262; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm9[6,13]
3263; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero
3264; AVX512BW-NEXT:    vpor %xmm11, %xmm9, %xmm9
3265; AVX512BW-NEXT:    vmovdqu8 %xmm9, %xmm10 {%k1}
3266; AVX512BW-NEXT:    vpblendmw %ymm2, %ymm1, %ymm9 {%k3}
3267; AVX512BW-NEXT:    vextracti128 $1, %ymm9, %xmm11
3268; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u]
3269; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u]
3270; AVX512BW-NEXT:    vpor %xmm11, %xmm9, %xmm9
3271; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm11 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6],xmm3[7]
3272; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm11[0,7,14]
3273; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero
3274; AVX512BW-NEXT:    vpor %xmm12, %xmm11, %xmm11
3275; AVX512BW-NEXT:    vmovdqu8 %xmm11, %xmm9 {%k1}
3276; AVX512BW-NEXT:    vmovdqu16 %ymm2, %ymm1 {%k4}
3277; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u]
3278; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm1
3279; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u]
3280; AVX512BW-NEXT:    vpor %xmm2, %xmm1, %xmm1
3281; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6],xmm3[7]
3282; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15]
3283; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[3,10],zero,zero,zero
3284; AVX512BW-NEXT:    vpor %xmm0, %xmm2, %xmm0
3285; AVX512BW-NEXT:    vmovdqu8 %xmm0, %xmm1 {%k1}
3286; AVX512BW-NEXT:    vmovdqa %xmm5, (%rsi)
3287; AVX512BW-NEXT:    vmovdqa %xmm6, (%rdx)
3288; AVX512BW-NEXT:    vmovdqa %xmm7, (%rcx)
3289; AVX512BW-NEXT:    vmovdqa %xmm8, (%r8)
3290; AVX512BW-NEXT:    vmovdqa %xmm10, (%r9)
3291; AVX512BW-NEXT:    vmovdqa %xmm9, (%r10)
3292; AVX512BW-NEXT:    vmovdqa %xmm1, (%rax)
3293; AVX512BW-NEXT:    vzeroupper
3294; AVX512BW-NEXT:    retq
3295;
3296; AVX512BW-FCP-LABEL: load_i8_stride7_vf16:
3297; AVX512BW-FCP:       # %bb.0:
3298; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3299; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
3300; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128]
3301; AVX512BW-FCP-NEXT:    vmovdqa 80(%rdi), %xmm0
3302; AVX512BW-FCP-NEXT:    vpshufb %xmm3, %xmm0, %xmm4
3303; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %ymm2
3304; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
3305; AVX512BW-FCP-NEXT:    movw $-28382, %r11w # imm = 0x9122
3306; AVX512BW-FCP-NEXT:    kmovd %r11d, %k1
3307; AVX512BW-FCP-NEXT:    vpblendmw %ymm1, %ymm2, %ymm5 {%k1}
3308; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm6
3309; AVX512BW-FCP-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
3310; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4],xmm4[5,6,7]
3311; AVX512BW-FCP-NEXT:    vmovdqa 96(%rdi), %xmm3
3312; AVX512BW-FCP-NEXT:    vmovdqa 64(%rdi), %xmm4
3313; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm3[0],xmm4[1],xmm3[2],xmm4[3]
3314; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,7,14,128,128,3,10,128,128,128,6,13,128,128,2,9]
3315; AVX512BW-FCP-NEXT:    vpshufb %xmm8, %xmm7, %xmm7
3316; AVX512BW-FCP-NEXT:    vpshufb %xmm8, %xmm5, %xmm5
3317; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7]
3318; AVX512BW-FCP-NEXT:    vpor %xmm6, %xmm5, %xmm5
3319; AVX512BW-FCP-NEXT:    movw $4644, %di # imm = 0x1224
3320; AVX512BW-FCP-NEXT:    kmovd %edi, %k2
3321; AVX512BW-FCP-NEXT:    vpblendmw %ymm1, %ymm2, %ymm6 {%k2}
3322; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm7
3323; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u]
3324; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u]
3325; AVX512BW-FCP-NEXT:    vpor %xmm7, %xmm6, %xmm6
3326; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm4[0],xmm3[1],xmm4[2,3,4],xmm3[5],xmm4[6,7]
3327; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10]
3328; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero
3329; AVX512BW-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm7
3330; AVX512BW-FCP-NEXT:    movw $-512, %di # imm = 0xFE00
3331; AVX512BW-FCP-NEXT:    kmovd %edi, %k1
3332; AVX512BW-FCP-NEXT:    vmovdqu8 %xmm7, %xmm6 {%k1}
3333; AVX512BW-FCP-NEXT:    movw $8772, %di # imm = 0x2244
3334; AVX512BW-FCP-NEXT:    kmovd %edi, %k3
3335; AVX512BW-FCP-NEXT:    vpblendmw %ymm1, %ymm2, %ymm7 {%k3}
3336; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm7[2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u,u,u]
3337; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm7
3338; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u,u,u]
3339; AVX512BW-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm7
3340; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
3341; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm8[4,11]
3342; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero
3343; AVX512BW-FCP-NEXT:    vpor %xmm9, %xmm8, %xmm8
3344; AVX512BW-FCP-NEXT:    vmovdqu8 %xmm8, %xmm7 {%k1}
3345; AVX512BW-FCP-NEXT:    movw $9288, %di # imm = 0x2448
3346; AVX512BW-FCP-NEXT:    kmovd %edi, %k4
3347; AVX512BW-FCP-NEXT:    vpblendmw %ymm1, %ymm2, %ymm8 {%k4}
3348; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm8[3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u,u,u]
3349; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm8
3350; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u,u,u]
3351; AVX512BW-FCP-NEXT:    vpor %xmm9, %xmm8, %xmm8
3352; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm4[0],xmm3[1],xmm4[2],xmm3[3]
3353; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm9[5,12]
3354; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero
3355; AVX512BW-FCP-NEXT:    vpor %xmm11, %xmm10, %xmm10
3356; AVX512BW-FCP-NEXT:    vmovdqu8 %xmm10, %xmm8 {%k1}
3357; AVX512BW-FCP-NEXT:    vpblendmw %ymm2, %ymm1, %ymm10 {%k2}
3358; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm11
3359; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u]
3360; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u,u,u]
3361; AVX512BW-FCP-NEXT:    vpor %xmm11, %xmm10, %xmm10
3362; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm9[6,13]
3363; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero
3364; AVX512BW-FCP-NEXT:    vpor %xmm11, %xmm9, %xmm9
3365; AVX512BW-FCP-NEXT:    vmovdqu8 %xmm9, %xmm10 {%k1}
3366; AVX512BW-FCP-NEXT:    vpblendmw %ymm2, %ymm1, %ymm9 {%k3}
3367; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm11
3368; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u]
3369; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u]
3370; AVX512BW-FCP-NEXT:    vpor %xmm11, %xmm9, %xmm9
3371; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6],xmm3[7]
3372; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm11[0,7,14]
3373; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero
3374; AVX512BW-FCP-NEXT:    vpor %xmm12, %xmm11, %xmm11
3375; AVX512BW-FCP-NEXT:    vmovdqu8 %xmm11, %xmm9 {%k1}
3376; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm2, %ymm1 {%k4}
3377; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u]
3378; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm1
3379; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u]
3380; AVX512BW-FCP-NEXT:    vpor %xmm2, %xmm1, %xmm1
3381; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6],xmm3[7]
3382; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15]
3383; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[3,10],zero,zero,zero
3384; AVX512BW-FCP-NEXT:    vpor %xmm0, %xmm2, %xmm0
3385; AVX512BW-FCP-NEXT:    vmovdqu8 %xmm0, %xmm1 {%k1}
3386; AVX512BW-FCP-NEXT:    vmovdqa %xmm5, (%rsi)
3387; AVX512BW-FCP-NEXT:    vmovdqa %xmm6, (%rdx)
3388; AVX512BW-FCP-NEXT:    vmovdqa %xmm7, (%rcx)
3389; AVX512BW-FCP-NEXT:    vmovdqa %xmm8, (%r8)
3390; AVX512BW-FCP-NEXT:    vmovdqa %xmm10, (%r9)
3391; AVX512BW-FCP-NEXT:    vmovdqa %xmm9, (%r10)
3392; AVX512BW-FCP-NEXT:    vmovdqa %xmm1, (%rax)
3393; AVX512BW-FCP-NEXT:    vzeroupper
3394; AVX512BW-FCP-NEXT:    retq
3395;
3396; AVX512DQ-BW-LABEL: load_i8_stride7_vf16:
3397; AVX512DQ-BW:       # %bb.0:
3398; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3399; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
3400; AVX512DQ-BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128]
3401; AVX512DQ-BW-NEXT:    vmovdqa 80(%rdi), %xmm0
3402; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm4
3403; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %ymm2
3404; AVX512DQ-BW-NEXT:    vmovdqa 32(%rdi), %ymm1
3405; AVX512DQ-BW-NEXT:    movw $-28382, %r11w # imm = 0x9122
3406; AVX512DQ-BW-NEXT:    kmovd %r11d, %k1
3407; AVX512DQ-BW-NEXT:    vpblendmw %ymm1, %ymm2, %ymm5 {%k1}
3408; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm5, %xmm6
3409; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
3410; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4],xmm4[5,6,7]
3411; AVX512DQ-BW-NEXT:    vmovdqa 96(%rdi), %xmm3
3412; AVX512DQ-BW-NEXT:    vmovdqa 64(%rdi), %xmm4
3413; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm7 = xmm3[0],xmm4[1],xmm3[2],xmm4[3]
3414; AVX512DQ-BW-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,7,14,128,128,3,10,128,128,128,6,13,128,128,2,9]
3415; AVX512DQ-BW-NEXT:    vpshufb %xmm8, %xmm7, %xmm7
3416; AVX512DQ-BW-NEXT:    vpshufb %xmm8, %xmm5, %xmm5
3417; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7]
3418; AVX512DQ-BW-NEXT:    vpor %xmm6, %xmm5, %xmm5
3419; AVX512DQ-BW-NEXT:    movw $4644, %di # imm = 0x1224
3420; AVX512DQ-BW-NEXT:    kmovd %edi, %k2
3421; AVX512DQ-BW-NEXT:    vpblendmw %ymm1, %ymm2, %ymm6 {%k2}
3422; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm6, %xmm7
3423; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u]
3424; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u]
3425; AVX512DQ-BW-NEXT:    vpor %xmm7, %xmm6, %xmm6
3426; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} xmm7 = xmm4[0],xmm3[1],xmm4[2,3,4],xmm3[5],xmm4[6,7]
3427; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10]
3428; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero
3429; AVX512DQ-BW-NEXT:    vpor %xmm7, %xmm8, %xmm7
3430; AVX512DQ-BW-NEXT:    movw $-512, %di # imm = 0xFE00
3431; AVX512DQ-BW-NEXT:    kmovd %edi, %k1
3432; AVX512DQ-BW-NEXT:    vmovdqu8 %xmm7, %xmm6 {%k1}
3433; AVX512DQ-BW-NEXT:    movw $8772, %di # imm = 0x2244
3434; AVX512DQ-BW-NEXT:    kmovd %edi, %k3
3435; AVX512DQ-BW-NEXT:    vpblendmw %ymm1, %ymm2, %ymm7 {%k3}
3436; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm8 = xmm7[2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u,u,u]
3437; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm7, %xmm7
3438; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u,u,u]
3439; AVX512DQ-BW-NEXT:    vpor %xmm7, %xmm8, %xmm7
3440; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} xmm8 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
3441; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm8[4,11]
3442; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm9 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero
3443; AVX512DQ-BW-NEXT:    vpor %xmm9, %xmm8, %xmm8
3444; AVX512DQ-BW-NEXT:    vmovdqu8 %xmm8, %xmm7 {%k1}
3445; AVX512DQ-BW-NEXT:    movw $9288, %di # imm = 0x2448
3446; AVX512DQ-BW-NEXT:    kmovd %edi, %k4
3447; AVX512DQ-BW-NEXT:    vpblendmw %ymm1, %ymm2, %ymm8 {%k4}
3448; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm9 = xmm8[3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u,u,u]
3449; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm8, %xmm8
3450; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u,u,u]
3451; AVX512DQ-BW-NEXT:    vpor %xmm9, %xmm8, %xmm8
3452; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm9 = xmm4[0],xmm3[1],xmm4[2],xmm3[3]
3453; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm9[5,12]
3454; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero
3455; AVX512DQ-BW-NEXT:    vpor %xmm11, %xmm10, %xmm10
3456; AVX512DQ-BW-NEXT:    vmovdqu8 %xmm10, %xmm8 {%k1}
3457; AVX512DQ-BW-NEXT:    vpblendmw %ymm2, %ymm1, %ymm10 {%k2}
3458; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm10, %xmm11
3459; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u]
3460; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u,u,u]
3461; AVX512DQ-BW-NEXT:    vpor %xmm11, %xmm10, %xmm10
3462; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm9[6,13]
3463; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero
3464; AVX512DQ-BW-NEXT:    vpor %xmm11, %xmm9, %xmm9
3465; AVX512DQ-BW-NEXT:    vmovdqu8 %xmm9, %xmm10 {%k1}
3466; AVX512DQ-BW-NEXT:    vpblendmw %ymm2, %ymm1, %ymm9 {%k3}
3467; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm9, %xmm11
3468; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u]
3469; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u]
3470; AVX512DQ-BW-NEXT:    vpor %xmm11, %xmm9, %xmm9
3471; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} xmm11 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6],xmm3[7]
3472; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm11[0,7,14]
3473; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero
3474; AVX512DQ-BW-NEXT:    vpor %xmm12, %xmm11, %xmm11
3475; AVX512DQ-BW-NEXT:    vmovdqu8 %xmm11, %xmm9 {%k1}
3476; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm2, %ymm1 {%k4}
3477; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u]
3478; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm1, %xmm1
3479; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u]
3480; AVX512DQ-BW-NEXT:    vpor %xmm2, %xmm1, %xmm1
3481; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6],xmm3[7]
3482; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15]
3483; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[3,10],zero,zero,zero
3484; AVX512DQ-BW-NEXT:    vpor %xmm0, %xmm2, %xmm0
3485; AVX512DQ-BW-NEXT:    vmovdqu8 %xmm0, %xmm1 {%k1}
3486; AVX512DQ-BW-NEXT:    vmovdqa %xmm5, (%rsi)
3487; AVX512DQ-BW-NEXT:    vmovdqa %xmm6, (%rdx)
3488; AVX512DQ-BW-NEXT:    vmovdqa %xmm7, (%rcx)
3489; AVX512DQ-BW-NEXT:    vmovdqa %xmm8, (%r8)
3490; AVX512DQ-BW-NEXT:    vmovdqa %xmm10, (%r9)
3491; AVX512DQ-BW-NEXT:    vmovdqa %xmm9, (%r10)
3492; AVX512DQ-BW-NEXT:    vmovdqa %xmm1, (%rax)
3493; AVX512DQ-BW-NEXT:    vzeroupper
3494; AVX512DQ-BW-NEXT:    retq
3495;
3496; AVX512DQ-BW-FCP-LABEL: load_i8_stride7_vf16:
3497; AVX512DQ-BW-FCP:       # %bb.0:
3498; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3499; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
3500; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128]
3501; AVX512DQ-BW-FCP-NEXT:    vmovdqa 80(%rdi), %xmm0
3502; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm3, %xmm0, %xmm4
3503; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %ymm2
3504; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
3505; AVX512DQ-BW-FCP-NEXT:    movw $-28382, %r11w # imm = 0x9122
3506; AVX512DQ-BW-FCP-NEXT:    kmovd %r11d, %k1
3507; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm1, %ymm2, %ymm5 {%k1}
3508; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm6
3509; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
3510; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4],xmm4[5,6,7]
3511; AVX512DQ-BW-FCP-NEXT:    vmovdqa 96(%rdi), %xmm3
3512; AVX512DQ-BW-FCP-NEXT:    vmovdqa 64(%rdi), %xmm4
3513; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm7 = xmm3[0],xmm4[1],xmm3[2],xmm4[3]
3514; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,7,14,128,128,3,10,128,128,128,6,13,128,128,2,9]
3515; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm8, %xmm7, %xmm7
3516; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm8, %xmm5, %xmm5
3517; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7]
3518; AVX512DQ-BW-FCP-NEXT:    vpor %xmm6, %xmm5, %xmm5
3519; AVX512DQ-BW-FCP-NEXT:    movw $4644, %di # imm = 0x1224
3520; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k2
3521; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm1, %ymm2, %ymm6 {%k2}
3522; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm7
3523; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u]
3524; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u]
3525; AVX512DQ-BW-FCP-NEXT:    vpor %xmm7, %xmm6, %xmm6
3526; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm4[0],xmm3[1],xmm4[2,3,4],xmm3[5],xmm4[6,7]
3527; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10]
3528; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero
3529; AVX512DQ-BW-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm7
3530; AVX512DQ-BW-FCP-NEXT:    movw $-512, %di # imm = 0xFE00
3531; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k1
3532; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %xmm7, %xmm6 {%k1}
3533; AVX512DQ-BW-FCP-NEXT:    movw $8772, %di # imm = 0x2244
3534; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k3
3535; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm1, %ymm2, %ymm7 {%k3}
3536; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm7[2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u,u,u]
3537; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm7
3538; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u,u,u]
3539; AVX512DQ-BW-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm7
3540; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} xmm8 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
3541; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm8[4,11]
3542; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero
3543; AVX512DQ-BW-FCP-NEXT:    vpor %xmm9, %xmm8, %xmm8
3544; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %xmm8, %xmm7 {%k1}
3545; AVX512DQ-BW-FCP-NEXT:    movw $9288, %di # imm = 0x2448
3546; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k4
3547; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm1, %ymm2, %ymm8 {%k4}
3548; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm8[3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u,u,u]
3549; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm8
3550; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u,u,u]
3551; AVX512DQ-BW-FCP-NEXT:    vpor %xmm9, %xmm8, %xmm8
3552; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm9 = xmm4[0],xmm3[1],xmm4[2],xmm3[3]
3553; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm9[5,12]
3554; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero
3555; AVX512DQ-BW-FCP-NEXT:    vpor %xmm11, %xmm10, %xmm10
3556; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %xmm10, %xmm8 {%k1}
3557; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm2, %ymm1, %ymm10 {%k2}
3558; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm11
3559; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u]
3560; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u,u,u]
3561; AVX512DQ-BW-FCP-NEXT:    vpor %xmm11, %xmm10, %xmm10
3562; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm9[6,13]
3563; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero
3564; AVX512DQ-BW-FCP-NEXT:    vpor %xmm11, %xmm9, %xmm9
3565; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %xmm9, %xmm10 {%k1}
3566; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm2, %ymm1, %ymm9 {%k3}
3567; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm11
3568; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u]
3569; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u]
3570; AVX512DQ-BW-FCP-NEXT:    vpor %xmm11, %xmm9, %xmm9
3571; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} xmm11 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6],xmm3[7]
3572; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm11[0,7,14]
3573; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero
3574; AVX512DQ-BW-FCP-NEXT:    vpor %xmm12, %xmm11, %xmm11
3575; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %xmm11, %xmm9 {%k1}
3576; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm2, %ymm1 {%k4}
3577; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u]
3578; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm1
3579; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u]
3580; AVX512DQ-BW-FCP-NEXT:    vpor %xmm2, %xmm1, %xmm1
3581; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6],xmm3[7]
3582; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15]
3583; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[3,10],zero,zero,zero
3584; AVX512DQ-BW-FCP-NEXT:    vpor %xmm0, %xmm2, %xmm0
3585; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %xmm0, %xmm1 {%k1}
3586; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm5, (%rsi)
3587; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm6, (%rdx)
3588; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm7, (%rcx)
3589; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm8, (%r8)
3590; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm10, (%r9)
3591; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm9, (%r10)
3592; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm1, (%rax)
3593; AVX512DQ-BW-FCP-NEXT:    vzeroupper
3594; AVX512DQ-BW-FCP-NEXT:    retq
3595  %wide.vec = load <112 x i8>, ptr %in.vec, align 64
3596  %strided.vec0 = shufflevector <112 x i8> %wide.vec, <112 x i8> poison, <16 x i32> <i32 0, i32 7, i32 14, i32 21, i32 28, i32 35, i32 42, i32 49, i32 56, i32 63, i32 70, i32 77, i32 84, i32 91, i32 98, i32 105>
3597  %strided.vec1 = shufflevector <112 x i8> %wide.vec, <112 x i8> poison, <16 x i32> <i32 1, i32 8, i32 15, i32 22, i32 29, i32 36, i32 43, i32 50, i32 57, i32 64, i32 71, i32 78, i32 85, i32 92, i32 99, i32 106>
3598  %strided.vec2 = shufflevector <112 x i8> %wide.vec, <112 x i8> poison, <16 x i32> <i32 2, i32 9, i32 16, i32 23, i32 30, i32 37, i32 44, i32 51, i32 58, i32 65, i32 72, i32 79, i32 86, i32 93, i32 100, i32 107>
3599  %strided.vec3 = shufflevector <112 x i8> %wide.vec, <112 x i8> poison, <16 x i32> <i32 3, i32 10, i32 17, i32 24, i32 31, i32 38, i32 45, i32 52, i32 59, i32 66, i32 73, i32 80, i32 87, i32 94, i32 101, i32 108>
3600  %strided.vec4 = shufflevector <112 x i8> %wide.vec, <112 x i8> poison, <16 x i32> <i32 4, i32 11, i32 18, i32 25, i32 32, i32 39, i32 46, i32 53, i32 60, i32 67, i32 74, i32 81, i32 88, i32 95, i32 102, i32 109>
3601  %strided.vec5 = shufflevector <112 x i8> %wide.vec, <112 x i8> poison, <16 x i32> <i32 5, i32 12, i32 19, i32 26, i32 33, i32 40, i32 47, i32 54, i32 61, i32 68, i32 75, i32 82, i32 89, i32 96, i32 103, i32 110>
3602  %strided.vec6 = shufflevector <112 x i8> %wide.vec, <112 x i8> poison, <16 x i32> <i32 6, i32 13, i32 20, i32 27, i32 34, i32 41, i32 48, i32 55, i32 62, i32 69, i32 76, i32 83, i32 90, i32 97, i32 104, i32 111>
3603  store <16 x i8> %strided.vec0, ptr %out.vec0, align 64
3604  store <16 x i8> %strided.vec1, ptr %out.vec1, align 64
3605  store <16 x i8> %strided.vec2, ptr %out.vec2, align 64
3606  store <16 x i8> %strided.vec3, ptr %out.vec3, align 64
3607  store <16 x i8> %strided.vec4, ptr %out.vec4, align 64
3608  store <16 x i8> %strided.vec5, ptr %out.vec5, align 64
3609  store <16 x i8> %strided.vec6, ptr %out.vec6, align 64
3610  ret void
3611}
3612
3613define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
3614; SSE-LABEL: load_i8_stride7_vf32:
3615; SSE:       # %bb.0:
3616; SSE-NEXT:    subq $648, %rsp # imm = 0x288
3617; SSE-NEXT:    movdqa 208(%rdi), %xmm14
3618; SSE-NEXT:    movdqa 192(%rdi), %xmm5
3619; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3620; SSE-NEXT:    movdqa 176(%rdi), %xmm13
3621; SSE-NEXT:    movdqa 112(%rdi), %xmm4
3622; SSE-NEXT:    movdqa 128(%rdi), %xmm11
3623; SSE-NEXT:    movdqa 160(%rdi), %xmm7
3624; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3625; SSE-NEXT:    movdqa 144(%rdi), %xmm1
3626; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3627; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,0,65535,65535]
3628; SSE-NEXT:    movdqa %xmm2, %xmm0
3629; SSE-NEXT:    pandn %xmm1, %xmm0
3630; SSE-NEXT:    movdqa %xmm7, %xmm1
3631; SSE-NEXT:    pand %xmm2, %xmm1
3632; SSE-NEXT:    movdqa %xmm2, %xmm9
3633; SSE-NEXT:    por %xmm0, %xmm1
3634; SSE-NEXT:    pxor %xmm10, %xmm10
3635; SSE-NEXT:    movdqa %xmm1, %xmm0
3636; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15]
3637; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
3638; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3639; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
3640; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
3641; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,5,6]
3642; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
3643; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
3644; SSE-NEXT:    packuswb %xmm0, %xmm2
3645; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255]
3646; SSE-NEXT:    movdqa {{.*#+}} xmm7 = [65535,65535,0,65535,65535,65535,0,65535]
3647; SSE-NEXT:    movdqa %xmm7, %xmm1
3648; SSE-NEXT:    pandn %xmm11, %xmm1
3649; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3650; SSE-NEXT:    movdqa %xmm4, %xmm3
3651; SSE-NEXT:    movdqa %xmm4, %xmm12
3652; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3653; SSE-NEXT:    pand %xmm7, %xmm3
3654; SSE-NEXT:    movdqa %xmm7, %xmm8
3655; SSE-NEXT:    por %xmm1, %xmm3
3656; SSE-NEXT:    movdqa %xmm3, %xmm1
3657; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15]
3658; SSE-NEXT:    movdqa {{.*#+}} xmm7 = [65535,65535,65535,65535,0,65535,0,65535]
3659; SSE-NEXT:    movdqa %xmm7, %xmm4
3660; SSE-NEXT:    pandn %xmm1, %xmm4
3661; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3],xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7]
3662; SSE-NEXT:    pand %xmm7, %xmm3
3663; SSE-NEXT:    movdqa %xmm7, %xmm15
3664; SSE-NEXT:    por %xmm4, %xmm3
3665; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[0,2,1,3]
3666; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
3667; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,3,1,1]
3668; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
3669; SSE-NEXT:    packuswb %xmm1, %xmm1
3670; SSE-NEXT:    pand %xmm0, %xmm1
3671; SSE-NEXT:    movdqa %xmm0, %xmm3
3672; SSE-NEXT:    pandn %xmm2, %xmm3
3673; SSE-NEXT:    por %xmm3, %xmm1
3674; SSE-NEXT:    movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,0,65535]
3675; SSE-NEXT:    movdqa %xmm7, %xmm2
3676; SSE-NEXT:    pandn %xmm13, %xmm2
3677; SSE-NEXT:    movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3678; SSE-NEXT:    movdqa %xmm5, %xmm3
3679; SSE-NEXT:    pand %xmm7, %xmm3
3680; SSE-NEXT:    por %xmm2, %xmm3
3681; SSE-NEXT:    movdqa %xmm3, %xmm2
3682; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
3683; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7]
3684; SSE-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15]
3685; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
3686; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7]
3687; SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
3688; SSE-NEXT:    movdqa %xmm14, %xmm3
3689; SSE-NEXT:    movdqa %xmm14, %xmm4
3690; SSE-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15]
3691; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3692; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3],xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7]
3693; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3694; SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
3695; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
3696; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
3697; SSE-NEXT:    packuswb %xmm3, %xmm3
3698; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,65535,65535,0]
3699; SSE-NEXT:    movdqa %xmm5, %xmm4
3700; SSE-NEXT:    pandn %xmm3, %xmm4
3701; SSE-NEXT:    packuswb %xmm2, %xmm2
3702; SSE-NEXT:    pand %xmm5, %xmm2
3703; SSE-NEXT:    movdqa %xmm5, %xmm6
3704; SSE-NEXT:    por %xmm2, %xmm4
3705; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,0,0]
3706; SSE-NEXT:    movdqa %xmm5, %xmm2
3707; SSE-NEXT:    pandn %xmm4, %xmm2
3708; SSE-NEXT:    pand %xmm5, %xmm1
3709; SSE-NEXT:    por %xmm1, %xmm2
3710; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3711; SSE-NEXT:    movdqa 32(%rdi), %xmm2
3712; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3713; SSE-NEXT:    movdqa %xmm9, %xmm1
3714; SSE-NEXT:    pandn %xmm2, %xmm1
3715; SSE-NEXT:    movdqa 48(%rdi), %xmm2
3716; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3717; SSE-NEXT:    pand %xmm9, %xmm2
3718; SSE-NEXT:    por %xmm1, %xmm2
3719; SSE-NEXT:    movdqa %xmm2, %xmm1
3720; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15]
3721; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
3722; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
3723; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
3724; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
3725; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6]
3726; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
3727; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
3728; SSE-NEXT:    packuswb %xmm1, %xmm2
3729; SSE-NEXT:    movdqa 16(%rdi), %xmm14
3730; SSE-NEXT:    movdqa %xmm8, %xmm1
3731; SSE-NEXT:    pandn %xmm14, %xmm1
3732; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3733; SSE-NEXT:    movdqa (%rdi), %xmm4
3734; SSE-NEXT:    movdqa %xmm4, %xmm3
3735; SSE-NEXT:    movdqa %xmm4, %xmm9
3736; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3737; SSE-NEXT:    pand %xmm8, %xmm3
3738; SSE-NEXT:    por %xmm1, %xmm3
3739; SSE-NEXT:    movdqa %xmm3, %xmm1
3740; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15]
3741; SSE-NEXT:    movdqa %xmm15, %xmm4
3742; SSE-NEXT:    pandn %xmm1, %xmm4
3743; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3],xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7]
3744; SSE-NEXT:    pand %xmm15, %xmm3
3745; SSE-NEXT:    por %xmm4, %xmm3
3746; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[0,2,1,3]
3747; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
3748; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,3,1,1]
3749; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
3750; SSE-NEXT:    packuswb %xmm1, %xmm1
3751; SSE-NEXT:    pand %xmm0, %xmm1
3752; SSE-NEXT:    pandn %xmm2, %xmm0
3753; SSE-NEXT:    por %xmm0, %xmm1
3754; SSE-NEXT:    movdqa 64(%rdi), %xmm15
3755; SSE-NEXT:    movdqa %xmm7, %xmm0
3756; SSE-NEXT:    pandn %xmm15, %xmm0
3757; SSE-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3758; SSE-NEXT:    movdqa 80(%rdi), %xmm8
3759; SSE-NEXT:    movdqa %xmm8, %xmm2
3760; SSE-NEXT:    pand %xmm7, %xmm2
3761; SSE-NEXT:    por %xmm0, %xmm2
3762; SSE-NEXT:    movdqa %xmm2, %xmm0
3763; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
3764; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
3765; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm10[8],xmm2[9],xmm10[9],xmm2[10],xmm10[10],xmm2[11],xmm10[11],xmm2[12],xmm10[12],xmm2[13],xmm10[13],xmm2[14],xmm10[14],xmm2[15],xmm10[15]
3766; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
3767; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7]
3768; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
3769; SSE-NEXT:    movdqa 96(%rdi), %xmm2
3770; SSE-NEXT:    movdqa %xmm2, %xmm3
3771; SSE-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15]
3772; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3773; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
3774; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3775; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
3776; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
3777; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
3778; SSE-NEXT:    packuswb %xmm2, %xmm2
3779; SSE-NEXT:    movdqa %xmm6, %xmm3
3780; SSE-NEXT:    pandn %xmm2, %xmm3
3781; SSE-NEXT:    packuswb %xmm0, %xmm0
3782; SSE-NEXT:    pand %xmm6, %xmm0
3783; SSE-NEXT:    por %xmm0, %xmm3
3784; SSE-NEXT:    pand %xmm5, %xmm1
3785; SSE-NEXT:    pandn %xmm3, %xmm5
3786; SSE-NEXT:    por %xmm1, %xmm5
3787; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3788; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,0,65535,65535]
3789; SSE-NEXT:    movdqa %xmm2, %xmm0
3790; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3791; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3792; SSE-NEXT:    pand %xmm2, %xmm1
3793; SSE-NEXT:    por %xmm0, %xmm1
3794; SSE-NEXT:    movdqa %xmm1, %xmm2
3795; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm10[8],xmm2[9],xmm10[9],xmm2[10],xmm10[10],xmm2[11],xmm10[11],xmm2[12],xmm10[12],xmm2[13],xmm10[13],xmm2[14],xmm10[14],xmm2[15],xmm10[15]
3796; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,65535,65535]
3797; SSE-NEXT:    movdqa %xmm0, %xmm3
3798; SSE-NEXT:    pandn %xmm2, %xmm3
3799; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
3800; SSE-NEXT:    pand %xmm0, %xmm1
3801; SSE-NEXT:    por %xmm3, %xmm1
3802; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
3803; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6]
3804; SSE-NEXT:    psrld $16, %xmm2
3805; SSE-NEXT:    packuswb %xmm2, %xmm1
3806; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255]
3807; SSE-NEXT:    movdqa %xmm5, %xmm2
3808; SSE-NEXT:    pandn %xmm1, %xmm2
3809; SSE-NEXT:    movdqa %xmm7, %xmm1
3810; SSE-NEXT:    pandn %xmm11, %xmm1
3811; SSE-NEXT:    movdqa %xmm12, %xmm3
3812; SSE-NEXT:    pand %xmm7, %xmm3
3813; SSE-NEXT:    movdqa %xmm7, %xmm12
3814; SSE-NEXT:    por %xmm1, %xmm3
3815; SSE-NEXT:    movdqa %xmm3, %xmm1
3816; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
3817; SSE-NEXT:    movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,0,65535]
3818; SSE-NEXT:    movdqa %xmm7, %xmm4
3819; SSE-NEXT:    pandn %xmm1, %xmm4
3820; SSE-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15]
3821; SSE-NEXT:    pand %xmm7, %xmm3
3822; SSE-NEXT:    por %xmm4, %xmm3
3823; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[0,3,2,3]
3824; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
3825; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
3826; SSE-NEXT:    packuswb %xmm1, %xmm1
3827; SSE-NEXT:    pand %xmm5, %xmm1
3828; SSE-NEXT:    movdqa %xmm5, %xmm7
3829; SSE-NEXT:    por %xmm2, %xmm1
3830; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,65535,0,65535]
3831; SSE-NEXT:    movdqa %xmm4, %xmm2
3832; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
3833; SSE-NEXT:    movdqa %xmm13, %xmm3
3834; SSE-NEXT:    pand %xmm4, %xmm3
3835; SSE-NEXT:    movdqa %xmm4, %xmm13
3836; SSE-NEXT:    por %xmm2, %xmm3
3837; SSE-NEXT:    movdqa %xmm3, %xmm2
3838; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm10[8],xmm2[9],xmm10[9],xmm2[10],xmm10[10],xmm2[11],xmm10[11],xmm2[12],xmm10[12],xmm2[13],xmm10[13],xmm2[14],xmm10[14],xmm2[15],xmm10[15]
3839; SSE-NEXT:    movdqa {{.*#+}} xmm11 = [65535,65535,65535,65535,0,65535,0,65535]
3840; SSE-NEXT:    movdqa %xmm11, %xmm4
3841; SSE-NEXT:    pandn %xmm2, %xmm4
3842; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3],xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7]
3843; SSE-NEXT:    pand %xmm11, %xmm3
3844; SSE-NEXT:    por %xmm4, %xmm3
3845; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3846; SSE-NEXT:    pslld $16, %xmm2
3847; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
3848; SSE-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
3849; SSE-NEXT:    packuswb %xmm4, %xmm2
3850; SSE-NEXT:    movdqa %xmm6, %xmm4
3851; SSE-NEXT:    pandn %xmm2, %xmm4
3852; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,3,2,3]
3853; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,3,2,4,5,6,7]
3854; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7]
3855; SSE-NEXT:    packuswb %xmm2, %xmm2
3856; SSE-NEXT:    pand %xmm6, %xmm2
3857; SSE-NEXT:    por %xmm2, %xmm4
3858; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0]
3859; SSE-NEXT:    movdqa %xmm3, %xmm2
3860; SSE-NEXT:    pandn %xmm4, %xmm2
3861; SSE-NEXT:    pand %xmm3, %xmm1
3862; SSE-NEXT:    por %xmm1, %xmm2
3863; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3864; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535]
3865; SSE-NEXT:    movdqa %xmm5, %xmm1
3866; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
3867; SSE-NEXT:    pandn %xmm6, %xmm1
3868; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
3869; SSE-NEXT:    movdqa %xmm10, %xmm2
3870; SSE-NEXT:    pand %xmm5, %xmm2
3871; SSE-NEXT:    por %xmm1, %xmm2
3872; SSE-NEXT:    movdqa %xmm2, %xmm1
3873; SSE-NEXT:    pxor %xmm3, %xmm3
3874; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
3875; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
3876; SSE-NEXT:    pxor %xmm5, %xmm5
3877; SSE-NEXT:    pand %xmm0, %xmm2
3878; SSE-NEXT:    pandn %xmm1, %xmm0
3879; SSE-NEXT:    por %xmm2, %xmm0
3880; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
3881; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,7,6]
3882; SSE-NEXT:    psrld $16, %xmm1
3883; SSE-NEXT:    packuswb %xmm1, %xmm0
3884; SSE-NEXT:    movdqa %xmm7, %xmm4
3885; SSE-NEXT:    movdqa %xmm7, %xmm1
3886; SSE-NEXT:    pandn %xmm0, %xmm1
3887; SSE-NEXT:    movdqa %xmm12, %xmm0
3888; SSE-NEXT:    pandn %xmm14, %xmm0
3889; SSE-NEXT:    movdqa %xmm9, %xmm2
3890; SSE-NEXT:    pand %xmm12, %xmm2
3891; SSE-NEXT:    por %xmm0, %xmm2
3892; SSE-NEXT:    movdqa %xmm2, %xmm0
3893; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
3894; SSE-NEXT:    movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,0,65535]
3895; SSE-NEXT:    movdqa %xmm7, %xmm3
3896; SSE-NEXT:    pandn %xmm0, %xmm3
3897; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
3898; SSE-NEXT:    pand %xmm7, %xmm2
3899; SSE-NEXT:    por %xmm3, %xmm2
3900; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,3,2,3]
3901; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
3902; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
3903; SSE-NEXT:    packuswb %xmm0, %xmm0
3904; SSE-NEXT:    pand %xmm4, %xmm0
3905; SSE-NEXT:    por %xmm1, %xmm0
3906; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3907; SSE-NEXT:    movdqa %xmm13, %xmm0
3908; SSE-NEXT:    pandn %xmm8, %xmm0
3909; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3910; SSE-NEXT:    movdqa %xmm15, %xmm1
3911; SSE-NEXT:    pand %xmm13, %xmm1
3912; SSE-NEXT:    por %xmm0, %xmm1
3913; SSE-NEXT:    movdqa %xmm1, %xmm0
3914; SSE-NEXT:    pxor %xmm2, %xmm2
3915; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
3916; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
3917; SSE-NEXT:    movdqa %xmm11, %xmm2
3918; SSE-NEXT:    pand %xmm11, %xmm1
3919; SSE-NEXT:    pandn %xmm0, %xmm2
3920; SSE-NEXT:    por %xmm1, %xmm2
3921; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3922; SSE-NEXT:    movdqa %xmm13, %xmm0
3923; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3924; SSE-NEXT:    pandn %xmm1, %xmm0
3925; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
3926; SSE-NEXT:    movdqa %xmm5, %xmm9
3927; SSE-NEXT:    pand %xmm13, %xmm9
3928; SSE-NEXT:    por %xmm0, %xmm9
3929; SSE-NEXT:    movdqa %xmm6, %xmm0
3930; SSE-NEXT:    pand %xmm13, %xmm0
3931; SSE-NEXT:    pandn %xmm10, %xmm13
3932; SSE-NEXT:    por %xmm0, %xmm13
3933; SSE-NEXT:    movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3934; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,0,65535,65535,65535]
3935; SSE-NEXT:    movdqa %xmm2, %xmm0
3936; SSE-NEXT:    pandn %xmm5, %xmm0
3937; SSE-NEXT:    movdqa %xmm12, %xmm7
3938; SSE-NEXT:    movdqa %xmm12, %xmm5
3939; SSE-NEXT:    pandn %xmm1, %xmm5
3940; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3941; SSE-NEXT:    pshufd {{.*#+}} xmm11 = xmm1[0,2,2,3]
3942; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3943; SSE-NEXT:    pand %xmm2, %xmm1
3944; SSE-NEXT:    por %xmm0, %xmm1
3945; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3946; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
3947; SSE-NEXT:    pand %xmm2, %xmm13
3948; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
3949; SSE-NEXT:    pand %xmm2, %xmm12
3950; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3951; SSE-NEXT:    pand %xmm2, %xmm14
3952; SSE-NEXT:    pand %xmm2, %xmm8
3953; SSE-NEXT:    movdqa %xmm8, (%rsp) # 16-byte Spill
3954; SSE-NEXT:    movdqa %xmm7, %xmm1
3955; SSE-NEXT:    pandn %xmm10, %xmm1
3956; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3957; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm10[0,2,2,3]
3958; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3959; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3960; SSE-NEXT:    pand %xmm2, %xmm10
3961; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3962; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3963; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3964; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3965; SSE-NEXT:    pandn %xmm6, %xmm2
3966; SSE-NEXT:    por %xmm10, %xmm2
3967; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3968; SSE-NEXT:    movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,65535,65535]
3969; SSE-NEXT:    movdqa %xmm7, %xmm1
3970; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3971; SSE-NEXT:    pandn %xmm2, %xmm1
3972; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3973; SSE-NEXT:    movdqa %xmm2, %xmm5
3974; SSE-NEXT:    movdqa %xmm2, %xmm3
3975; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
3976; SSE-NEXT:    movdqa %xmm6, %xmm8
3977; SSE-NEXT:    pslld $16, %xmm8
3978; SSE-NEXT:    psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
3979; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3980; SSE-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
3981; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3982; SSE-NEXT:    movdqa %xmm1, %xmm15
3983; SSE-NEXT:    psrldq {{.*#+}} xmm15 = xmm15[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
3984; SSE-NEXT:    punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3]
3985; SSE-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
3986; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3987; SSE-NEXT:    movdqa %xmm0, %xmm3
3988; SSE-NEXT:    movdqa %xmm1, %xmm0
3989; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
3990; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3991; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3992; SSE-NEXT:    pxor %xmm10, %xmm10
3993; SSE-NEXT:    punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
3994; SSE-NEXT:    pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,7]
3995; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,1,2,0]
3996; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,6,4,6,5]
3997; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,65535,0,65535]
3998; SSE-NEXT:    pand %xmm4, %xmm0
3999; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4000; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4001; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4002; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
4003; SSE-NEXT:    pxor %xmm9, %xmm9
4004; SSE-NEXT:    pshufhw {{.*#+}} xmm10 = xmm0[0,1,2,3,7,5,6,7]
4005; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[0,1,2,0]
4006; SSE-NEXT:    pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,4,6,5]
4007; SSE-NEXT:    movdqa %xmm4, %xmm0
4008; SSE-NEXT:    pand %xmm4, %xmm10
4009; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4010; SSE-NEXT:    pandn %xmm3, %xmm4
4011; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4012; SSE-NEXT:    pand %xmm0, %xmm2
4013; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4014; SSE-NEXT:    movdqa %xmm1, %xmm4
4015; SSE-NEXT:    pand %xmm0, %xmm4
4016; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4017; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4018; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4019; SSE-NEXT:    movdqa %xmm6, %xmm4
4020; SSE-NEXT:    pandn %xmm6, %xmm0
4021; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4022; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4023; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4024; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
4025; SSE-NEXT:    pand %xmm7, %xmm0
4026; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4027; SSE-NEXT:    pand %xmm7, %xmm3
4028; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4029; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4030; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4031; SSE-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7]
4032; SSE-NEXT:    pand %xmm7, %xmm6
4033; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4034; SSE-NEXT:    pand %xmm7, %xmm4
4035; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4036; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4037; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4038; SSE-NEXT:    pandn %xmm1, %xmm7
4039; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4040; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3]
4041; SSE-NEXT:    packuswb %xmm1, %xmm1
4042; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,65535,0]
4043; SSE-NEXT:    movdqa %xmm0, %xmm10
4044; SSE-NEXT:    pandn %xmm1, %xmm10
4045; SSE-NEXT:    pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4046; SSE-NEXT:    # xmm1 = mem[0,3,2,3]
4047; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,3,2,4,5,6,7]
4048; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,6,7]
4049; SSE-NEXT:    packuswb %xmm1, %xmm1
4050; SSE-NEXT:    pand %xmm0, %xmm1
4051; SSE-NEXT:    movdqa %xmm0, %xmm2
4052; SSE-NEXT:    por %xmm1, %xmm10
4053; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0]
4054; SSE-NEXT:    movdqa %xmm0, %xmm3
4055; SSE-NEXT:    pandn %xmm10, %xmm3
4056; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4057; SSE-NEXT:    pand %xmm0, %xmm1
4058; SSE-NEXT:    movdqa %xmm0, %xmm8
4059; SSE-NEXT:    por %xmm1, %xmm3
4060; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4061; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,0,65535]
4062; SSE-NEXT:    movdqa %xmm0, %xmm1
4063; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4064; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
4065; SSE-NEXT:    pand %xmm0, %xmm10
4066; SSE-NEXT:    por %xmm1, %xmm10
4067; SSE-NEXT:    movdqa %xmm10, %xmm1
4068; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
4069; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,0,65535]
4070; SSE-NEXT:    movdqa %xmm3, %xmm0
4071; SSE-NEXT:    pandn %xmm1, %xmm0
4072; SSE-NEXT:    punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
4073; SSE-NEXT:    pand %xmm3, %xmm10
4074; SSE-NEXT:    por %xmm0, %xmm10
4075; SSE-NEXT:    packuswb %xmm5, %xmm0
4076; SSE-NEXT:    movdqa %xmm2, %xmm1
4077; SSE-NEXT:    pandn %xmm0, %xmm1
4078; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[0,3,2,3]
4079; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7]
4080; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
4081; SSE-NEXT:    packuswb %xmm0, %xmm0
4082; SSE-NEXT:    pand %xmm2, %xmm0
4083; SSE-NEXT:    por %xmm0, %xmm1
4084; SSE-NEXT:    movdqa %xmm8, %xmm0
4085; SSE-NEXT:    pandn %xmm1, %xmm0
4086; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4087; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4088; SSE-NEXT:    por %xmm1, %xmm13
4089; SSE-NEXT:    movdqa %xmm13, %xmm1
4090; SSE-NEXT:    pxor %xmm6, %xmm6
4091; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15]
4092; SSE-NEXT:    movdqa %xmm3, %xmm2
4093; SSE-NEXT:    pandn %xmm1, %xmm2
4094; SSE-NEXT:    punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3],xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7]
4095; SSE-NEXT:    pand %xmm3, %xmm13
4096; SSE-NEXT:    movdqa %xmm3, %xmm5
4097; SSE-NEXT:    por %xmm2, %xmm13
4098; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
4099; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm7[1,3,2,3]
4100; SSE-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1]
4101; SSE-NEXT:    movdqa %xmm11, %xmm2
4102; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15]
4103; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,65535]
4104; SSE-NEXT:    movdqa %xmm1, %xmm10
4105; SSE-NEXT:    pandn %xmm2, %xmm10
4106; SSE-NEXT:    punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3],xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7]
4107; SSE-NEXT:    pand %xmm1, %xmm11
4108; SSE-NEXT:    por %xmm10, %xmm11
4109; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm11[0,1,2,1]
4110; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
4111; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
4112; SSE-NEXT:    packuswb %xmm2, %xmm3
4113; SSE-NEXT:    movdqa {{.*#+}} xmm6 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255]
4114; SSE-NEXT:    movdqa %xmm6, %xmm4
4115; SSE-NEXT:    pandn %xmm3, %xmm4
4116; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm13[0,2,1,3]
4117; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
4118; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,3,3]
4119; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7]
4120; SSE-NEXT:    packuswb %xmm2, %xmm2
4121; SSE-NEXT:    pand %xmm6, %xmm2
4122; SSE-NEXT:    movdqa %xmm6, %xmm13
4123; SSE-NEXT:    por %xmm2, %xmm4
4124; SSE-NEXT:    pand %xmm8, %xmm4
4125; SSE-NEXT:    por %xmm0, %xmm4
4126; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4127; SSE-NEXT:    movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,65535,65535,0,65535]
4128; SSE-NEXT:    movdqa %xmm10, %xmm0
4129; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4130; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4131; SSE-NEXT:    pand %xmm10, %xmm2
4132; SSE-NEXT:    por %xmm0, %xmm2
4133; SSE-NEXT:    movdqa %xmm2, %xmm0
4134; SSE-NEXT:    pxor %xmm6, %xmm6
4135; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
4136; SSE-NEXT:    movdqa %xmm5, %xmm11
4137; SSE-NEXT:    movdqa %xmm5, %xmm3
4138; SSE-NEXT:    pandn %xmm0, %xmm3
4139; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15]
4140; SSE-NEXT:    pand %xmm5, %xmm2
4141; SSE-NEXT:    por %xmm3, %xmm2
4142; SSE-NEXT:    packuswb %xmm15, %xmm0
4143; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,65535,65535,0]
4144; SSE-NEXT:    movdqa %xmm4, %xmm3
4145; SSE-NEXT:    pandn %xmm0, %xmm3
4146; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,3,2,3]
4147; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7]
4148; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
4149; SSE-NEXT:    packuswb %xmm0, %xmm0
4150; SSE-NEXT:    pand %xmm4, %xmm0
4151; SSE-NEXT:    por %xmm0, %xmm3
4152; SSE-NEXT:    movdqa %xmm8, %xmm0
4153; SSE-NEXT:    movdqa %xmm8, %xmm15
4154; SSE-NEXT:    pandn %xmm3, %xmm0
4155; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4156; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4157; SSE-NEXT:    pandn %xmm5, %xmm2
4158; SSE-NEXT:    por %xmm2, %xmm12
4159; SSE-NEXT:    movdqa %xmm12, %xmm2
4160; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15]
4161; SSE-NEXT:    movdqa %xmm11, %xmm3
4162; SSE-NEXT:    pandn %xmm2, %xmm3
4163; SSE-NEXT:    punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
4164; SSE-NEXT:    pand %xmm11, %xmm12
4165; SSE-NEXT:    por %xmm3, %xmm12
4166; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
4167; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm8[1,3,2,3]
4168; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4169; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
4170; SSE-NEXT:    movdqa %xmm3, %xmm2
4171; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15]
4172; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
4173; SSE-NEXT:    pand %xmm1, %xmm3
4174; SSE-NEXT:    pandn %xmm2, %xmm1
4175; SSE-NEXT:    por %xmm3, %xmm1
4176; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
4177; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
4178; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
4179; SSE-NEXT:    packuswb %xmm2, %xmm1
4180; SSE-NEXT:    movdqa %xmm13, %xmm2
4181; SSE-NEXT:    pandn %xmm1, %xmm2
4182; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm12[0,2,1,3]
4183; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
4184; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,3,3]
4185; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7]
4186; SSE-NEXT:    packuswb %xmm1, %xmm1
4187; SSE-NEXT:    pand %xmm13, %xmm1
4188; SSE-NEXT:    por %xmm1, %xmm2
4189; SSE-NEXT:    pand %xmm15, %xmm2
4190; SSE-NEXT:    por %xmm0, %xmm2
4191; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4192; SSE-NEXT:    movdqa %xmm10, %xmm0
4193; SSE-NEXT:    pandn %xmm7, %xmm0
4194; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4195; SSE-NEXT:    pand %xmm10, %xmm2
4196; SSE-NEXT:    por %xmm0, %xmm2
4197; SSE-NEXT:    movdqa %xmm2, %xmm0
4198; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15]
4199; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
4200; SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
4201; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,3,2,3,4,5,6,7]
4202; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
4203; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6]
4204; SSE-NEXT:    psrlq $48, %xmm0
4205; SSE-NEXT:    packuswb %xmm0, %xmm1
4206; SSE-NEXT:    movdqa %xmm13, %xmm0
4207; SSE-NEXT:    pandn %xmm1, %xmm0
4208; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,0,65535,65535]
4209; SSE-NEXT:    movdqa %xmm3, %xmm1
4210; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
4211; SSE-NEXT:    pandn %xmm9, %xmm1
4212; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
4213; SSE-NEXT:    movdqa %xmm7, %xmm2
4214; SSE-NEXT:    pand %xmm3, %xmm2
4215; SSE-NEXT:    por %xmm1, %xmm2
4216; SSE-NEXT:    movdqa %xmm2, %xmm1
4217; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
4218; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [65535,0,65535,0,65535,65535,65535,65535]
4219; SSE-NEXT:    movdqa %xmm4, %xmm3
4220; SSE-NEXT:    pandn %xmm1, %xmm3
4221; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15]
4222; SSE-NEXT:    pand %xmm4, %xmm2
4223; SSE-NEXT:    por %xmm3, %xmm2
4224; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[3,2,1,0,4,5,6,7]
4225; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
4226; SSE-NEXT:    packuswb %xmm1, %xmm1
4227; SSE-NEXT:    pand %xmm13, %xmm1
4228; SSE-NEXT:    por %xmm0, %xmm1
4229; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
4230; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4231; SSE-NEXT:    pandn %xmm12, %xmm0
4232; SSE-NEXT:    por %xmm0, %xmm14
4233; SSE-NEXT:    movdqa %xmm14, %xmm0
4234; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15]
4235; SSE-NEXT:    movdqa %xmm11, %xmm2
4236; SSE-NEXT:    pandn %xmm0, %xmm2
4237; SSE-NEXT:    punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3],xmm14[4],xmm6[4],xmm14[5],xmm6[5],xmm14[6],xmm6[6],xmm14[7],xmm6[7]
4238; SSE-NEXT:    pand %xmm11, %xmm14
4239; SSE-NEXT:    por %xmm2, %xmm14
4240; SSE-NEXT:    pshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4241; SSE-NEXT:    # xmm0 = mem[2,1,2,3,4,5,6,7]
4242; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
4243; SSE-NEXT:    packuswb %xmm0, %xmm0
4244; SSE-NEXT:    movdqa {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,65535,65535,0]
4245; SSE-NEXT:    movdqa %xmm10, %xmm2
4246; SSE-NEXT:    pandn %xmm0, %xmm2
4247; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm14[0,2,1,0,4,5,6,7]
4248; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
4249; SSE-NEXT:    packuswb %xmm0, %xmm0
4250; SSE-NEXT:    pand %xmm10, %xmm0
4251; SSE-NEXT:    por %xmm0, %xmm2
4252; SSE-NEXT:    movdqa %xmm15, %xmm0
4253; SSE-NEXT:    pandn %xmm2, %xmm0
4254; SSE-NEXT:    pand %xmm15, %xmm1
4255; SSE-NEXT:    por %xmm1, %xmm0
4256; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4257; SSE-NEXT:    movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,65535,65535,0,65535]
4258; SSE-NEXT:    movdqa %xmm11, %xmm0
4259; SSE-NEXT:    pandn %xmm8, %xmm0
4260; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4261; SSE-NEXT:    pand %xmm11, %xmm1
4262; SSE-NEXT:    por %xmm0, %xmm1
4263; SSE-NEXT:    movdqa %xmm1, %xmm0
4264; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15]
4265; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
4266; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
4267; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
4268; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
4269; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6]
4270; SSE-NEXT:    psrlq $48, %xmm0
4271; SSE-NEXT:    packuswb %xmm0, %xmm1
4272; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [65535,0,65535,65535,65535,0,65535,65535]
4273; SSE-NEXT:    movdqa %xmm8, %xmm0
4274; SSE-NEXT:    movdqa %xmm5, %xmm11
4275; SSE-NEXT:    pandn %xmm5, %xmm0
4276; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4277; SSE-NEXT:    movdqa %xmm3, %xmm2
4278; SSE-NEXT:    pand %xmm8, %xmm2
4279; SSE-NEXT:    por %xmm0, %xmm2
4280; SSE-NEXT:    movdqa %xmm2, %xmm0
4281; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
4282; SSE-NEXT:    movdqa %xmm4, %xmm5
4283; SSE-NEXT:    pandn %xmm0, %xmm5
4284; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15]
4285; SSE-NEXT:    pand %xmm4, %xmm2
4286; SSE-NEXT:    por %xmm5, %xmm2
4287; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm2[3,2,1,0,4,5,6,7]
4288; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
4289; SSE-NEXT:    packuswb %xmm0, %xmm0
4290; SSE-NEXT:    pand %xmm13, %xmm0
4291; SSE-NEXT:    pandn %xmm1, %xmm13
4292; SSE-NEXT:    por %xmm13, %xmm0
4293; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
4294; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4295; SSE-NEXT:    pandn %xmm8, %xmm1
4296; SSE-NEXT:    movdqa (%rsp), %xmm5 # 16-byte Reload
4297; SSE-NEXT:    por %xmm1, %xmm5
4298; SSE-NEXT:    movdqa %xmm5, %xmm1
4299; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15]
4300; SSE-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
4301; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,0,65535]
4302; SSE-NEXT:    pand %xmm2, %xmm5
4303; SSE-NEXT:    pandn %xmm1, %xmm2
4304; SSE-NEXT:    por %xmm5, %xmm2
4305; SSE-NEXT:    movdqa %xmm2, %xmm5
4306; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
4307; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm13[2,1,2,3,4,5,6,7]
4308; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
4309; SSE-NEXT:    packuswb %xmm1, %xmm1
4310; SSE-NEXT:    movdqa %xmm10, %xmm2
4311; SSE-NEXT:    pandn %xmm1, %xmm2
4312; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm5[0,2,1,0,4,5,6,7]
4313; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,6,7]
4314; SSE-NEXT:    packuswb %xmm1, %xmm1
4315; SSE-NEXT:    pand %xmm10, %xmm1
4316; SSE-NEXT:    por %xmm1, %xmm2
4317; SSE-NEXT:    movdqa %xmm15, %xmm1
4318; SSE-NEXT:    pandn %xmm2, %xmm1
4319; SSE-NEXT:    pand %xmm15, %xmm0
4320; SSE-NEXT:    movdqa %xmm15, %xmm14
4321; SSE-NEXT:    por %xmm0, %xmm1
4322; SSE-NEXT:    movdqa %xmm1, (%rsp) # 16-byte Spill
4323; SSE-NEXT:    movdqa {{.*#+}} xmm15 = [65535,65535,0,65535,65535,0,65535,65535]
4324; SSE-NEXT:    movdqa %xmm15, %xmm0
4325; SSE-NEXT:    pandn %xmm9, %xmm0
4326; SSE-NEXT:    movdqa %xmm7, %xmm2
4327; SSE-NEXT:    pand %xmm15, %xmm2
4328; SSE-NEXT:    por %xmm0, %xmm2
4329; SSE-NEXT:    movdqa %xmm2, %xmm0
4330; SSE-NEXT:    pxor %xmm1, %xmm1
4331; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
4332; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
4333; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
4334; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
4335; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
4336; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
4337; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4338; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
4339; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4340; SSE-NEXT:    pandn %xmm0, %xmm6
4341; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4342; SSE-NEXT:    por %xmm6, %xmm5
4343; SSE-NEXT:    packuswb %xmm0, %xmm5
4344; SSE-NEXT:    packuswb %xmm2, %xmm2
4345; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,1,3,3]
4346; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
4347; SSE-NEXT:    movdqa {{.*#+}} xmm9 = [65535,0,65535,65535,65535,0,65535,65535]
4348; SSE-NEXT:    movdqa %xmm9, %xmm2
4349; SSE-NEXT:    pandn %xmm12, %xmm2
4350; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
4351; SSE-NEXT:    movdqa %xmm7, %xmm5
4352; SSE-NEXT:    pand %xmm9, %xmm5
4353; SSE-NEXT:    por %xmm2, %xmm5
4354; SSE-NEXT:    movdqa %xmm5, %xmm2
4355; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
4356; SSE-NEXT:    movdqa %xmm4, %xmm6
4357; SSE-NEXT:    pandn %xmm2, %xmm6
4358; SSE-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
4359; SSE-NEXT:    pand %xmm4, %xmm5
4360; SSE-NEXT:    por %xmm6, %xmm5
4361; SSE-NEXT:    pshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4362; SSE-NEXT:    # xmm2 = mem[0,1,2,1]
4363; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
4364; SSE-NEXT:    packuswb %xmm2, %xmm2
4365; SSE-NEXT:    movdqa %xmm10, %xmm6
4366; SSE-NEXT:    pandn %xmm2, %xmm6
4367; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm5[0,1,0,3]
4368; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7]
4369; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
4370; SSE-NEXT:    packuswb %xmm2, %xmm2
4371; SSE-NEXT:    pand %xmm10, %xmm2
4372; SSE-NEXT:    por %xmm2, %xmm6
4373; SSE-NEXT:    movdqa %xmm14, %xmm1
4374; SSE-NEXT:    pandn %xmm6, %xmm14
4375; SSE-NEXT:    andps %xmm1, %xmm0
4376; SSE-NEXT:    movdqa %xmm1, %xmm6
4377; SSE-NEXT:    por %xmm0, %xmm14
4378; SSE-NEXT:    movdqa %xmm15, %xmm1
4379; SSE-NEXT:    movdqa %xmm15, %xmm0
4380; SSE-NEXT:    movdqa %xmm11, %xmm15
4381; SSE-NEXT:    pandn %xmm11, %xmm0
4382; SSE-NEXT:    pand %xmm1, %xmm3
4383; SSE-NEXT:    por %xmm0, %xmm3
4384; SSE-NEXT:    movdqa %xmm3, %xmm0
4385; SSE-NEXT:    pxor %xmm1, %xmm1
4386; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
4387; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
4388; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
4389; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[2,1,2,3]
4390; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
4391; SSE-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
4392; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4393; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
4394; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4395; SSE-NEXT:    pandn %xmm0, %xmm3
4396; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4397; SSE-NEXT:    por %xmm3, %xmm2
4398; SSE-NEXT:    packuswb %xmm0, %xmm2
4399; SSE-NEXT:    packuswb %xmm5, %xmm5
4400; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,3,3]
4401; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm5[0],xmm0[1,2,3]
4402; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4403; SSE-NEXT:    movdqa %xmm2, %xmm5
4404; SSE-NEXT:    movdqa %xmm2, %xmm3
4405; SSE-NEXT:    movdqa %xmm9, %xmm2
4406; SSE-NEXT:    pand %xmm9, %xmm5
4407; SSE-NEXT:    pandn %xmm8, %xmm2
4408; SSE-NEXT:    movdqa %xmm8, %xmm9
4409; SSE-NEXT:    por %xmm5, %xmm2
4410; SSE-NEXT:    movdqa %xmm2, %xmm5
4411; SSE-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
4412; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
4413; SSE-NEXT:    pand %xmm4, %xmm2
4414; SSE-NEXT:    pandn %xmm5, %xmm4
4415; SSE-NEXT:    por %xmm2, %xmm4
4416; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3]
4417; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7]
4418; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7]
4419; SSE-NEXT:    packuswb %xmm4, %xmm4
4420; SSE-NEXT:    pand %xmm10, %xmm4
4421; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm13[0,1,2,1]
4422; SSE-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7]
4423; SSE-NEXT:    packuswb %xmm5, %xmm5
4424; SSE-NEXT:    pandn %xmm5, %xmm10
4425; SSE-NEXT:    por %xmm4, %xmm10
4426; SSE-NEXT:    movdqa %xmm6, %xmm4
4427; SSE-NEXT:    pandn %xmm10, %xmm4
4428; SSE-NEXT:    andps %xmm6, %xmm0
4429; SSE-NEXT:    por %xmm0, %xmm4
4430; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4431; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,0,65535]
4432; SSE-NEXT:    pand %xmm2, %xmm0
4433; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4434; SSE-NEXT:    movdqa %xmm0, %xmm5
4435; SSE-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
4436; SSE-NEXT:    movdqa {{.*#+}} xmm10 = [0,65535,65535,65535,65535,65535,65535,0]
4437; SSE-NEXT:    movdqa %xmm10, %xmm6
4438; SSE-NEXT:    pandn %xmm5, %xmm6
4439; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
4440; SSE-NEXT:    pand %xmm10, %xmm0
4441; SSE-NEXT:    por %xmm6, %xmm0
4442; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
4443; SSE-NEXT:    pshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,5,4,7,6]
4444; SSE-NEXT:    psrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4445; SSE-NEXT:    packuswb %xmm5, %xmm8
4446; SSE-NEXT:    movdqa %xmm2, %xmm11
4447; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4448; SSE-NEXT:    pandn %xmm0, %xmm11
4449; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
4450; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
4451; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm13[0,2,2,3]
4452; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
4453; SSE-NEXT:    movdqa %xmm5, %xmm0
4454; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
4455; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
4456; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
4457; SSE-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
4458; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[2,1,2,3]
4459; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7]
4460; SSE-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
4461; SSE-NEXT:    packuswb %xmm5, %xmm5
4462; SSE-NEXT:    movss {{.*#+}} xmm8 = xmm5[0],xmm8[1,2,3]
4463; SSE-NEXT:    movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,65535,0,65535,65535]
4464; SSE-NEXT:    movdqa %xmm6, %xmm0
4465; SSE-NEXT:    pandn %xmm12, %xmm0
4466; SSE-NEXT:    movdqa %xmm7, %xmm5
4467; SSE-NEXT:    pand %xmm6, %xmm5
4468; SSE-NEXT:    por %xmm0, %xmm5
4469; SSE-NEXT:    movdqa %xmm5, %xmm0
4470; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
4471; SSE-NEXT:    movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,0,65535,65535,65535]
4472; SSE-NEXT:    movdqa %xmm6, %xmm7
4473; SSE-NEXT:    pandn %xmm0, %xmm7
4474; SSE-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
4475; SSE-NEXT:    pand %xmm6, %xmm5
4476; SSE-NEXT:    por %xmm7, %xmm5
4477; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4478; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4479; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
4480; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,7,6]
4481; SSE-NEXT:    packuswb %xmm0, %xmm0
4482; SSE-NEXT:    movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0]
4483; SSE-NEXT:    movdqa %xmm7, %xmm12
4484; SSE-NEXT:    pandn %xmm0, %xmm12
4485; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[2,1,0,3]
4486; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7]
4487; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
4488; SSE-NEXT:    packuswb %xmm0, %xmm0
4489; SSE-NEXT:    pand %xmm7, %xmm0
4490; SSE-NEXT:    por %xmm0, %xmm12
4491; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0]
4492; SSE-NEXT:    movdqa %xmm0, %xmm5
4493; SSE-NEXT:    pandn %xmm12, %xmm5
4494; SSE-NEXT:    andps %xmm0, %xmm8
4495; SSE-NEXT:    por %xmm8, %xmm5
4496; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4497; SSE-NEXT:    pand %xmm2, %xmm0
4498; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4499; SSE-NEXT:    movdqa %xmm0, %xmm12
4500; SSE-NEXT:    punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm1[8],xmm12[9],xmm1[9],xmm12[10],xmm1[10],xmm12[11],xmm1[11],xmm12[12],xmm1[12],xmm12[13],xmm1[13],xmm12[14],xmm1[14],xmm12[15],xmm1[15]
4501; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
4502; SSE-NEXT:    pand %xmm10, %xmm0
4503; SSE-NEXT:    pandn %xmm12, %xmm10
4504; SSE-NEXT:    por %xmm0, %xmm10
4505; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[0,1,0,3]
4506; SSE-NEXT:    pshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,5,4,7,6]
4507; SSE-NEXT:    psrldq {{.*#+}} xmm12 = xmm12[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4508; SSE-NEXT:    packuswb %xmm12, %xmm8
4509; SSE-NEXT:    movdqa %xmm13, %xmm12
4510; SSE-NEXT:    pand %xmm2, %xmm12
4511; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
4512; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[0,2,2,3]
4513; SSE-NEXT:    pand %xmm2, %xmm10
4514; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4515; SSE-NEXT:    pandn %xmm15, %xmm2
4516; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm15[1,3,2,3]
4517; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1]
4518; SSE-NEXT:    movdqa %xmm0, %xmm10
4519; SSE-NEXT:    punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm1[8],xmm10[9],xmm1[9],xmm10[10],xmm1[10],xmm10[11],xmm1[11],xmm10[12],xmm1[12],xmm10[13],xmm1[13],xmm10[14],xmm1[14],xmm10[15],xmm1[15]
4520; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[2,1,2,3]
4521; SSE-NEXT:    pshuflw {{.*#+}} xmm10 = xmm10[0,2,2,3,4,5,6,7]
4522; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
4523; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
4524; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
4525; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3]
4526; SSE-NEXT:    packuswb %xmm0, %xmm0
4527; SSE-NEXT:    movss {{.*#+}} xmm8 = xmm0[0],xmm8[1,2,3]
4528; SSE-NEXT:    movdqa %xmm3, %xmm13
4529; SSE-NEXT:    movdqa %xmm3, %xmm0
4530; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,0,65535,65535]
4531; SSE-NEXT:    pand %xmm3, %xmm0
4532; SSE-NEXT:    pandn %xmm9, %xmm3
4533; SSE-NEXT:    movdqa %xmm9, %xmm15
4534; SSE-NEXT:    por %xmm0, %xmm3
4535; SSE-NEXT:    movdqa %xmm3, %xmm0
4536; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
4537; SSE-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
4538; SSE-NEXT:    pand %xmm6, %xmm3
4539; SSE-NEXT:    pandn %xmm0, %xmm6
4540; SSE-NEXT:    por %xmm3, %xmm6
4541; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4542; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4543; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
4544; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,7,6]
4545; SSE-NEXT:    packuswb %xmm0, %xmm0
4546; SSE-NEXT:    movdqa %xmm7, %xmm9
4547; SSE-NEXT:    pandn %xmm0, %xmm9
4548; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[2,1,0,3]
4549; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7]
4550; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
4551; SSE-NEXT:    packuswb %xmm0, %xmm0
4552; SSE-NEXT:    pand %xmm7, %xmm0
4553; SSE-NEXT:    por %xmm0, %xmm9
4554; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0]
4555; SSE-NEXT:    movdqa %xmm3, %xmm6
4556; SSE-NEXT:    pandn %xmm9, %xmm6
4557; SSE-NEXT:    andps %xmm3, %xmm8
4558; SSE-NEXT:    por %xmm8, %xmm6
4559; SSE-NEXT:    movdqa %xmm12, %xmm1
4560; SSE-NEXT:    por %xmm11, %xmm1
4561; SSE-NEXT:    movdqa %xmm1, %xmm0
4562; SSE-NEXT:    pxor %xmm9, %xmm9
4563; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
4564; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7]
4565; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15]
4566; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm1[0,1,2,1]
4567; SSE-NEXT:    pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7]
4568; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
4569; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
4570; SSE-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15]
4571; SSE-NEXT:    pxor %xmm1, %xmm1
4572; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
4573; SSE-NEXT:    pandn %xmm8, %xmm10
4574; SSE-NEXT:    movdqa %xmm8, %xmm9
4575; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
4576; SSE-NEXT:    por %xmm10, %xmm8
4577; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,2,1,3]
4578; SSE-NEXT:    pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,7]
4579; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0]
4580; SSE-NEXT:    pshufhw {{.*#+}} xmm10 = xmm8[0,1,2,3,4,7,6,5]
4581; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm9[3,3,3,3]
4582; SSE-NEXT:    packuswb %xmm8, %xmm10
4583; SSE-NEXT:    packuswb %xmm0, %xmm0
4584; SSE-NEXT:    movss {{.*#+}} xmm10 = xmm0[0],xmm10[1,2,3]
4585; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4586; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4587; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
4588; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
4589; SSE-NEXT:    packuswb %xmm0, %xmm0
4590; SSE-NEXT:    movdqa %xmm7, %xmm8
4591; SSE-NEXT:    pandn %xmm0, %xmm8
4592; SSE-NEXT:    pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4593; SSE-NEXT:    # xmm0 = mem[1,3,2,3]
4594; SSE-NEXT:    pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
4595; SSE-NEXT:    # xmm11 = mem[0,2,2,3]
4596; SSE-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1]
4597; SSE-NEXT:    movdqa %xmm11, %xmm0
4598; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
4599; SSE-NEXT:    movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,65535,0,65535,65535]
4600; SSE-NEXT:    movdqa %xmm9, %xmm12
4601; SSE-NEXT:    pandn %xmm0, %xmm12
4602; SSE-NEXT:    punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm1[8],xmm11[9],xmm1[9],xmm11[10],xmm1[10],xmm11[11],xmm1[11],xmm11[12],xmm1[12],xmm11[13],xmm1[13],xmm11[14],xmm1[14],xmm11[15],xmm1[15]
4603; SSE-NEXT:    pand %xmm9, %xmm11
4604; SSE-NEXT:    por %xmm12, %xmm11
4605; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm11[2,1,1,1]
4606; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7]
4607; SSE-NEXT:    packuswb %xmm0, %xmm0
4608; SSE-NEXT:    pand %xmm7, %xmm0
4609; SSE-NEXT:    por %xmm8, %xmm0
4610; SSE-NEXT:    movaps %xmm3, %xmm1
4611; SSE-NEXT:    movdqa %xmm3, %xmm8
4612; SSE-NEXT:    pandn %xmm0, %xmm8
4613; SSE-NEXT:    andps %xmm3, %xmm10
4614; SSE-NEXT:    por %xmm10, %xmm8
4615; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4616; SSE-NEXT:    movdqa %xmm2, %xmm0
4617; SSE-NEXT:    pxor %xmm11, %xmm11
4618; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7]
4619; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7]
4620; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm11[8],xmm2[9],xmm11[9],xmm2[10],xmm11[10],xmm2[11],xmm11[11],xmm2[12],xmm11[12],xmm2[13],xmm11[13],xmm2[14],xmm11[14],xmm2[15],xmm11[15]
4621; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm2[0,1,2,1]
4622; SSE-NEXT:    pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,7,6,7]
4623; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
4624; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4625; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm11[8],xmm2[9],xmm11[9],xmm2[10],xmm11[10],xmm2[11],xmm11[11],xmm2[12],xmm11[12],xmm2[13],xmm11[13],xmm2[14],xmm11[14],xmm2[15],xmm11[15]
4626; SSE-NEXT:    pxor %xmm12, %xmm12
4627; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
4628; SSE-NEXT:    pandn %xmm2, %xmm10
4629; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4630; SSE-NEXT:    por %xmm10, %xmm3
4631; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm3[0,2,1,3]
4632; SSE-NEXT:    pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,7,6,7]
4633; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[0,1,2,0]
4634; SSE-NEXT:    pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,7,6,5]
4635; SSE-NEXT:    pshufd {{.*#+}} xmm11 = xmm2[3,3,3,3]
4636; SSE-NEXT:    packuswb %xmm11, %xmm10
4637; SSE-NEXT:    packuswb %xmm0, %xmm0
4638; SSE-NEXT:    movss {{.*#+}} xmm10 = xmm0[0],xmm10[1,2,3]
4639; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4640; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4641; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm15[1,3,2,3]
4642; SSE-NEXT:    pshufd {{.*#+}} xmm11 = xmm13[0,2,2,3]
4643; SSE-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1]
4644; SSE-NEXT:    movdqa %xmm11, %xmm0
4645; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
4646; SSE-NEXT:    punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15]
4647; SSE-NEXT:    pand %xmm9, %xmm11
4648; SSE-NEXT:    pandn %xmm0, %xmm9
4649; SSE-NEXT:    por %xmm11, %xmm9
4650; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[2,1,1,1]
4651; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7]
4652; SSE-NEXT:    packuswb %xmm0, %xmm0
4653; SSE-NEXT:    pand %xmm7, %xmm0
4654; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm2[0,1,0,3]
4655; SSE-NEXT:    pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7]
4656; SSE-NEXT:    packuswb %xmm9, %xmm9
4657; SSE-NEXT:    pandn %xmm9, %xmm7
4658; SSE-NEXT:    por %xmm7, %xmm0
4659; SSE-NEXT:    andps %xmm1, %xmm10
4660; SSE-NEXT:    andnps %xmm0, %xmm1
4661; SSE-NEXT:    orps %xmm10, %xmm1
4662; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4663; SSE-NEXT:    movaps %xmm0, (%rsi)
4664; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4665; SSE-NEXT:    movaps %xmm0, 16(%rsi)
4666; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4667; SSE-NEXT:    movaps %xmm0, (%rdx)
4668; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4669; SSE-NEXT:    movaps %xmm0, 16(%rdx)
4670; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4671; SSE-NEXT:    movaps %xmm0, (%rcx)
4672; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4673; SSE-NEXT:    movaps %xmm0, 16(%rcx)
4674; SSE-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
4675; SSE-NEXT:    movaps %xmm0, (%r8)
4676; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4677; SSE-NEXT:    movaps %xmm0, 16(%r8)
4678; SSE-NEXT:    movdqa %xmm4, (%r9)
4679; SSE-NEXT:    movdqa %xmm14, 16(%r9)
4680; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
4681; SSE-NEXT:    movdqa %xmm6, (%rax)
4682; SSE-NEXT:    movdqa %xmm5, 16(%rax)
4683; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
4684; SSE-NEXT:    movaps %xmm1, (%rax)
4685; SSE-NEXT:    movdqa %xmm8, 16(%rax)
4686; SSE-NEXT:    addq $648, %rsp # imm = 0x288
4687; SSE-NEXT:    retq
4688;
4689; AVX-LABEL: load_i8_stride7_vf32:
4690; AVX:       # %bb.0:
4691; AVX-NEXT:    subq $200, %rsp
4692; AVX-NEXT:    vmovdqa 176(%rdi), %xmm7
4693; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm7[u,u,u,u,u,u,u],zero,zero,zero,xmm7[6,13,u,u,u,u]
4694; AVX-NEXT:    vmovdqa 160(%rdi), %xmm6
4695; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm6[u,u,u,u]
4696; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm1
4697; AVX-NEXT:    vmovdqa 144(%rdi), %xmm8
4698; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm8[u,u,u],zero,zero,xmm8[3,10,u,u,u,u,u,u,u,u,u]
4699; AVX-NEXT:    vmovdqa 128(%rdi), %xmm9
4700; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm9[u,u,u,5,12],zero,zero,xmm9[u,u,u,u,u,u,u,u,u]
4701; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm2
4702; AVX-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,u,u,0,0,0,0,255,255,255,255,255,u,u,u,u]
4703; AVX-NEXT:    vpblendvb %xmm0, %xmm1, %xmm2, %xmm1
4704; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4705; AVX-NEXT:    vmovdqa (%rdi), %xmm10
4706; AVX-NEXT:    vmovdqa 16(%rdi), %xmm11
4707; AVX-NEXT:    vmovdqa 32(%rdi), %xmm15
4708; AVX-NEXT:    vmovdqa 48(%rdi), %xmm4
4709; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u,u,u,u,u]
4710; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u,u]
4711; AVX-NEXT:    vpor %xmm1, %xmm2, %xmm1
4712; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm15[u,u,u,u,u,3,10],zero,zero,zero,xmm15[u,u,u,u,u,u]
4713; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,u,u],zero,zero,xmm4[1,8,15,u,u,u,u,u,u]
4714; AVX-NEXT:    vpor %xmm2, %xmm3, %xmm2
4715; AVX-NEXT:    vmovq {{.*#+}} xmm3 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
4716; AVX-NEXT:    vpblendvb %xmm3, %xmm1, %xmm2, %xmm1
4717; AVX-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4718; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm6[u,u,u,u]
4719; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[0,7,14,u,u,u,u]
4720; AVX-NEXT:    vpor %xmm1, %xmm2, %xmm1
4721; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm8[u,u,u],zero,zero,xmm8[4,11,u,u,u,u,u,u,u,u,u]
4722; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm9[u,u,u,6,13],zero,zero,xmm9[u,u,u,u,u,u,u,u,u]
4723; AVX-NEXT:    vpor %xmm2, %xmm3, %xmm2
4724; AVX-NEXT:    vpblendvb %xmm0, %xmm1, %xmm2, %xmm0
4725; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4726; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u,u,u,u,u]
4727; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm10[1,8,15],zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u,u]
4728; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm1
4729; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm4[u,u,u,u,u],zero,zero,xmm4[2,9,u,u,u,u,u,u,u]
4730; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm15[u,u,u,u,u,4,11],zero,zero,xmm15[u,u,u,u,u,u,u]
4731; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm2
4732; AVX-NEXT:    vmovq {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
4733; AVX-NEXT:    vpblendvb %xmm0, %xmm1, %xmm2, %xmm1
4734; AVX-NEXT:    vmovdqu %ymm1, (%rsp) # 32-byte Spill
4735; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm8[u,u],zero,zero,zero,xmm8[5,12,u,u,u,u,u,u,u,u,u]
4736; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm9[u,u,0,7,14],zero,zero,xmm9[u,u,u,u,u,u,u,u,u]
4737; AVX-NEXT:    vpor %xmm1, %xmm2, %xmm1
4738; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm6[u,u,u,u]
4739; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[1,8,15,u,u,u,u]
4740; AVX-NEXT:    vpor %xmm2, %xmm3, %xmm2
4741; AVX-NEXT:    vmovq {{.*#+}} xmm3 = [0,0,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
4742; AVX-NEXT:    vpblendvb %xmm3, %xmm1, %xmm2, %xmm1
4743; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4744; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u,u]
4745; AVX-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4746; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm11[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
4747; AVX-NEXT:    vpor %xmm1, %xmm2, %xmm1
4748; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4749; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm4[u,u,u,u,u],zero,zero,xmm4[3,10,u,u,u,u,u,u,u]
4750; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm15[u,u,u,u,u,5,12],zero,zero,xmm15[u,u,u,u,u,u,u]
4751; AVX-NEXT:    vpor %xmm2, %xmm3, %xmm2
4752; AVX-NEXT:    vpblendvb %xmm0, %xmm1, %xmm2, %xmm1
4753; AVX-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4754; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm10[3,10],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u,u]
4755; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm11[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
4756; AVX-NEXT:    vpor %xmm1, %xmm2, %xmm1
4757; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm4[u,u,u,u,u],zero,zero,xmm4[4,11,u,u,u,u,u,u,u]
4758; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm15[u,u,u,u,u,6,13],zero,zero,xmm15[u,u,u,u,u,u,u]
4759; AVX-NEXT:    vpor %xmm2, %xmm3, %xmm2
4760; AVX-NEXT:    vpblendvb %xmm0, %xmm1, %xmm2, %xmm0
4761; AVX-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4762; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm8[u,u],zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u,u,u]
4763; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm9[u,u,1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u,u,u]
4764; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm1
4765; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[2,9,u,u,u,u,u]
4766; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,4,11],zero,zero,xmm6[u,u,u,u,u]
4767; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm2
4768; AVX-NEXT:    vmovq {{.*#+}} xmm0 = [0,0,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
4769; AVX-NEXT:    vpblendvb %xmm0, %xmm1, %xmm2, %xmm2
4770; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm9[u,u,2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u]
4771; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm8[u,u],zero,zero,xmm8[0,7,14,u,u,u,u,u,u,u,u,u]
4772; AVX-NEXT:    vpor %xmm1, %xmm3, %xmm1
4773; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[3,10,u,u,u,u,u]
4774; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,5,12],zero,zero,xmm6[u,u,u,u,u]
4775; AVX-NEXT:    vpor %xmm3, %xmm4, %xmm3
4776; AVX-NEXT:    vpblendvb %xmm0, %xmm1, %xmm3, %xmm3
4777; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm9[u,u,3,10],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u]
4778; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm8[u,u],zero,zero,xmm8[1,8,15,u,u,u,u,u,u,u,u,u]
4779; AVX-NEXT:    vpor %xmm1, %xmm4, %xmm1
4780; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[4,11,u,u,u,u,u]
4781; AVX-NEXT:    vpshufb {{.*#+}} xmm12 = xmm6[u,u,u,u,u,u,u,6,13],zero,zero,xmm6[u,u,u,u,u]
4782; AVX-NEXT:    vpor %xmm4, %xmm12, %xmm4
4783; AVX-NEXT:    vpblendvb %xmm0, %xmm1, %xmm4, %xmm4
4784; AVX-NEXT:    vmovdqa 192(%rdi), %xmm5
4785; AVX-NEXT:    vmovdqa 208(%rdi), %xmm1
4786; AVX-NEXT:    vpshufb {{.*#+}} xmm12 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[5,12]
4787; AVX-NEXT:    vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
4788; AVX-NEXT:    vpor %xmm12, %xmm13, %xmm13
4789; AVX-NEXT:    vpmovsxdq {{.*#+}} xmm12 = [18446744073709486080,16777215]
4790; AVX-NEXT:    vpblendvb %xmm12, %xmm2, %xmm13, %xmm0
4791; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4792; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[6,13]
4793; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
4794; AVX-NEXT:    vpor %xmm2, %xmm14, %xmm2
4795; AVX-NEXT:    vpblendvb %xmm12, %xmm3, %xmm2, %xmm0
4796; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4797; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
4798; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm1[0,7,14]
4799; AVX-NEXT:    vpor %xmm2, %xmm3, %xmm2
4800; AVX-NEXT:    vpblendvb %xmm12, %xmm4, %xmm2, %xmm0
4801; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4802; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm8[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
4803; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm9[u,u,4,11,u,u,u,u,u,u,u,u,u,u,u,u]
4804; AVX-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
4805; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u],zero,zero,zero,xmm7[5,12,u,u,u,u,u]
4806; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,0,7,14],zero,zero,xmm6[u,u,u,u,u]
4807; AVX-NEXT:    vpor %xmm3, %xmm4, %xmm3
4808; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3,4,5,6,7]
4809; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
4810; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm1[1,8,15]
4811; AVX-NEXT:    vpor %xmm3, %xmm4, %xmm3
4812; AVX-NEXT:    vpblendvb %xmm12, %xmm2, %xmm3, %xmm0
4813; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4814; AVX-NEXT:    vbroadcastss {{.*#+}} xmm10 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11]
4815; AVX-NEXT:    vpshufb %xmm10, %xmm5, %xmm2
4816; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
4817; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
4818; AVX-NEXT:    vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload
4819; AVX-NEXT:    # xmm3 = mem[0,1,2,3,4,5],xmm2[6,7]
4820; AVX-NEXT:    vbroadcastss {{.*#+}} xmm7 = [0,0,6,13,0,0,6,13,0,0,6,13,0,0,6,13]
4821; AVX-NEXT:    vmovdqa 64(%rdi), %xmm6
4822; AVX-NEXT:    vpshufb %xmm7, %xmm6, %xmm2
4823; AVX-NEXT:    vmovdqa 80(%rdi), %xmm9
4824; AVX-NEXT:    vpshufb {{.*#+}} xmm8 = xmm9[u,u,u,u,4,11,u,u,u,u,u,u,u,u,u,u]
4825; AVX-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1]
4826; AVX-NEXT:    vmovdqa 96(%rdi), %xmm8
4827; AVX-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7,8,9]
4828; AVX-NEXT:    vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u,u,0,1,2,3,8,15]
4829; AVX-NEXT:    vmovdqa 112(%rdi), %xmm2
4830; AVX-NEXT:    vpshufb {{.*#+}} xmm13 = xmm2[0,7,14,u,u,u,u,u,u,u,u,u,u,u,u,u]
4831; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm12, %ymm12
4832; AVX-NEXT:    vmovaps {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
4833; AVX-NEXT:    vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload
4834; AVX-NEXT:    vandnps %ymm12, %ymm13, %ymm12
4835; AVX-NEXT:    vorps %ymm0, %ymm12, %ymm0
4836; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
4837; AVX-NEXT:    vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
4838; AVX-NEXT:    vandnps %ymm3, %ymm12, %ymm3
4839; AVX-NEXT:    vandps %ymm0, %ymm12, %ymm0
4840; AVX-NEXT:    vorps %ymm3, %ymm0, %ymm0
4841; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4842; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm1[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
4843; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
4844; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
4845; AVX-NEXT:    vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
4846; AVX-NEXT:    # xmm4 = mem[0,1,2,3,4,5],xmm0[6,7]
4847; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm9[u,u,u,u,5,12,u,u,u,u,u,u,u,u,u,u]
4848; AVX-NEXT:    vpshufb {{.*#+}} xmm13 = xmm6[u,u,u,u,u,0,7,14,u,u,u,u,u,u,u,u]
4849; AVX-NEXT:    vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm3[0],xmm13[1],xmm3[1]
4850; AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
4851; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,6],xmm3[7]
4852; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm8[3,10]
4853; AVX-NEXT:    vpor %xmm14, %xmm13, %xmm13
4854; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = xmm2[1,8,15,u,u,u,u,u,u,u,u,u,u,u,u,u]
4855; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm13, %ymm13
4856; AVX-NEXT:    vmovaps {{.*#+}} ymm14 = [0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
4857; AVX-NEXT:    vandnps (%rsp), %ymm14, %ymm0 # 32-byte Folded Reload
4858; AVX-NEXT:    vandps %ymm14, %ymm13, %ymm13
4859; AVX-NEXT:    vorps %ymm0, %ymm13, %ymm0
4860; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
4861; AVX-NEXT:    vandnps %ymm4, %ymm12, %ymm4
4862; AVX-NEXT:    vandps %ymm0, %ymm12, %ymm0
4863; AVX-NEXT:    vorps %ymm4, %ymm0, %ymm0
4864; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4865; AVX-NEXT:    vpshufb %xmm10, %xmm1, %xmm0
4866; AVX-NEXT:    vpshufb %xmm7, %xmm5, %xmm1
4867; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
4868; AVX-NEXT:    vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
4869; AVX-NEXT:    # xmm0 = mem[0,1,2,3,4,5],xmm0[6,7]
4870; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm9[u,u,u,u,u,u,u,u,u,u,u,u,6,13,u,u]
4871; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15]
4872; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
4873; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm3[7]
4874; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm8[4,11]
4875; AVX-NEXT:    vpor %xmm4, %xmm1, %xmm1
4876; AVX-NEXT:    vmovd {{.*#+}} xmm12 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
4877; AVX-NEXT:    vpshufb %xmm12, %xmm2, %xmm4
4878; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm4
4879; AVX-NEXT:    vmovaps {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
4880; AVX-NEXT:    vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload
4881; AVX-NEXT:    vandnps %ymm4, %ymm1, %ymm4
4882; AVX-NEXT:    vorps %ymm4, %ymm5, %ymm4
4883; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm5
4884; AVX-NEXT:    vmovaps {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
4885; AVX-NEXT:    vandnps %ymm5, %ymm13, %ymm5
4886; AVX-NEXT:    vandps %ymm4, %ymm13, %ymm4
4887; AVX-NEXT:    vorps %ymm5, %ymm4, %ymm0
4888; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4889; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm6[u,u]
4890; AVX-NEXT:    vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u],zero,zero,xmm9[0,7,14,u,u]
4891; AVX-NEXT:    vpor %xmm4, %xmm7, %xmm4
4892; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm3[7]
4893; AVX-NEXT:    vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm8[5,12]
4894; AVX-NEXT:    vpor %xmm7, %xmm4, %xmm7
4895; AVX-NEXT:    vmovd {{.*#+}} xmm4 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
4896; AVX-NEXT:    vpshufb %xmm4, %xmm2, %xmm10
4897; AVX-NEXT:    vinsertf128 $1, %xmm10, %ymm7, %ymm7
4898; AVX-NEXT:    vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload
4899; AVX-NEXT:    vandnps %ymm7, %ymm1, %ymm7
4900; AVX-NEXT:    vorps %ymm7, %ymm10, %ymm7
4901; AVX-NEXT:    vandps %ymm7, %ymm13, %ymm7
4902; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 16-byte Folded Reload
4903; AVX-NEXT:    vandnps %ymm10, %ymm13, %ymm10
4904; AVX-NEXT:    vorps %ymm7, %ymm10, %ymm0
4905; AVX-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
4906; AVX-NEXT:    vpshufb %xmm12, %xmm11, %xmm7
4907; AVX-NEXT:    vmovd {{.*#+}} xmm14 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
4908; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4909; AVX-NEXT:    vpshufb %xmm14, %xmm5, %xmm10
4910; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3]
4911; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4912; AVX-NEXT:    vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u],zero,zero,zero,xmm0[5,12,u,u,u,u,u,u,u]
4913; AVX-NEXT:    vpshufb {{.*#+}} xmm12 = xmm15[u,u,u,u,0,7,14],zero,zero,xmm15[u,u,u,u,u,u,u]
4914; AVX-NEXT:    vpor %xmm10, %xmm12, %xmm10
4915; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3,4,5,6,7]
4916; AVX-NEXT:    vpshufb {{.*#+}} xmm10 = xmm6[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm6[u,u]
4917; AVX-NEXT:    vpshufb {{.*#+}} xmm12 = xmm9[u,u,u,u,u,u,u,u,u],zero,zero,xmm9[1,8,15,u,u]
4918; AVX-NEXT:    vpor %xmm10, %xmm12, %xmm10
4919; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm10[0,1,2,3,4,5,6],xmm3[7]
4920; AVX-NEXT:    vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm8[6,13]
4921; AVX-NEXT:    vpor %xmm3, %xmm10, %xmm3
4922; AVX-NEXT:    vpshufb %xmm14, %xmm2, %xmm10
4923; AVX-NEXT:    vinsertf128 $1, %xmm10, %ymm3, %ymm3
4924; AVX-NEXT:    vandps %ymm1, %ymm7, %ymm7
4925; AVX-NEXT:    vandnps %ymm3, %ymm1, %ymm3
4926; AVX-NEXT:    vorps %ymm3, %ymm7, %ymm3
4927; AVX-NEXT:    vandps %ymm3, %ymm13, %ymm3
4928; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 16-byte Folded Reload
4929; AVX-NEXT:    vandnps %ymm7, %ymm13, %ymm7
4930; AVX-NEXT:    vorps %ymm7, %ymm3, %ymm3
4931; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4932; AVX-NEXT:    vpshufb %xmm4, %xmm11, %xmm4
4933; AVX-NEXT:    vmovd {{.*#+}} xmm7 = [5,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
4934; AVX-NEXT:    vpshufb %xmm7, %xmm5, %xmm10
4935; AVX-NEXT:    vmovdqa %xmm5, %xmm3
4936; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3]
4937; AVX-NEXT:    vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u],zero,zero,zero,xmm0[6,13,u,u,u,u,u,u,u]
4938; AVX-NEXT:    vpshufb {{.*#+}} xmm12 = xmm15[u,u,u,u,1,8,15],zero,zero,xmm15[u,u,u,u,u,u,u]
4939; AVX-NEXT:    vpor %xmm10, %xmm12, %xmm10
4940; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm10[2,3,4,5,6,7]
4941; AVX-NEXT:    vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u],zero,zero,xmm9[2,9,u,u,u]
4942; AVX-NEXT:    vpshufb {{.*#+}} xmm12 = xmm6[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm6[u,u,u]
4943; AVX-NEXT:    vpor %xmm10, %xmm12, %xmm10
4944; AVX-NEXT:    vmovddup {{.*#+}} xmm12 = [0,9,10,11,12,128,128,128,0,9,10,11,12,128,128,128]
4945; AVX-NEXT:    # xmm12 = mem[0,0]
4946; AVX-NEXT:    vpshufb %xmm12, %xmm10, %xmm10
4947; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm8[0,7,14]
4948; AVX-NEXT:    vpor %xmm5, %xmm10, %xmm5
4949; AVX-NEXT:    vpshufb %xmm7, %xmm2, %xmm7
4950; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm5, %ymm5
4951; AVX-NEXT:    vandps %ymm1, %ymm4, %ymm4
4952; AVX-NEXT:    vandnps %ymm5, %ymm1, %ymm5
4953; AVX-NEXT:    vorps %ymm5, %ymm4, %ymm4
4954; AVX-NEXT:    vandps %ymm4, %ymm13, %ymm4
4955; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload
4956; AVX-NEXT:    vandnps %ymm5, %ymm13, %ymm5
4957; AVX-NEXT:    vorps %ymm5, %ymm4, %ymm4
4958; AVX-NEXT:    vpshufb %xmm14, %xmm11, %xmm5
4959; AVX-NEXT:    vmovd {{.*#+}} xmm7 = [6,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
4960; AVX-NEXT:    vpshufb %xmm7, %xmm3, %xmm10
4961; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
4962; AVX-NEXT:    vpshufb {{.*#+}} xmm10 = xmm15[u,u,u,u,2,9],zero,zero,zero,xmm15[u,u,u,u,u,u,u]
4963; AVX-NEXT:    vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u],zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u]
4964; AVX-NEXT:    vpor %xmm10, %xmm11, %xmm10
4965; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm10[2,3,4,5,6,7]
4966; AVX-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
4967; AVX-NEXT:    vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u],zero,zero,xmm9[3,10,u,u,u]
4968; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm6[u,u,u]
4969; AVX-NEXT:    vpor %xmm7, %xmm6, %xmm6
4970; AVX-NEXT:    vpshufb %xmm12, %xmm6, %xmm6
4971; AVX-NEXT:    vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm8[1,8,15]
4972; AVX-NEXT:    vpor %xmm7, %xmm6, %xmm6
4973; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm6, %ymm2
4974; AVX-NEXT:    vandps %ymm1, %ymm5, %ymm5
4975; AVX-NEXT:    vandnps %ymm2, %ymm1, %ymm1
4976; AVX-NEXT:    vorps %ymm1, %ymm5, %ymm1
4977; AVX-NEXT:    vandps %ymm1, %ymm13, %ymm1
4978; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 16-byte Folded Reload
4979; AVX-NEXT:    vandnps %ymm2, %ymm13, %ymm0
4980; AVX-NEXT:    vorps %ymm0, %ymm1, %ymm0
4981; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4982; AVX-NEXT:    vmovaps %ymm1, (%rsi)
4983; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4984; AVX-NEXT:    vmovaps %ymm1, (%rdx)
4985; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4986; AVX-NEXT:    vmovaps %ymm1, (%rcx)
4987; AVX-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
4988; AVX-NEXT:    vmovaps %ymm1, (%r8)
4989; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4990; AVX-NEXT:    vmovaps %ymm1, (%r9)
4991; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
4992; AVX-NEXT:    vmovaps %ymm4, (%rax)
4993; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
4994; AVX-NEXT:    vmovaps %ymm0, (%rax)
4995; AVX-NEXT:    addq $200, %rsp
4996; AVX-NEXT:    vzeroupper
4997; AVX-NEXT:    retq
4998;
4999; AVX2-LABEL: load_i8_stride7_vf32:
5000; AVX2:       # %bb.0:
5001; AVX2-NEXT:    subq $72, %rsp
5002; AVX2-NEXT:    vmovdqa 160(%rdi), %ymm10
5003; AVX2-NEXT:    vmovdqa 128(%rdi), %ymm11
5004; AVX2-NEXT:    vmovdqa (%rdi), %ymm6
5005; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm7
5006; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm13
5007; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm3
5008; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm15 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
5009; AVX2-NEXT:    vpblendvb %ymm15, %ymm6, %ymm7, %ymm0
5010; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
5011; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u]
5012; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u]
5013; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
5014; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
5015; AVX2-NEXT:    vpblendvb %ymm14, %ymm3, %ymm13, %ymm1
5016; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm4
5017; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15]
5018; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
5019; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0]
5020; AVX2-NEXT:    vpblendvb %ymm4, %ymm0, %ymm1, %ymm0
5021; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
5022; AVX2-NEXT:    vpblendvb %ymm1, %ymm10, %ymm11, %ymm1
5023; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm4
5024; AVX2-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u]
5025; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
5026; AVX2-NEXT:    vpor %xmm4, %xmm1, %xmm1
5027; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
5028; AVX2-NEXT:    vpblendvb %ymm2, %ymm6, %ymm7, %ymm4
5029; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm5
5030; AVX2-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u,u,u]
5031; AVX2-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[1,8,15],zero,zero,xmm4[4,11],zero,zero,xmm4[u,u,u,u,u,u,u]
5032; AVX2-NEXT:    vpor %xmm5, %xmm4, %xmm4
5033; AVX2-NEXT:    vpblendvb %ymm15, %ymm13, %ymm3, %ymm5
5034; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm8
5035; AVX2-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm8[2],ymm5[3,4,5],ymm8[6],ymm5[7,8,9],ymm8[10],ymm5[11,12,13],ymm8[14],ymm5[15]
5036; AVX2-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
5037; AVX2-NEXT:    vpmovsxdq {{.*#+}} ymm8 = [0,18446744073709551360,16777215,0]
5038; AVX2-NEXT:    vpblendvb %ymm8, %ymm5, %ymm4, %ymm8
5039; AVX2-NEXT:    vmovdqa 192(%rdi), %xmm4
5040; AVX2-NEXT:    vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u]
5041; AVX2-NEXT:    vmovdqa 208(%rdi), %xmm5
5042; AVX2-NEXT:    vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
5043; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3]
5044; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
5045; AVX2-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
5046; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm9[7]
5047; AVX2-NEXT:    vpmovsxdq {{.*#+}} ymm9 = [18446744073709551615,18446744073709551615,16777215,0]
5048; AVX2-NEXT:    vpblendvb %ymm9, %ymm0, %ymm1, %ymm0
5049; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5050; AVX2-NEXT:    vpblendvb %ymm14, %ymm10, %ymm11, %ymm0
5051; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u]
5052; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
5053; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u]
5054; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
5055; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
5056; AVX2-NEXT:    vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
5057; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3]
5058; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
5059; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
5060; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
5061; AVX2-NEXT:    vpblendvb %ymm9, %ymm8, %ymm0, %ymm0
5062; AVX2-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
5063; AVX2-NEXT:    vpblendvb %ymm2, %ymm11, %ymm10, %ymm0
5064; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
5065; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[2,9,u,u,u,u,u]
5066; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u]
5067; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
5068; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,12]
5069; AVX2-NEXT:    vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
5070; AVX2-NEXT:    vpor %xmm1, %xmm8, %xmm1
5071; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm8
5072; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
5073; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
5074; AVX2-NEXT:    # ymm0 = mem[0,1,0,1]
5075; AVX2-NEXT:    vpblendvb %ymm0, %ymm8, %ymm1, %ymm1
5076; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5077; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm9 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
5078; AVX2-NEXT:    vpblendvb %ymm9, %ymm11, %ymm10, %ymm1
5079; AVX2-NEXT:    vpshufb {{.*#+}} xmm8 = xmm1[u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u,u,u]
5080; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
5081; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u,u,u]
5082; AVX2-NEXT:    vpor %xmm1, %xmm8, %xmm1
5083; AVX2-NEXT:    vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[6,13]
5084; AVX2-NEXT:    vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
5085; AVX2-NEXT:    vpor %xmm8, %xmm12, %xmm8
5086; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
5087; AVX2-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
5088; AVX2-NEXT:    vpblendvb %ymm0, %ymm1, %ymm8, %ymm1
5089; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5090; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535]
5091; AVX2-NEXT:    vpblendvb %ymm8, %ymm11, %ymm10, %ymm1
5092; AVX2-NEXT:    vpshufb {{.*#+}} xmm12 = xmm1[u,u,3,10],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[u,u,u,u,u]
5093; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
5094; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11,u,u,u,u,u]
5095; AVX2-NEXT:    vpor %xmm1, %xmm12, %xmm1
5096; AVX2-NEXT:    vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
5097; AVX2-NEXT:    vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,7,14]
5098; AVX2-NEXT:    vpor %xmm12, %xmm14, %xmm12
5099; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
5100; AVX2-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
5101; AVX2-NEXT:    vpblendvb %ymm0, %ymm1, %ymm12, %ymm1
5102; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5103; AVX2-NEXT:    vpblendvb %ymm15, %ymm11, %ymm10, %ymm14
5104; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm12 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
5105; AVX2-NEXT:    vpblendvb %ymm12, %ymm10, %ymm11, %ymm10
5106; AVX2-NEXT:    vextracti128 $1, %ymm10, %xmm11
5107; AVX2-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u]
5108; AVX2-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u,4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u]
5109; AVX2-NEXT:    vpor %xmm11, %xmm10, %xmm10
5110; AVX2-NEXT:    vpshufb {{.*#+}} xmm11 = xmm4[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
5111; AVX2-NEXT:    vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,8,15]
5112; AVX2-NEXT:    vpor %xmm11, %xmm15, %xmm11
5113; AVX2-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
5114; AVX2-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
5115; AVX2-NEXT:    vpblendvb %ymm0, %ymm10, %ymm11, %ymm0
5116; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5117; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
5118; AVX2-NEXT:    vpblendvb %ymm0, %ymm7, %ymm6, %ymm10
5119; AVX2-NEXT:    vpblendvb %ymm9, %ymm6, %ymm7, %ymm11
5120; AVX2-NEXT:    vpblendvb %ymm8, %ymm6, %ymm7, %ymm15
5121; AVX2-NEXT:    vpblendvb %ymm12, %ymm7, %ymm6, %ymm1
5122; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
5123; AVX2-NEXT:    vpblendvb %ymm2, %ymm7, %ymm6, %ymm6
5124; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
5125; AVX2-NEXT:    vpblendvb %ymm0, %ymm13, %ymm3, %ymm0
5126; AVX2-NEXT:    vpblendvb %ymm9, %ymm13, %ymm3, %ymm9
5127; AVX2-NEXT:    vpblendvb %ymm8, %ymm13, %ymm3, %ymm8
5128; AVX2-NEXT:    vpblendvb %ymm12, %ymm3, %ymm13, %ymm7
5129; AVX2-NEXT:    vpblendvb %ymm2, %ymm3, %ymm13, %ymm2
5130; AVX2-NEXT:    vpshufb {{.*#+}} xmm3 = xmm11[2,9],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[u,u,u,u,u,u,u]
5131; AVX2-NEXT:    vextracti128 $1, %ymm11, %xmm11
5132; AVX2-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[0,7,14],zero,zero,xmm11[3,10,u,u,u,u,u,u,u]
5133; AVX2-NEXT:    vpor %xmm3, %xmm11, %xmm3
5134; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm11
5135; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3],ymm0[4,5],ymm11[6],ymm0[7,8,9,10],ymm11[11],ymm0[12,13],ymm11[14],ymm0[15]
5136; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
5137; AVX2-NEXT:    vpmovsxwq {{.*#+}} xmm11 = [18446744073709551615,255]
5138; AVX2-NEXT:    vpblendvb %ymm11, %ymm3, %ymm0, %ymm0
5139; AVX2-NEXT:    vpshufb {{.*#+}} xmm3 = xmm15[3,10],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[u,u,u,u,u,u,u]
5140; AVX2-NEXT:    vextracti128 $1, %ymm15, %xmm13
5141; AVX2-NEXT:    vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u,u,u]
5142; AVX2-NEXT:    vpor %xmm3, %xmm13, %xmm3
5143; AVX2-NEXT:    vextracti128 $1, %ymm9, %xmm13
5144; AVX2-NEXT:    vpblendw {{.*#+}} ymm9 = ymm13[0],ymm9[1,2],ymm13[3],ymm9[4,5,6],ymm13[7,8],ymm9[9,10],ymm13[11],ymm9[12,13,14],ymm13[15]
5145; AVX2-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
5146; AVX2-NEXT:    vpblendvb %ymm11, %ymm3, %ymm9, %ymm3
5147; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm9
5148; AVX2-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u,u,u,u,u,u]
5149; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,11],zero,zero,xmm1[0,7,14],zero,zero,xmm1[u,u,u,u,u,u,u]
5150; AVX2-NEXT:    vpor %xmm1, %xmm9, %xmm1
5151; AVX2-NEXT:    vextracti128 $1, %ymm8, %xmm9
5152; AVX2-NEXT:    vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3],ymm9[4],ymm8[5,6],ymm9[7,8],ymm8[9,10,11],ymm9[12],ymm8[13,14],ymm9[15]
5153; AVX2-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
5154; AVX2-NEXT:    vpblendvb %ymm11, %ymm1, %ymm8, %ymm9
5155; AVX2-NEXT:    vextracti128 $1, %ymm6, %xmm1
5156; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[3,10],zero,zero,zero,xmm1[6,13,u,u,u,u,u,u,u]
5157; AVX2-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[5,12],zero,zero,xmm6[1,8,15],zero,zero,xmm6[u,u,u,u,u,u,u]
5158; AVX2-NEXT:    vpor %xmm1, %xmm6, %xmm1
5159; AVX2-NEXT:    vextracti128 $1, %ymm7, %xmm6
5160; AVX2-NEXT:    vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13,14,15]
5161; AVX2-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
5162; AVX2-NEXT:    vpblendvb %ymm11, %ymm1, %ymm6, %ymm1
5163; AVX2-NEXT:    vpshufb {{.*#+}} xmm6 = xmm10[6,13],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u]
5164; AVX2-NEXT:    vextracti128 $1, %ymm10, %xmm7
5165; AVX2-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[4,11],zero,zero,xmm7[0,7,14,u,u,u,u,u,u,u]
5166; AVX2-NEXT:    vpor %xmm6, %xmm7, %xmm6
5167; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm7
5168; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm7[1],ymm2[2,3,4],ymm7[5],ymm2[6,7,8],ymm7[9],ymm2[10,11,12],ymm7[13],ymm2[14,15]
5169; AVX2-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
5170; AVX2-NEXT:    vpblendvb %ymm11, %ymm6, %ymm2, %ymm2
5171; AVX2-NEXT:    vextracti128 $1, %ymm14, %xmm6
5172; AVX2-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u]
5173; AVX2-NEXT:    vpshufb {{.*#+}} xmm7 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u]
5174; AVX2-NEXT:    vpor %xmm6, %xmm7, %xmm6
5175; AVX2-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u]
5176; AVX2-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
5177; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
5178; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm5
5179; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
5180; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
5181; AVX2-NEXT:    vpblendw {{.*#+}} ymm4 = ymm0[0],ymm4[1,2,3,4,5,6,7],ymm0[8],ymm4[9,10,11,12,13,14,15]
5182; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
5183; AVX2-NEXT:    vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload
5184; AVX2-NEXT:    # ymm4 = ymm3[0],mem[1,2,3,4,5,6,7],ymm3[8],mem[9,10,11,12,13,14,15]
5185; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
5186; AVX2-NEXT:    vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload
5187; AVX2-NEXT:    # ymm4 = ymm9[0],mem[1,2,3,4,5,6,7],ymm9[8],mem[9,10,11,12,13,14,15]
5188; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7]
5189; AVX2-NEXT:    vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload
5190; AVX2-NEXT:    # ymm5 = ymm1[0],mem[1,2,3,4,5,6,7],ymm1[8],mem[9,10,11,12,13,14,15]
5191; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
5192; AVX2-NEXT:    vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload
5193; AVX2-NEXT:    # ymm5 = ymm2[0],mem[1,2,3,4,5,6,7],ymm2[8],mem[9,10,11,12,13,14,15]
5194; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
5195; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5196; AVX2-NEXT:    vmovaps %ymm5, (%rsi)
5197; AVX2-NEXT:    vmovups (%rsp), %ymm5 # 32-byte Reload
5198; AVX2-NEXT:    vmovaps %ymm5, (%rdx)
5199; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
5200; AVX2-NEXT:    vmovdqa %ymm3, (%r8)
5201; AVX2-NEXT:    vmovdqa %ymm4, (%r9)
5202; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
5203; AVX2-NEXT:    vmovdqa %ymm1, (%rax)
5204; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
5205; AVX2-NEXT:    vmovdqa %ymm2, (%rax)
5206; AVX2-NEXT:    addq $72, %rsp
5207; AVX2-NEXT:    vzeroupper
5208; AVX2-NEXT:    retq
5209;
5210; AVX2-FP-LABEL: load_i8_stride7_vf32:
5211; AVX2-FP:       # %bb.0:
5212; AVX2-FP-NEXT:    subq $72, %rsp
5213; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %ymm10
5214; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %ymm11
5215; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm6
5216; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm7
5217; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %ymm13
5218; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm3
5219; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm15 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
5220; AVX2-FP-NEXT:    vpblendvb %ymm15, %ymm6, %ymm7, %ymm0
5221; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm1
5222; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u]
5223; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u]
5224; AVX2-FP-NEXT:    vpor %xmm1, %xmm0, %xmm0
5225; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
5226; AVX2-FP-NEXT:    vpblendvb %ymm14, %ymm3, %ymm13, %ymm1
5227; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm4
5228; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15]
5229; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
5230; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0]
5231; AVX2-FP-NEXT:    vpblendvb %ymm4, %ymm0, %ymm1, %ymm0
5232; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
5233; AVX2-FP-NEXT:    vpblendvb %ymm1, %ymm10, %ymm11, %ymm1
5234; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm4
5235; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u]
5236; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
5237; AVX2-FP-NEXT:    vpor %xmm4, %xmm1, %xmm1
5238; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
5239; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm6, %ymm7, %ymm4
5240; AVX2-FP-NEXT:    vextracti128 $1, %ymm4, %xmm5
5241; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u,u,u]
5242; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[1,8,15],zero,zero,xmm4[4,11],zero,zero,xmm4[u,u,u,u,u,u,u]
5243; AVX2-FP-NEXT:    vpor %xmm5, %xmm4, %xmm4
5244; AVX2-FP-NEXT:    vpblendvb %ymm15, %ymm13, %ymm3, %ymm5
5245; AVX2-FP-NEXT:    vextracti128 $1, %ymm5, %xmm8
5246; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm8[2],ymm5[3,4,5],ymm8[6],ymm5[7,8,9],ymm8[10],ymm5[11,12,13],ymm8[14],ymm5[15]
5247; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
5248; AVX2-FP-NEXT:    vpmovsxdq {{.*#+}} ymm8 = [0,18446744073709551360,16777215,0]
5249; AVX2-FP-NEXT:    vpblendvb %ymm8, %ymm5, %ymm4, %ymm8
5250; AVX2-FP-NEXT:    vmovdqa 192(%rdi), %xmm4
5251; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u]
5252; AVX2-FP-NEXT:    vmovdqa 208(%rdi), %xmm5
5253; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
5254; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3]
5255; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
5256; AVX2-FP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
5257; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm9[7]
5258; AVX2-FP-NEXT:    vpmovsxdq {{.*#+}} ymm9 = [18446744073709551615,18446744073709551615,16777215,0]
5259; AVX2-FP-NEXT:    vpblendvb %ymm9, %ymm0, %ymm1, %ymm0
5260; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5261; AVX2-FP-NEXT:    vpblendvb %ymm14, %ymm10, %ymm11, %ymm0
5262; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u]
5263; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm0
5264; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u]
5265; AVX2-FP-NEXT:    vpor %xmm1, %xmm0, %xmm0
5266; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
5267; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
5268; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3]
5269; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
5270; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
5271; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
5272; AVX2-FP-NEXT:    vpblendvb %ymm9, %ymm8, %ymm0, %ymm0
5273; AVX2-FP-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
5274; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm11, %ymm10, %ymm0
5275; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm1
5276; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[2,9,u,u,u,u,u]
5277; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u]
5278; AVX2-FP-NEXT:    vpor %xmm1, %xmm0, %xmm0
5279; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,12]
5280; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
5281; AVX2-FP-NEXT:    vpor %xmm1, %xmm8, %xmm1
5282; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm8
5283; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
5284; AVX2-FP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
5285; AVX2-FP-NEXT:    # ymm0 = mem[0,1,0,1]
5286; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm8, %ymm1, %ymm1
5287; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5288; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm9 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
5289; AVX2-FP-NEXT:    vpblendvb %ymm9, %ymm11, %ymm10, %ymm1
5290; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm1[u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u,u,u]
5291; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm1
5292; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u,u,u]
5293; AVX2-FP-NEXT:    vpor %xmm1, %xmm8, %xmm1
5294; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[6,13]
5295; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
5296; AVX2-FP-NEXT:    vpor %xmm8, %xmm12, %xmm8
5297; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
5298; AVX2-FP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
5299; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm1, %ymm8, %ymm1
5300; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5301; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535]
5302; AVX2-FP-NEXT:    vpblendvb %ymm8, %ymm11, %ymm10, %ymm1
5303; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm1[u,u,3,10],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[u,u,u,u,u]
5304; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm1
5305; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11,u,u,u,u,u]
5306; AVX2-FP-NEXT:    vpor %xmm1, %xmm12, %xmm1
5307; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
5308; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,7,14]
5309; AVX2-FP-NEXT:    vpor %xmm12, %xmm14, %xmm12
5310; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
5311; AVX2-FP-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
5312; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm1, %ymm12, %ymm1
5313; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5314; AVX2-FP-NEXT:    vpblendvb %ymm15, %ymm11, %ymm10, %ymm14
5315; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm12 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
5316; AVX2-FP-NEXT:    vpblendvb %ymm12, %ymm10, %ymm11, %ymm10
5317; AVX2-FP-NEXT:    vextracti128 $1, %ymm10, %xmm11
5318; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u]
5319; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u,4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u]
5320; AVX2-FP-NEXT:    vpor %xmm11, %xmm10, %xmm10
5321; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm4[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
5322; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,8,15]
5323; AVX2-FP-NEXT:    vpor %xmm11, %xmm15, %xmm11
5324; AVX2-FP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
5325; AVX2-FP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
5326; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm10, %ymm11, %ymm0
5327; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5328; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
5329; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm7, %ymm6, %ymm10
5330; AVX2-FP-NEXT:    vpblendvb %ymm9, %ymm6, %ymm7, %ymm11
5331; AVX2-FP-NEXT:    vpblendvb %ymm8, %ymm6, %ymm7, %ymm15
5332; AVX2-FP-NEXT:    vpblendvb %ymm12, %ymm7, %ymm6, %ymm1
5333; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
5334; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm7, %ymm6, %ymm6
5335; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
5336; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm13, %ymm3, %ymm0
5337; AVX2-FP-NEXT:    vpblendvb %ymm9, %ymm13, %ymm3, %ymm9
5338; AVX2-FP-NEXT:    vpblendvb %ymm8, %ymm13, %ymm3, %ymm8
5339; AVX2-FP-NEXT:    vpblendvb %ymm12, %ymm3, %ymm13, %ymm7
5340; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm3, %ymm13, %ymm2
5341; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm11[2,9],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[u,u,u,u,u,u,u]
5342; AVX2-FP-NEXT:    vextracti128 $1, %ymm11, %xmm11
5343; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[0,7,14],zero,zero,xmm11[3,10,u,u,u,u,u,u,u]
5344; AVX2-FP-NEXT:    vpor %xmm3, %xmm11, %xmm3
5345; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm11
5346; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3],ymm0[4,5],ymm11[6],ymm0[7,8,9,10],ymm11[11],ymm0[12,13],ymm11[14],ymm0[15]
5347; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
5348; AVX2-FP-NEXT:    vpmovsxwq {{.*#+}} xmm11 = [18446744073709551615,255]
5349; AVX2-FP-NEXT:    vpblendvb %ymm11, %ymm3, %ymm0, %ymm0
5350; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm15[3,10],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[u,u,u,u,u,u,u]
5351; AVX2-FP-NEXT:    vextracti128 $1, %ymm15, %xmm13
5352; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u,u,u]
5353; AVX2-FP-NEXT:    vpor %xmm3, %xmm13, %xmm3
5354; AVX2-FP-NEXT:    vextracti128 $1, %ymm9, %xmm13
5355; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm13[0],ymm9[1,2],ymm13[3],ymm9[4,5,6],ymm13[7,8],ymm9[9,10],ymm13[11],ymm9[12,13,14],ymm13[15]
5356; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
5357; AVX2-FP-NEXT:    vpblendvb %ymm11, %ymm3, %ymm9, %ymm3
5358; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm9
5359; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u,u,u,u,u,u]
5360; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,11],zero,zero,xmm1[0,7,14],zero,zero,xmm1[u,u,u,u,u,u,u]
5361; AVX2-FP-NEXT:    vpor %xmm1, %xmm9, %xmm1
5362; AVX2-FP-NEXT:    vextracti128 $1, %ymm8, %xmm9
5363; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3],ymm9[4],ymm8[5,6],ymm9[7,8],ymm8[9,10,11],ymm9[12],ymm8[13,14],ymm9[15]
5364; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
5365; AVX2-FP-NEXT:    vpblendvb %ymm11, %ymm1, %ymm8, %ymm9
5366; AVX2-FP-NEXT:    vextracti128 $1, %ymm6, %xmm1
5367; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[3,10],zero,zero,zero,xmm1[6,13,u,u,u,u,u,u,u]
5368; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[5,12],zero,zero,xmm6[1,8,15],zero,zero,xmm6[u,u,u,u,u,u,u]
5369; AVX2-FP-NEXT:    vpor %xmm1, %xmm6, %xmm1
5370; AVX2-FP-NEXT:    vextracti128 $1, %ymm7, %xmm6
5371; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13,14,15]
5372; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
5373; AVX2-FP-NEXT:    vpblendvb %ymm11, %ymm1, %ymm6, %ymm1
5374; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm10[6,13],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u]
5375; AVX2-FP-NEXT:    vextracti128 $1, %ymm10, %xmm7
5376; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[4,11],zero,zero,xmm7[0,7,14,u,u,u,u,u,u,u]
5377; AVX2-FP-NEXT:    vpor %xmm6, %xmm7, %xmm6
5378; AVX2-FP-NEXT:    vextracti128 $1, %ymm2, %xmm7
5379; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm7[1],ymm2[2,3,4],ymm7[5],ymm2[6,7,8],ymm7[9],ymm2[10,11,12],ymm7[13],ymm2[14,15]
5380; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
5381; AVX2-FP-NEXT:    vpblendvb %ymm11, %ymm6, %ymm2, %ymm2
5382; AVX2-FP-NEXT:    vextracti128 $1, %ymm14, %xmm6
5383; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u]
5384; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u]
5385; AVX2-FP-NEXT:    vpor %xmm6, %xmm7, %xmm6
5386; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u]
5387; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
5388; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
5389; AVX2-FP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm5
5390; AVX2-FP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
5391; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
5392; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm0[0],ymm4[1,2,3,4,5,6,7],ymm0[8],ymm4[9,10,11,12,13,14,15]
5393; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
5394; AVX2-FP-NEXT:    vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload
5395; AVX2-FP-NEXT:    # ymm4 = ymm3[0],mem[1,2,3,4,5,6,7],ymm3[8],mem[9,10,11,12,13,14,15]
5396; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
5397; AVX2-FP-NEXT:    vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload
5398; AVX2-FP-NEXT:    # ymm4 = ymm9[0],mem[1,2,3,4,5,6,7],ymm9[8],mem[9,10,11,12,13,14,15]
5399; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7]
5400; AVX2-FP-NEXT:    vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload
5401; AVX2-FP-NEXT:    # ymm5 = ymm1[0],mem[1,2,3,4,5,6,7],ymm1[8],mem[9,10,11,12,13,14,15]
5402; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
5403; AVX2-FP-NEXT:    vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload
5404; AVX2-FP-NEXT:    # ymm5 = ymm2[0],mem[1,2,3,4,5,6,7],ymm2[8],mem[9,10,11,12,13,14,15]
5405; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
5406; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5407; AVX2-FP-NEXT:    vmovaps %ymm5, (%rsi)
5408; AVX2-FP-NEXT:    vmovups (%rsp), %ymm5 # 32-byte Reload
5409; AVX2-FP-NEXT:    vmovaps %ymm5, (%rdx)
5410; AVX2-FP-NEXT:    vmovdqa %ymm0, (%rcx)
5411; AVX2-FP-NEXT:    vmovdqa %ymm3, (%r8)
5412; AVX2-FP-NEXT:    vmovdqa %ymm4, (%r9)
5413; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
5414; AVX2-FP-NEXT:    vmovdqa %ymm1, (%rax)
5415; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
5416; AVX2-FP-NEXT:    vmovdqa %ymm2, (%rax)
5417; AVX2-FP-NEXT:    addq $72, %rsp
5418; AVX2-FP-NEXT:    vzeroupper
5419; AVX2-FP-NEXT:    retq
5420;
5421; AVX2-FCP-LABEL: load_i8_stride7_vf32:
5422; AVX2-FCP:       # %bb.0:
5423; AVX2-FCP-NEXT:    subq $40, %rsp
5424; AVX2-FCP-NEXT:    vmovdqa 192(%rdi), %ymm10
5425; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %ymm8
5426; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %ymm9
5427; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm5
5428; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm6
5429; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %ymm3
5430; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm4
5431; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm13 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
5432; AVX2-FCP-NEXT:    vpblendvb %ymm13, %ymm5, %ymm6, %ymm0
5433; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
5434; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u]
5435; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u]
5436; AVX2-FCP-NEXT:    vpor %xmm1, %xmm0, %xmm0
5437; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
5438; AVX2-FCP-NEXT:    vpblendvb %ymm11, %ymm4, %ymm3, %ymm1
5439; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm2
5440; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
5441; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
5442; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0]
5443; AVX2-FCP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
5444; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
5445; AVX2-FCP-NEXT:    vpblendvb %ymm1, %ymm8, %ymm9, %ymm1
5446; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm2
5447; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u]
5448; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
5449; AVX2-FCP-NEXT:    vpor %xmm2, %xmm1, %xmm1
5450; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
5451; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,2,4,6]
5452; AVX2-FCP-NEXT:    vpermd %ymm10, %ymm2, %ymm2
5453; AVX2-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5454; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
5455; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
5456; AVX2-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm2 = [18446744073709551615,18446744073709551615,16777215,0]
5457; AVX2-FCP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
5458; AVX2-FCP-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
5459; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm12 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
5460; AVX2-FCP-NEXT:    vpblendvb %ymm12, %ymm5, %ymm6, %ymm0
5461; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
5462; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[6,13],zero,zero,xmm1[2,9,u,u,u,u,u,u,u]
5463; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u,u,u]
5464; AVX2-FCP-NEXT:    vpor %xmm1, %xmm0, %xmm0
5465; AVX2-FCP-NEXT:    vpblendvb %ymm13, %ymm3, %ymm4, %ymm1
5466; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm7
5467; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm7[2],ymm1[3,4,5],ymm7[6],ymm1[7,8,9],ymm7[10],ymm1[11,12,13],ymm7[14],ymm1[15]
5468; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
5469; AVX2-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm7 = [0,18446744073709551360,16777215,0]
5470; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
5471; AVX2-FCP-NEXT:    vpblendvb %ymm11, %ymm8, %ymm9, %ymm1
5472; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u]
5473; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm1
5474; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u]
5475; AVX2-FCP-NEXT:    vpor %xmm7, %xmm1, %xmm1
5476; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
5477; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,1,3,4,6]
5478; AVX2-FCP-NEXT:    vpermd %ymm10, %ymm7, %ymm7
5479; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
5480; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm7[7]
5481; AVX2-FCP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
5482; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5483; AVX2-FCP-NEXT:    vpblendvb %ymm12, %ymm9, %ymm8, %ymm0
5484; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
5485; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[2,9,u,u,u,u,u]
5486; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u]
5487; AVX2-FCP-NEXT:    vpor %xmm1, %xmm0, %xmm1
5488; AVX2-FCP-NEXT:    vmovdqa 208(%rdi), %xmm2
5489; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,12]
5490; AVX2-FCP-NEXT:    vmovdqa 192(%rdi), %xmm0
5491; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
5492; AVX2-FCP-NEXT:    vpor %xmm7, %xmm10, %xmm7
5493; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm10
5494; AVX2-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
5495; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
5496; AVX2-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
5497; AVX2-FCP-NEXT:    vpblendvb %ymm1, %ymm10, %ymm7, %ymm7
5498; AVX2-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5499; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm13 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
5500; AVX2-FCP-NEXT:    vpblendvb %ymm13, %ymm9, %ymm8, %ymm7
5501; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm7[u,u,2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u]
5502; AVX2-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm7
5503; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u]
5504; AVX2-FCP-NEXT:    vpor %xmm7, %xmm10, %xmm7
5505; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[6,13]
5506; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm0[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
5507; AVX2-FCP-NEXT:    vpor %xmm10, %xmm14, %xmm10
5508; AVX2-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
5509; AVX2-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
5510; AVX2-FCP-NEXT:    vpblendvb %ymm1, %ymm7, %ymm10, %ymm7
5511; AVX2-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5512; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm10 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535]
5513; AVX2-FCP-NEXT:    vpblendvb %ymm10, %ymm9, %ymm8, %ymm7
5514; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm7[u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[u,u,u,u,u]
5515; AVX2-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm7
5516; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11,u,u,u,u,u]
5517; AVX2-FCP-NEXT:    vpor %xmm7, %xmm15, %xmm7
5518; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
5519; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,7,14]
5520; AVX2-FCP-NEXT:    vpor %xmm15, %xmm12, %xmm12
5521; AVX2-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
5522; AVX2-FCP-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
5523; AVX2-FCP-NEXT:    vpblendvb %ymm1, %ymm7, %ymm12, %ymm15
5524; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
5525; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm9, %ymm8, %ymm12
5526; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
5527; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm8, %ymm9, %ymm8
5528; AVX2-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm9
5529; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u,u,u,u]
5530; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u]
5531; AVX2-FCP-NEXT:    vpor %xmm9, %xmm8, %xmm8
5532; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
5533; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,8,15]
5534; AVX2-FCP-NEXT:    vpor %xmm0, %xmm2, %xmm0
5535; AVX2-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm2
5536; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
5537; AVX2-FCP-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm14
5538; AVX2-FCP-NEXT:    vpblendvb %ymm11, %ymm6, %ymm5, %ymm11
5539; AVX2-FCP-NEXT:    vpblendvb %ymm13, %ymm5, %ymm6, %ymm2
5540; AVX2-FCP-NEXT:    vpblendvb %ymm10, %ymm5, %ymm6, %ymm8
5541; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm6, %ymm5, %ymm9
5542; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
5543; AVX2-FCP-NEXT:    vpblendvb %ymm1, %ymm6, %ymm5, %ymm5
5544; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
5545; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm3, %ymm4, %ymm0
5546; AVX2-FCP-NEXT:    vpblendvb %ymm13, %ymm3, %ymm4, %ymm13
5547; AVX2-FCP-NEXT:    vpblendvb %ymm10, %ymm3, %ymm4, %ymm10
5548; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm4, %ymm3, %ymm6
5549; AVX2-FCP-NEXT:    vpblendvb %ymm1, %ymm4, %ymm3, %ymm3
5550; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u]
5551; AVX2-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm2
5552; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u,u,u]
5553; AVX2-FCP-NEXT:    vpor %xmm4, %xmm2, %xmm2
5554; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm4
5555; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7,8,9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15]
5556; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
5557; AVX2-FCP-NEXT:    vpmovsxwq {{.*#+}} xmm7 = [18446744073709551615,255]
5558; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm2, %ymm0, %ymm2
5559; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm8[3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u,u,u]
5560; AVX2-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm4
5561; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11,u,u,u,u,u,u,u]
5562; AVX2-FCP-NEXT:    vpor %xmm0, %xmm4, %xmm0
5563; AVX2-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm4
5564; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0],ymm13[1,2],ymm4[3],ymm13[4,5,6],ymm4[7,8],ymm13[9,10],ymm4[11],ymm13[12,13,14],ymm4[15]
5565; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
5566; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm0, %ymm4, %ymm4
5567; AVX2-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm0
5568; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[2,9],zero,zero,zero,xmm0[5,12,u,u,u,u,u,u,u]
5569; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm9[4,11],zero,zero,xmm9[0,7,14],zero,zero,xmm9[u,u,u,u,u,u,u]
5570; AVX2-FCP-NEXT:    vpor %xmm0, %xmm8, %xmm0
5571; AVX2-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm8
5572; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm8[0],ymm10[1,2,3],ymm8[4],ymm10[5,6],ymm8[7,8],ymm10[9,10,11],ymm8[12],ymm10[13,14],ymm8[15]
5573; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
5574; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm0, %ymm8, %ymm0
5575; AVX2-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm8
5576; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u]
5577; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[5,12],zero,zero,xmm5[1,8,15],zero,zero,xmm5[u,u,u,u,u,u,u]
5578; AVX2-FCP-NEXT:    vpor %xmm5, %xmm8, %xmm5
5579; AVX2-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm8
5580; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5,6,7,8],ymm8[9],ymm6[10,11],ymm8[12],ymm6[13,14,15]
5581; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
5582; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm5, %ymm6, %ymm5
5583; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm11[6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u,u,u,u]
5584; AVX2-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm1
5585; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u]
5586; AVX2-FCP-NEXT:    vpor %xmm6, %xmm1, %xmm1
5587; AVX2-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm6
5588; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7,8],ymm6[9],ymm3[10,11,12],ymm6[13],ymm3[14,15]
5589; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
5590; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm1, %ymm3, %ymm1
5591; AVX2-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm3
5592; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u,u,u]
5593; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u]
5594; AVX2-FCP-NEXT:    vpor %xmm3, %xmm6, %xmm3
5595; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,1,3,5,6]
5596; AVX2-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
5597; AVX2-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
5598; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
5599; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7]
5600; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm2[0],ymm3[1,2,3,4,5,6,7],ymm2[8],ymm3[9,10,11,12,13,14,15]
5601; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
5602; AVX2-FCP-NEXT:    vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload
5603; AVX2-FCP-NEXT:    # ymm3 = ymm4[0],mem[1,2,3,4,5,6,7],ymm4[8],mem[9,10,11,12,13,14,15]
5604; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
5605; AVX2-FCP-NEXT:    vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
5606; AVX2-FCP-NEXT:    # ymm4 = ymm0[0],mem[1,2,3,4,5,6,7],ymm0[8],mem[9,10,11,12,13,14,15]
5607; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
5608; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0],ymm15[1,2,3,4,5,6,7],ymm5[8],ymm15[9,10,11,12,13,14,15]
5609; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
5610; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm1[0],ymm14[1,2,3,4,5,6,7],ymm1[8],ymm14[9,10,11,12,13,14,15]
5611; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
5612; AVX2-FCP-NEXT:    vmovups (%rsp), %ymm5 # 32-byte Reload
5613; AVX2-FCP-NEXT:    vmovaps %ymm5, (%rsi)
5614; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5615; AVX2-FCP-NEXT:    vmovaps %ymm5, (%rdx)
5616; AVX2-FCP-NEXT:    vmovdqa %ymm2, (%rcx)
5617; AVX2-FCP-NEXT:    vmovdqa %ymm3, (%r8)
5618; AVX2-FCP-NEXT:    vmovdqa %ymm0, (%r9)
5619; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
5620; AVX2-FCP-NEXT:    vmovdqa %ymm4, (%rax)
5621; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
5622; AVX2-FCP-NEXT:    vmovdqa %ymm1, (%rax)
5623; AVX2-FCP-NEXT:    addq $40, %rsp
5624; AVX2-FCP-NEXT:    vzeroupper
5625; AVX2-FCP-NEXT:    retq
5626;
5627; AVX512-LABEL: load_i8_stride7_vf32:
5628; AVX512:       # %bb.0:
5629; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
5630; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
5631; AVX512-NEXT:    vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
5632; AVX512-NEXT:    vmovdqa 128(%rdi), %ymm2
5633; AVX512-NEXT:    vmovdqa 160(%rdi), %ymm3
5634; AVX512-NEXT:    vmovdqa %ymm0, %ymm1
5635; AVX512-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm2 ^ (ymm1 & (ymm3 ^ ymm2))
5636; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm4
5637; AVX512-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u]
5638; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
5639; AVX512-NEXT:    vpor %xmm4, %xmm1, %xmm1
5640; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
5641; AVX512-NEXT:    vmovdqa 192(%rdi), %xmm4
5642; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11]
5643; AVX512-NEXT:    vpshufb %xmm5, %xmm4, %xmm6
5644; AVX512-NEXT:    vmovdqa64 %xmm5, %xmm20
5645; AVX512-NEXT:    vmovdqa 208(%rdi), %xmm5
5646; AVX512-NEXT:    vpshufb {{.*#+}} xmm7 = xmm5[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
5647; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
5648; AVX512-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
5649; AVX512-NEXT:    vpblendd {{.*#+}} ymm12 = ymm1[0,1,2,3,4,5,6],ymm6[7]
5650; AVX512-NEXT:    vmovdqa {{.*#+}} ymm14 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
5651; AVX512-NEXT:    vmovdqa (%rdi), %ymm6
5652; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm7
5653; AVX512-NEXT:    vmovdqa 64(%rdi), %ymm1
5654; AVX512-NEXT:    vmovdqa %ymm14, %ymm9
5655; AVX512-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm7 ^ (ymm9 & (ymm6 ^ ymm7))
5656; AVX512-NEXT:    vextracti128 $1, %ymm9, %xmm10
5657; AVX512-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[5,12],zero,zero,xmm10[1,8,15,u,u,u,u,u,u]
5658; AVX512-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[0,7,14],zero,zero,xmm9[3,10],zero,zero,zero,xmm9[u,u,u,u,u,u]
5659; AVX512-NEXT:    vpor %xmm10, %xmm9, %xmm13
5660; AVX512-NEXT:    vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
5661; AVX512-NEXT:    vmovdqa 96(%rdi), %ymm9
5662; AVX512-NEXT:    vmovdqa %ymm11, %ymm15
5663; AVX512-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm1 ^ (ymm15 & (ymm9 ^ ymm1))
5664; AVX512-NEXT:    vmovdqa 80(%rdi), %xmm10
5665; AVX512-NEXT:    vpblendw {{.*#+}} ymm15 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7,8,9],ymm10[10],ymm15[11,12],ymm10[13],ymm15[14,15]
5666; AVX512-NEXT:    vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
5667; AVX512-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm13 & mem)
5668; AVX512-NEXT:    vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0]
5669; AVX512-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm12 ^ (ymm16 & (ymm8 ^ ymm12))
5670; AVX512-NEXT:    vmovdqa64 %ymm8, %ymm18
5671; AVX512-NEXT:    vmovdqa %ymm11, %ymm12
5672; AVX512-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm2 ^ (ymm12 & (ymm3 ^ ymm2))
5673; AVX512-NEXT:    vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,6,13],zero,zero,xmm12[2,9],zero,zero,zero,xmm12[u,u,u,u]
5674; AVX512-NEXT:    vextracti128 $1, %ymm12, %xmm12
5675; AVX512-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,xmm12[4,11],zero,zero,xmm12[0,7,14,u,u,u,u]
5676; AVX512-NEXT:    vpor %xmm13, %xmm12, %xmm12
5677; AVX512-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
5678; AVX512-NEXT:    vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
5679; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
5680; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3]
5681; AVX512-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
5682; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5,6],ymm8[7]
5683; AVX512-NEXT:    vmovdqa {{.*#+}} ymm13 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
5684; AVX512-NEXT:    vmovdqa %ymm13, %ymm12
5685; AVX512-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm7 ^ (ymm12 & (ymm6 ^ ymm7))
5686; AVX512-NEXT:    vextracti128 $1, %ymm12, %xmm15
5687; AVX512-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u,u,u]
5688; AVX512-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[1,8,15],zero,zero,xmm12[4,11],zero,zero,xmm12[u,u,u,u,u,u,u]
5689; AVX512-NEXT:    vpor %xmm15, %xmm12, %xmm15
5690; AVX512-NEXT:    vmovdqa %ymm14, %ymm12
5691; AVX512-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm9 ^ (ymm12 & (ymm1 ^ ymm9))
5692; AVX512-NEXT:    vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm10[2],ymm12[3,4,5],ymm10[6],ymm12[7,8,9],ymm10[10],ymm12[11,12,13],ymm10[14],ymm12[15]
5693; AVX512-NEXT:    vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
5694; AVX512-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm12 | (ymm15 & ~mem)
5695; AVX512-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm8 ^ (ymm16 & (ymm12 ^ ymm8))
5696; AVX512-NEXT:    vmovdqa64 %ymm12, %ymm19
5697; AVX512-NEXT:    vmovdqa %ymm0, %ymm8
5698; AVX512-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm7 ^ (ymm8 & (ymm6 ^ ymm7))
5699; AVX512-NEXT:    vpshufb {{.*#+}} xmm15 = xmm8[2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u,u,u]
5700; AVX512-NEXT:    vextracti128 $1, %ymm8, %xmm8
5701; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u,u,u,u,u]
5702; AVX512-NEXT:    vpor %xmm15, %xmm8, %xmm8
5703; AVX512-NEXT:    vmovdqa %ymm13, %ymm15
5704; AVX512-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm9 ^ (ymm15 & (ymm1 ^ ymm9))
5705; AVX512-NEXT:    vpblendw {{.*#+}} ymm15 = ymm15[0,1,2],ymm10[3],ymm15[4,5],ymm10[6],ymm15[7,8,9,10],ymm10[11],ymm15[12,13],ymm10[14],ymm15[15]
5706; AVX512-NEXT:    vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
5707; AVX512-NEXT:    vpmovsxdq {{.*#+}} ymm17 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
5708; AVX512-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm8 & ymm17)
5709; AVX512-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm3 ^ (ymm14 & (ymm2 ^ ymm3))
5710; AVX512-NEXT:    vextracti128 $1, %ymm14, %xmm8
5711; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[1,8,15,u,u,u,u]
5712; AVX512-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u]
5713; AVX512-NEXT:    vpor %xmm8, %xmm14, %xmm8
5714; AVX512-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
5715; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm12
5716; AVX512-NEXT:    vpshufb %xmm12, %xmm5, %xmm14
5717; AVX512-NEXT:    vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
5718; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3]
5719; AVX512-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
5720; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm12[7]
5721; AVX512-NEXT:    vpblendw {{.*#+}} ymm8 = ymm15[0],ymm8[1,2,3,4,5,6,7],ymm15[8],ymm8[9,10,11,12,13,14,15]
5722; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7]
5723; AVX512-NEXT:    vmovdqa64 %ymm8, %ymm20
5724; AVX512-NEXT:    vmovdqa %ymm13, %ymm8
5725; AVX512-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm3 ^ (ymm8 & (ymm2 ^ ymm3))
5726; AVX512-NEXT:    vextracti128 $1, %ymm8, %xmm12
5727; AVX512-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u]
5728; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,1,8,15],zero,zero,xmm8[4,11],zero,zero,xmm8[u,u,u,u,u]
5729; AVX512-NEXT:    vpor %xmm12, %xmm8, %xmm8
5730; AVX512-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
5731; AVX512-NEXT:    vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,12]
5732; AVX512-NEXT:    vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
5733; AVX512-NEXT:    vpor %xmm12, %xmm14, %xmm12
5734; AVX512-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
5735; AVX512-NEXT:    vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
5736; AVX512-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm16 & (ymm12 ^ ymm8))
5737; AVX512-NEXT:    vmovdqa %ymm11, %ymm8
5738; AVX512-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm7 ^ (ymm8 & (ymm6 ^ ymm7))
5739; AVX512-NEXT:    vpshufb {{.*#+}} xmm14 = xmm8[3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u,u,u]
5740; AVX512-NEXT:    vextracti128 $1, %ymm8, %xmm8
5741; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u,u,u]
5742; AVX512-NEXT:    vpor %xmm14, %xmm8, %xmm8
5743; AVX512-NEXT:    vmovdqa %ymm0, %ymm14
5744; AVX512-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm9 ^ (ymm14 & (ymm1 ^ ymm9))
5745; AVX512-NEXT:    vpblendw {{.*#+}} ymm14 = ymm10[0],ymm14[1,2],ymm10[3],ymm14[4,5,6],ymm10[7,8],ymm14[9,10],ymm10[11],ymm14[12,13,14],ymm10[15]
5746; AVX512-NEXT:    vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
5747; AVX512-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm8 & ymm17)
5748; AVX512-NEXT:    vpblendw {{.*#+}} ymm8 = ymm14[0],ymm12[1,2,3,4,5,6,7],ymm14[8],ymm12[9,10,11,12,13,14,15]
5749; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
5750; AVX512-NEXT:    vmovdqa64 %ymm8, %ymm21
5751; AVX512-NEXT:    vmovdqa %ymm0, %ymm8
5752; AVX512-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm3 ^ (ymm8 & (ymm2 ^ ymm3))
5753; AVX512-NEXT:    vpshufb {{.*#+}} xmm12 = xmm8[u,u,2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u]
5754; AVX512-NEXT:    vextracti128 $1, %ymm8, %xmm8
5755; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u,u,u]
5756; AVX512-NEXT:    vpor %xmm12, %xmm8, %xmm8
5757; AVX512-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
5758; AVX512-NEXT:    vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[6,13]
5759; AVX512-NEXT:    vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
5760; AVX512-NEXT:    vpor %xmm12, %xmm14, %xmm12
5761; AVX512-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
5762; AVX512-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm16 & (ymm12 ^ ymm8))
5763; AVX512-NEXT:    vmovdqa %ymm13, %ymm8
5764; AVX512-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm6 ^ (ymm8 & (ymm7 ^ ymm6))
5765; AVX512-NEXT:    vextracti128 $1, %ymm8, %xmm14
5766; AVX512-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12,u,u,u,u,u,u,u]
5767; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u]
5768; AVX512-NEXT:    vpor %xmm14, %xmm8, %xmm8
5769; AVX512-NEXT:    vmovdqa %ymm11, %ymm14
5770; AVX512-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm9 ^ (ymm14 & (ymm1 ^ ymm9))
5771; AVX512-NEXT:    vpblendw {{.*#+}} ymm14 = ymm10[0],ymm14[1,2,3],ymm10[4],ymm14[5,6],ymm10[7,8],ymm14[9,10,11],ymm10[12],ymm14[13,14],ymm10[15]
5772; AVX512-NEXT:    vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
5773; AVX512-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm8 & ymm17)
5774; AVX512-NEXT:    vpblendw {{.*#+}} ymm8 = ymm14[0],ymm12[1,2,3,4,5,6,7],ymm14[8],ymm12[9,10,11,12,13,14,15]
5775; AVX512-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm8[4,5,6,7]
5776; AVX512-NEXT:    vmovdqa %ymm11, %ymm8
5777; AVX512-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm3 ^ (ymm8 & (ymm2 ^ ymm3))
5778; AVX512-NEXT:    vpshufb {{.*#+}} xmm12 = xmm8[u,u,3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u]
5779; AVX512-NEXT:    vextracti128 $1, %ymm8, %xmm8
5780; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u]
5781; AVX512-NEXT:    vpor %xmm12, %xmm8, %xmm8
5782; AVX512-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
5783; AVX512-NEXT:    vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
5784; AVX512-NEXT:    vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,7,14]
5785; AVX512-NEXT:    vpor %xmm12, %xmm15, %xmm12
5786; AVX512-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
5787; AVX512-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm16 & (ymm12 ^ ymm8))
5788; AVX512-NEXT:    vmovdqa %ymm0, %ymm8
5789; AVX512-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm6 ^ (ymm8 & (ymm7 ^ ymm6))
5790; AVX512-NEXT:    vextracti128 $1, %ymm8, %xmm15
5791; AVX512-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u]
5792; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[5,12],zero,zero,xmm8[1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u]
5793; AVX512-NEXT:    vpor %xmm15, %xmm8, %xmm8
5794; AVX512-NEXT:    vmovdqa %ymm13, %ymm15
5795; AVX512-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm1 ^ (ymm15 & (ymm9 ^ ymm1))
5796; AVX512-NEXT:    vpblendw {{.*#+}} ymm15 = ymm15[0],ymm10[1],ymm15[2,3],ymm10[4],ymm15[5,6,7,8],ymm10[9],ymm15[10,11],ymm10[12],ymm15[13,14,15]
5797; AVX512-NEXT:    vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
5798; AVX512-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm8 & ymm17)
5799; AVX512-NEXT:    vpblendw {{.*#+}} ymm8 = ymm15[0],ymm12[1,2,3,4,5,6,7],ymm15[8],ymm12[9,10,11,12,13,14,15]
5800; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7]
5801; AVX512-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm2 ^ (ymm13 & (ymm3 ^ ymm2))
5802; AVX512-NEXT:    vextracti128 $1, %ymm13, %xmm2
5803; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u]
5804; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm13[u,u,4,11],zero,zero,xmm13[0,7,14],zero,zero,xmm13[u,u,u,u,u]
5805; AVX512-NEXT:    vpor %xmm2, %xmm3, %xmm2
5806; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
5807; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
5808; AVX512-NEXT:    vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,8,15]
5809; AVX512-NEXT:    vpor %xmm3, %xmm4, %xmm3
5810; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
5811; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm16 & (ymm3 ^ ymm2))
5812; AVX512-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm6 ^ (ymm11 & (ymm7 ^ ymm6))
5813; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm11[6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u,u,u,u]
5814; AVX512-NEXT:    vextracti128 $1, %ymm11, %xmm4
5815; AVX512-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[4,11],zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u]
5816; AVX512-NEXT:    vpor %xmm2, %xmm4, %xmm2
5817; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm1 ^ (ymm0 & (ymm9 ^ ymm1))
5818; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7,8],ymm10[9],ymm0[10,11,12],ymm10[13],ymm0[14,15]
5819; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
5820; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm2 & ymm17)
5821; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm3[1,2,3,4,5,6,7],ymm0[8],ymm3[9,10,11,12,13,14,15]
5822; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5823; AVX512-NEXT:    vmovdqa64 %ymm18, (%rsi)
5824; AVX512-NEXT:    vmovdqa64 %ymm19, (%rdx)
5825; AVX512-NEXT:    vmovdqa64 %ymm20, (%rcx)
5826; AVX512-NEXT:    vmovdqa64 %ymm21, (%r8)
5827; AVX512-NEXT:    vmovdqa %ymm14, (%r9)
5828; AVX512-NEXT:    vmovdqa %ymm8, (%r10)
5829; AVX512-NEXT:    vmovdqa %ymm0, (%rax)
5830; AVX512-NEXT:    vzeroupper
5831; AVX512-NEXT:    retq
5832;
5833; AVX512-FCP-LABEL: load_i8_stride7_vf32:
5834; AVX512-FCP:       # %bb.0:
5835; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
5836; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
5837; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
5838; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %ymm2
5839; AVX512-FCP-NEXT:    vmovdqa 160(%rdi), %ymm3
5840; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm1
5841; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm2 ^ (ymm1 & (ymm3 ^ ymm2))
5842; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm4
5843; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u]
5844; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
5845; AVX512-FCP-NEXT:    vpor %xmm4, %xmm1, %xmm1
5846; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
5847; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,2,4,6]
5848; AVX512-FCP-NEXT:    vmovdqa 192(%rdi), %ymm12
5849; AVX512-FCP-NEXT:    vpermd %ymm12, %ymm4, %ymm4
5850; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
5851; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5,6],ymm4[7]
5852; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
5853; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm4
5854; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm5
5855; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm1
5856; AVX512-FCP-NEXT:    vmovdqa %ymm13, %ymm7
5857; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm5 ^ (ymm7 & (ymm4 ^ ymm5))
5858; AVX512-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm8
5859; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[5,12],zero,zero,xmm8[1,8,15,u,u,u,u,u,u]
5860; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[0,7,14],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[u,u,u,u,u,u]
5861; AVX512-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm10
5862; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
5863; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %ymm7
5864; AVX512-FCP-NEXT:    vmovdqa %ymm9, %ymm11
5865; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm1 ^ (ymm11 & (ymm7 ^ ymm1))
5866; AVX512-FCP-NEXT:    vmovdqa 80(%rdi), %xmm8
5867; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm8[2],ymm11[3,4],ymm8[5],ymm11[6,7,8,9],ymm8[10],ymm11[11,12],ymm8[13],ymm11[14,15]
5868; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
5869; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm10 & mem)
5870; AVX512-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0]
5871; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm6 ^ (ymm16 & (ymm11 ^ ymm6))
5872; AVX512-FCP-NEXT:    vmovdqa64 %ymm11, %ymm18
5873; AVX512-FCP-NEXT:    vmovdqa %ymm9, %ymm6
5874; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm2 ^ (ymm6 & (ymm3 ^ ymm2))
5875; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm6[u,u,u,6,13],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[u,u,u,u]
5876; AVX512-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm6
5877; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,xmm6[4,11],zero,zero,xmm6[0,7,14,u,u,u,u]
5878; AVX512-FCP-NEXT:    vpor %xmm6, %xmm10, %xmm6
5879; AVX512-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
5880; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,1,3,4,6]
5881; AVX512-FCP-NEXT:    vpermd %ymm12, %ymm10, %ymm10
5882; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
5883; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7]
5884; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
5885; AVX512-FCP-NEXT:    vmovdqa %ymm11, %ymm10
5886; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm5 ^ (ymm10 & (ymm4 ^ ymm5))
5887; AVX512-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm14
5888; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[6,13],zero,zero,xmm14[2,9,u,u,u,u,u,u,u]
5889; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[1,8,15],zero,zero,xmm10[4,11],zero,zero,xmm10[u,u,u,u,u,u,u]
5890; AVX512-FCP-NEXT:    vpor %xmm14, %xmm10, %xmm14
5891; AVX512-FCP-NEXT:    vmovdqa %ymm13, %ymm10
5892; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm7 ^ (ymm10 & (ymm1 ^ ymm7))
5893; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm8[2],ymm10[3,4,5],ymm8[6],ymm10[7,8,9],ymm8[10],ymm10[11,12,13],ymm8[14],ymm10[15]
5894; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
5895; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm10 | (ymm14 & ~mem)
5896; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm6 ^ (ymm16 & (ymm10 ^ ymm6))
5897; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm6
5898; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm5 ^ (ymm6 & (ymm4 ^ ymm5))
5899; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm6[2,9],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[u,u,u,u,u,u,u]
5900; AVX512-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm6
5901; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10,u,u,u,u,u,u,u]
5902; AVX512-FCP-NEXT:    vpor %xmm6, %xmm14, %xmm6
5903; AVX512-FCP-NEXT:    vmovdqa %ymm11, %ymm14
5904; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm7 ^ (ymm14 & (ymm1 ^ ymm7))
5905; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm14 = ymm14[0,1,2],ymm8[3],ymm14[4,5],ymm8[6],ymm14[7,8,9,10],ymm8[11],ymm14[12,13],ymm8[14],ymm14[15]
5906; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
5907; AVX512-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm17 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
5908; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm6 & ymm17)
5909; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm3 ^ (ymm13 & (ymm2 ^ ymm3))
5910; AVX512-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm6
5911; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u]
5912; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,0,7,14],zero,zero,xmm13[3,10],zero,zero,zero,xmm13[u,u,u,u]
5913; AVX512-FCP-NEXT:    vpor %xmm6, %xmm13, %xmm6
5914; AVX512-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
5915; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [0,0,0,0,1,3,5,6]
5916; AVX512-FCP-NEXT:    vpermd %ymm12, %ymm13, %ymm12
5917; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
5918; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm12[7]
5919; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm14[0],ymm6[1,2,3,4,5,6,7],ymm14[8],ymm6[9,10,11,12,13,14,15]
5920; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm6[4,5,6,7]
5921; AVX512-FCP-NEXT:    vmovdqa64 %ymm6, %ymm19
5922; AVX512-FCP-NEXT:    vmovdqa %ymm11, %ymm6
5923; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm3 ^ (ymm6 & (ymm2 ^ ymm3))
5924; AVX512-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm12
5925; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u]
5926; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u]
5927; AVX512-FCP-NEXT:    vpor %xmm6, %xmm12, %xmm6
5928; AVX512-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
5929; AVX512-FCP-NEXT:    vmovdqa 208(%rdi), %xmm14
5930; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm14[5,12]
5931; AVX512-FCP-NEXT:    vmovdqa 192(%rdi), %xmm12
5932; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
5933; AVX512-FCP-NEXT:    vpor %xmm13, %xmm15, %xmm13
5934; AVX512-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
5935; AVX512-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
5936; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm16 & (ymm13 ^ ymm6))
5937; AVX512-FCP-NEXT:    vmovdqa %ymm9, %ymm6
5938; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm5 ^ (ymm6 & (ymm4 ^ ymm5))
5939; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm6[3,10],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[u,u,u,u,u,u,u]
5940; AVX512-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm6
5941; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,11,u,u,u,u,u,u,u]
5942; AVX512-FCP-NEXT:    vpor %xmm6, %xmm15, %xmm6
5943; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm15
5944; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm7 ^ (ymm15 & (ymm1 ^ ymm7))
5945; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm15 = ymm8[0],ymm15[1,2],ymm8[3],ymm15[4,5,6],ymm8[7,8],ymm15[9,10],ymm8[11],ymm15[12,13,14],ymm8[15]
5946; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
5947; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm6 & ymm17)
5948; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm15[0],ymm13[1,2,3,4,5,6,7],ymm15[8],ymm13[9,10,11,12,13,14,15]
5949; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm6[4,5,6,7]
5950; AVX512-FCP-NEXT:    vmovdqa64 %ymm6, %ymm20
5951; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm6
5952; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm3 ^ (ymm6 & (ymm2 ^ ymm3))
5953; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm6[u,u,2,9],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[u,u,u,u,u]
5954; AVX512-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm6
5955; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10,u,u,u,u,u]
5956; AVX512-FCP-NEXT:    vpor %xmm6, %xmm13, %xmm6
5957; AVX512-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
5958; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm14[6,13]
5959; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
5960; AVX512-FCP-NEXT:    vpor %xmm13, %xmm15, %xmm13
5961; AVX512-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
5962; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm16 & (ymm13 ^ ymm6))
5963; AVX512-FCP-NEXT:    vmovdqa %ymm11, %ymm6
5964; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm4 ^ (ymm6 & (ymm5 ^ ymm4))
5965; AVX512-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm15
5966; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u,u,u,u,u,u]
5967; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u,u,u]
5968; AVX512-FCP-NEXT:    vpor %xmm6, %xmm15, %xmm6
5969; AVX512-FCP-NEXT:    vmovdqa %ymm9, %ymm15
5970; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm7 ^ (ymm15 & (ymm1 ^ ymm7))
5971; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm15 = ymm8[0],ymm15[1,2,3],ymm8[4],ymm15[5,6],ymm8[7,8],ymm15[9,10,11],ymm8[12],ymm15[13,14],ymm8[15]
5972; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
5973; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm6 & ymm17)
5974; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm15[0],ymm13[1,2,3,4,5,6,7],ymm15[8],ymm13[9,10,11,12,13,14,15]
5975; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm6[4,5,6,7]
5976; AVX512-FCP-NEXT:    vmovdqa64 %ymm6, %ymm21
5977; AVX512-FCP-NEXT:    vmovdqa %ymm9, %ymm6
5978; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm3 ^ (ymm6 & (ymm2 ^ ymm3))
5979; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm6[u,u,3,10],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[u,u,u,u,u]
5980; AVX512-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm6
5981; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,11,u,u,u,u,u]
5982; AVX512-FCP-NEXT:    vpor %xmm6, %xmm15, %xmm6
5983; AVX512-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
5984; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
5985; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm14[0,7,14]
5986; AVX512-FCP-NEXT:    vpor %xmm15, %xmm13, %xmm13
5987; AVX512-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
5988; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm16 & (ymm13 ^ ymm6))
5989; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm6
5990; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm4 ^ (ymm6 & (ymm5 ^ ymm4))
5991; AVX512-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm15
5992; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u]
5993; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[5,12],zero,zero,xmm6[1,8,15],zero,zero,xmm6[u,u,u,u,u,u,u]
5994; AVX512-FCP-NEXT:    vpor %xmm6, %xmm15, %xmm6
5995; AVX512-FCP-NEXT:    vmovdqa %ymm11, %ymm15
5996; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm1 ^ (ymm15 & (ymm7 ^ ymm1))
5997; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm15 = ymm15[0],ymm8[1],ymm15[2,3],ymm8[4],ymm15[5,6,7,8],ymm8[9],ymm15[10,11],ymm8[12],ymm15[13,14,15]
5998; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
5999; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm6 & ymm17)
6000; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm15[0],ymm13[1,2,3,4,5,6,7],ymm15[8],ymm13[9,10,11,12,13,14,15]
6001; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm6[4,5,6,7]
6002; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm2 ^ (ymm11 & (ymm3 ^ ymm2))
6003; AVX512-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm2
6004; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u]
6005; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm11[u,u,4,11],zero,zero,xmm11[0,7,14],zero,zero,xmm11[u,u,u,u,u]
6006; AVX512-FCP-NEXT:    vpor %xmm2, %xmm3, %xmm2
6007; AVX512-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
6008; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm12[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
6009; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm14[1,8,15]
6010; AVX512-FCP-NEXT:    vpor %xmm3, %xmm11, %xmm3
6011; AVX512-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
6012; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm16 & (ymm3 ^ ymm2))
6013; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm4 ^ (ymm9 & (ymm5 ^ ymm4))
6014; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm9[6,13],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u]
6015; AVX512-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm4
6016; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[4,11],zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u]
6017; AVX512-FCP-NEXT:    vpor %xmm2, %xmm4, %xmm2
6018; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm1 ^ (ymm0 & (ymm7 ^ ymm1))
6019; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7,8],ymm8[9],ymm0[10,11,12],ymm8[13],ymm0[14,15]
6020; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6021; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm2 & ymm17)
6022; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm3[1,2,3,4,5,6,7],ymm0[8],ymm3[9,10,11,12,13,14,15]
6023; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6024; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, (%rsi)
6025; AVX512-FCP-NEXT:    vmovdqa %ymm10, (%rdx)
6026; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, (%rcx)
6027; AVX512-FCP-NEXT:    vmovdqa64 %ymm20, (%r8)
6028; AVX512-FCP-NEXT:    vmovdqa64 %ymm21, (%r9)
6029; AVX512-FCP-NEXT:    vmovdqa %ymm6, (%r10)
6030; AVX512-FCP-NEXT:    vmovdqa %ymm0, (%rax)
6031; AVX512-FCP-NEXT:    vzeroupper
6032; AVX512-FCP-NEXT:    retq
6033;
6034; AVX512DQ-LABEL: load_i8_stride7_vf32:
6035; AVX512DQ:       # %bb.0:
6036; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
6037; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %r10
6038; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
6039; AVX512DQ-NEXT:    vmovdqa 128(%rdi), %ymm2
6040; AVX512DQ-NEXT:    vmovdqa 160(%rdi), %ymm3
6041; AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm1
6042; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm2 ^ (ymm1 & (ymm3 ^ ymm2))
6043; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm4
6044; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u]
6045; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
6046; AVX512DQ-NEXT:    vpor %xmm4, %xmm1, %xmm1
6047; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
6048; AVX512DQ-NEXT:    vmovdqa 192(%rdi), %xmm4
6049; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11]
6050; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm4, %xmm6
6051; AVX512DQ-NEXT:    vmovdqa64 %xmm5, %xmm20
6052; AVX512DQ-NEXT:    vmovdqa 208(%rdi), %xmm5
6053; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = xmm5[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
6054; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
6055; AVX512DQ-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
6056; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm12 = ymm1[0,1,2,3,4,5,6],ymm6[7]
6057; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm14 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
6058; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm6
6059; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm7
6060; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %ymm1
6061; AVX512DQ-NEXT:    vmovdqa %ymm14, %ymm9
6062; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm7 ^ (ymm9 & (ymm6 ^ ymm7))
6063; AVX512DQ-NEXT:    vextracti128 $1, %ymm9, %xmm10
6064; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[5,12],zero,zero,xmm10[1,8,15,u,u,u,u,u,u]
6065; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[0,7,14],zero,zero,xmm9[3,10],zero,zero,zero,xmm9[u,u,u,u,u,u]
6066; AVX512DQ-NEXT:    vpor %xmm10, %xmm9, %xmm13
6067; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
6068; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %ymm9
6069; AVX512DQ-NEXT:    vmovdqa %ymm11, %ymm15
6070; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm1 ^ (ymm15 & (ymm9 ^ ymm1))
6071; AVX512DQ-NEXT:    vmovdqa 80(%rdi), %xmm10
6072; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm15 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7,8,9],ymm10[10],ymm15[11,12],ymm10[13],ymm15[14,15]
6073; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
6074; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm13 & mem)
6075; AVX512DQ-NEXT:    vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0]
6076; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm12 ^ (ymm16 & (ymm8 ^ ymm12))
6077; AVX512DQ-NEXT:    vmovdqa64 %ymm8, %ymm18
6078; AVX512DQ-NEXT:    vmovdqa %ymm11, %ymm12
6079; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm2 ^ (ymm12 & (ymm3 ^ ymm2))
6080; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,6,13],zero,zero,xmm12[2,9],zero,zero,zero,xmm12[u,u,u,u]
6081; AVX512DQ-NEXT:    vextracti128 $1, %ymm12, %xmm12
6082; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,xmm12[4,11],zero,zero,xmm12[0,7,14,u,u,u,u]
6083; AVX512DQ-NEXT:    vpor %xmm13, %xmm12, %xmm12
6084; AVX512DQ-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
6085; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
6086; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
6087; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3]
6088; AVX512DQ-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
6089; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5,6],ymm8[7]
6090; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm13 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
6091; AVX512DQ-NEXT:    vmovdqa %ymm13, %ymm12
6092; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm7 ^ (ymm12 & (ymm6 ^ ymm7))
6093; AVX512DQ-NEXT:    vextracti128 $1, %ymm12, %xmm15
6094; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u,u,u]
6095; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[1,8,15],zero,zero,xmm12[4,11],zero,zero,xmm12[u,u,u,u,u,u,u]
6096; AVX512DQ-NEXT:    vpor %xmm15, %xmm12, %xmm15
6097; AVX512DQ-NEXT:    vmovdqa %ymm14, %ymm12
6098; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm9 ^ (ymm12 & (ymm1 ^ ymm9))
6099; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm10[2],ymm12[3,4,5],ymm10[6],ymm12[7,8,9],ymm10[10],ymm12[11,12,13],ymm10[14],ymm12[15]
6100; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
6101; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm12 | (ymm15 & ~mem)
6102; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm8 ^ (ymm16 & (ymm12 ^ ymm8))
6103; AVX512DQ-NEXT:    vmovdqa64 %ymm12, %ymm19
6104; AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm8
6105; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm7 ^ (ymm8 & (ymm6 ^ ymm7))
6106; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm15 = xmm8[2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u,u,u]
6107; AVX512DQ-NEXT:    vextracti128 $1, %ymm8, %xmm8
6108; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u,u,u,u,u]
6109; AVX512DQ-NEXT:    vpor %xmm15, %xmm8, %xmm8
6110; AVX512DQ-NEXT:    vmovdqa %ymm13, %ymm15
6111; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm9 ^ (ymm15 & (ymm1 ^ ymm9))
6112; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm15 = ymm15[0,1,2],ymm10[3],ymm15[4,5],ymm10[6],ymm15[7,8,9,10],ymm10[11],ymm15[12,13],ymm10[14],ymm15[15]
6113; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6114; AVX512DQ-NEXT:    vpmovsxdq {{.*#+}} ymm17 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
6115; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm8 & ymm17)
6116; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm3 ^ (ymm14 & (ymm2 ^ ymm3))
6117; AVX512DQ-NEXT:    vextracti128 $1, %ymm14, %xmm8
6118; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[1,8,15,u,u,u,u]
6119; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u]
6120; AVX512DQ-NEXT:    vpor %xmm8, %xmm14, %xmm8
6121; AVX512DQ-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
6122; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm12
6123; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm5, %xmm14
6124; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
6125; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3]
6126; AVX512DQ-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
6127; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm12[7]
6128; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm8 = ymm15[0],ymm8[1,2,3,4,5,6,7],ymm15[8],ymm8[9,10,11,12,13,14,15]
6129; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7]
6130; AVX512DQ-NEXT:    vmovdqa64 %ymm8, %ymm20
6131; AVX512DQ-NEXT:    vmovdqa %ymm13, %ymm8
6132; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm3 ^ (ymm8 & (ymm2 ^ ymm3))
6133; AVX512DQ-NEXT:    vextracti128 $1, %ymm8, %xmm12
6134; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u]
6135; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,1,8,15],zero,zero,xmm8[4,11],zero,zero,xmm8[u,u,u,u,u]
6136; AVX512DQ-NEXT:    vpor %xmm12, %xmm8, %xmm8
6137; AVX512DQ-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
6138; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,12]
6139; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
6140; AVX512DQ-NEXT:    vpor %xmm12, %xmm14, %xmm12
6141; AVX512DQ-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
6142; AVX512DQ-NEXT:    vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
6143; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm16 & (ymm12 ^ ymm8))
6144; AVX512DQ-NEXT:    vmovdqa %ymm11, %ymm8
6145; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm7 ^ (ymm8 & (ymm6 ^ ymm7))
6146; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm14 = xmm8[3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u,u,u]
6147; AVX512DQ-NEXT:    vextracti128 $1, %ymm8, %xmm8
6148; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u,u,u]
6149; AVX512DQ-NEXT:    vpor %xmm14, %xmm8, %xmm8
6150; AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm14
6151; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm9 ^ (ymm14 & (ymm1 ^ ymm9))
6152; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm14 = ymm10[0],ymm14[1,2],ymm10[3],ymm14[4,5,6],ymm10[7,8],ymm14[9,10],ymm10[11],ymm14[12,13,14],ymm10[15]
6153; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6154; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm8 & ymm17)
6155; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm8 = ymm14[0],ymm12[1,2,3,4,5,6,7],ymm14[8],ymm12[9,10,11,12,13,14,15]
6156; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
6157; AVX512DQ-NEXT:    vmovdqa64 %ymm8, %ymm21
6158; AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm8
6159; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm3 ^ (ymm8 & (ymm2 ^ ymm3))
6160; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm12 = xmm8[u,u,2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u]
6161; AVX512DQ-NEXT:    vextracti128 $1, %ymm8, %xmm8
6162; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u,u,u]
6163; AVX512DQ-NEXT:    vpor %xmm12, %xmm8, %xmm8
6164; AVX512DQ-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
6165; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[6,13]
6166; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
6167; AVX512DQ-NEXT:    vpor %xmm12, %xmm14, %xmm12
6168; AVX512DQ-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
6169; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm16 & (ymm12 ^ ymm8))
6170; AVX512DQ-NEXT:    vmovdqa %ymm13, %ymm8
6171; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm6 ^ (ymm8 & (ymm7 ^ ymm6))
6172; AVX512DQ-NEXT:    vextracti128 $1, %ymm8, %xmm14
6173; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12,u,u,u,u,u,u,u]
6174; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u]
6175; AVX512DQ-NEXT:    vpor %xmm14, %xmm8, %xmm8
6176; AVX512DQ-NEXT:    vmovdqa %ymm11, %ymm14
6177; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm9 ^ (ymm14 & (ymm1 ^ ymm9))
6178; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm14 = ymm10[0],ymm14[1,2,3],ymm10[4],ymm14[5,6],ymm10[7,8],ymm14[9,10,11],ymm10[12],ymm14[13,14],ymm10[15]
6179; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6180; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm8 & ymm17)
6181; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm8 = ymm14[0],ymm12[1,2,3,4,5,6,7],ymm14[8],ymm12[9,10,11,12,13,14,15]
6182; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm8[4,5,6,7]
6183; AVX512DQ-NEXT:    vmovdqa %ymm11, %ymm8
6184; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm3 ^ (ymm8 & (ymm2 ^ ymm3))
6185; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm12 = xmm8[u,u,3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u]
6186; AVX512DQ-NEXT:    vextracti128 $1, %ymm8, %xmm8
6187; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u]
6188; AVX512DQ-NEXT:    vpor %xmm12, %xmm8, %xmm8
6189; AVX512DQ-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
6190; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
6191; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,7,14]
6192; AVX512DQ-NEXT:    vpor %xmm12, %xmm15, %xmm12
6193; AVX512DQ-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
6194; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm16 & (ymm12 ^ ymm8))
6195; AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm8
6196; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm6 ^ (ymm8 & (ymm7 ^ ymm6))
6197; AVX512DQ-NEXT:    vextracti128 $1, %ymm8, %xmm15
6198; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u]
6199; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[5,12],zero,zero,xmm8[1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u]
6200; AVX512DQ-NEXT:    vpor %xmm15, %xmm8, %xmm8
6201; AVX512DQ-NEXT:    vmovdqa %ymm13, %ymm15
6202; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm1 ^ (ymm15 & (ymm9 ^ ymm1))
6203; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm15 = ymm15[0],ymm10[1],ymm15[2,3],ymm10[4],ymm15[5,6,7,8],ymm10[9],ymm15[10,11],ymm10[12],ymm15[13,14,15]
6204; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6205; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm8 & ymm17)
6206; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm8 = ymm15[0],ymm12[1,2,3,4,5,6,7],ymm15[8],ymm12[9,10,11,12,13,14,15]
6207; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7]
6208; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm2 ^ (ymm13 & (ymm3 ^ ymm2))
6209; AVX512DQ-NEXT:    vextracti128 $1, %ymm13, %xmm2
6210; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u]
6211; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm13[u,u,4,11],zero,zero,xmm13[0,7,14],zero,zero,xmm13[u,u,u,u,u]
6212; AVX512DQ-NEXT:    vpor %xmm2, %xmm3, %xmm2
6213; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
6214; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
6215; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,8,15]
6216; AVX512DQ-NEXT:    vpor %xmm3, %xmm4, %xmm3
6217; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
6218; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm16 & (ymm3 ^ ymm2))
6219; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm6 ^ (ymm11 & (ymm7 ^ ymm6))
6220; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm11[6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u,u,u,u]
6221; AVX512DQ-NEXT:    vextracti128 $1, %ymm11, %xmm4
6222; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[4,11],zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u]
6223; AVX512DQ-NEXT:    vpor %xmm2, %xmm4, %xmm2
6224; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm1 ^ (ymm0 & (ymm9 ^ ymm1))
6225; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7,8],ymm10[9],ymm0[10,11,12],ymm10[13],ymm0[14,15]
6226; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6227; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm2 & ymm17)
6228; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm3[1,2,3,4,5,6,7],ymm0[8],ymm3[9,10,11,12,13,14,15]
6229; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6230; AVX512DQ-NEXT:    vmovdqa64 %ymm18, (%rsi)
6231; AVX512DQ-NEXT:    vmovdqa64 %ymm19, (%rdx)
6232; AVX512DQ-NEXT:    vmovdqa64 %ymm20, (%rcx)
6233; AVX512DQ-NEXT:    vmovdqa64 %ymm21, (%r8)
6234; AVX512DQ-NEXT:    vmovdqa %ymm14, (%r9)
6235; AVX512DQ-NEXT:    vmovdqa %ymm8, (%r10)
6236; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rax)
6237; AVX512DQ-NEXT:    vzeroupper
6238; AVX512DQ-NEXT:    retq
6239;
6240; AVX512DQ-FCP-LABEL: load_i8_stride7_vf32:
6241; AVX512DQ-FCP:       # %bb.0:
6242; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
6243; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
6244; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
6245; AVX512DQ-FCP-NEXT:    vmovdqa 128(%rdi), %ymm2
6246; AVX512DQ-FCP-NEXT:    vmovdqa 160(%rdi), %ymm3
6247; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm1
6248; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm2 ^ (ymm1 & (ymm3 ^ ymm2))
6249; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm4
6250; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u]
6251; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
6252; AVX512DQ-FCP-NEXT:    vpor %xmm4, %xmm1, %xmm1
6253; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
6254; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,2,4,6]
6255; AVX512DQ-FCP-NEXT:    vmovdqa 192(%rdi), %ymm12
6256; AVX512DQ-FCP-NEXT:    vpermd %ymm12, %ymm4, %ymm4
6257; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
6258; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5,6],ymm4[7]
6259; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
6260; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm4
6261; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm5
6262; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm1
6263; AVX512DQ-FCP-NEXT:    vmovdqa %ymm13, %ymm7
6264; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm5 ^ (ymm7 & (ymm4 ^ ymm5))
6265; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm8
6266; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[5,12],zero,zero,xmm8[1,8,15,u,u,u,u,u,u]
6267; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[0,7,14],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[u,u,u,u,u,u]
6268; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm10
6269; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
6270; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %ymm7
6271; AVX512DQ-FCP-NEXT:    vmovdqa %ymm9, %ymm11
6272; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm1 ^ (ymm11 & (ymm7 ^ ymm1))
6273; AVX512DQ-FCP-NEXT:    vmovdqa 80(%rdi), %xmm8
6274; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm8[2],ymm11[3,4],ymm8[5],ymm11[6,7,8,9],ymm8[10],ymm11[11,12],ymm8[13],ymm11[14,15]
6275; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
6276; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm10 & mem)
6277; AVX512DQ-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0]
6278; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm6 ^ (ymm16 & (ymm11 ^ ymm6))
6279; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm11, %ymm18
6280; AVX512DQ-FCP-NEXT:    vmovdqa %ymm9, %ymm6
6281; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm2 ^ (ymm6 & (ymm3 ^ ymm2))
6282; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm6[u,u,u,6,13],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[u,u,u,u]
6283; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm6
6284; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,xmm6[4,11],zero,zero,xmm6[0,7,14,u,u,u,u]
6285; AVX512DQ-FCP-NEXT:    vpor %xmm6, %xmm10, %xmm6
6286; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
6287; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,1,3,4,6]
6288; AVX512DQ-FCP-NEXT:    vpermd %ymm12, %ymm10, %ymm10
6289; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
6290; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7]
6291; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
6292; AVX512DQ-FCP-NEXT:    vmovdqa %ymm11, %ymm10
6293; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm5 ^ (ymm10 & (ymm4 ^ ymm5))
6294; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm14
6295; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[6,13],zero,zero,xmm14[2,9,u,u,u,u,u,u,u]
6296; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[1,8,15],zero,zero,xmm10[4,11],zero,zero,xmm10[u,u,u,u,u,u,u]
6297; AVX512DQ-FCP-NEXT:    vpor %xmm14, %xmm10, %xmm14
6298; AVX512DQ-FCP-NEXT:    vmovdqa %ymm13, %ymm10
6299; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm7 ^ (ymm10 & (ymm1 ^ ymm7))
6300; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm8[2],ymm10[3,4,5],ymm8[6],ymm10[7,8,9],ymm8[10],ymm10[11,12,13],ymm8[14],ymm10[15]
6301; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
6302; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm10 | (ymm14 & ~mem)
6303; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm6 ^ (ymm16 & (ymm10 ^ ymm6))
6304; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm6
6305; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm5 ^ (ymm6 & (ymm4 ^ ymm5))
6306; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm6[2,9],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[u,u,u,u,u,u,u]
6307; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm6
6308; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10,u,u,u,u,u,u,u]
6309; AVX512DQ-FCP-NEXT:    vpor %xmm6, %xmm14, %xmm6
6310; AVX512DQ-FCP-NEXT:    vmovdqa %ymm11, %ymm14
6311; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm7 ^ (ymm14 & (ymm1 ^ ymm7))
6312; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm14 = ymm14[0,1,2],ymm8[3],ymm14[4,5],ymm8[6],ymm14[7,8,9,10],ymm8[11],ymm14[12,13],ymm8[14],ymm14[15]
6313; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6314; AVX512DQ-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm17 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
6315; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm6 & ymm17)
6316; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm3 ^ (ymm13 & (ymm2 ^ ymm3))
6317; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm6
6318; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u]
6319; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,0,7,14],zero,zero,xmm13[3,10],zero,zero,zero,xmm13[u,u,u,u]
6320; AVX512DQ-FCP-NEXT:    vpor %xmm6, %xmm13, %xmm6
6321; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
6322; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [0,0,0,0,1,3,5,6]
6323; AVX512DQ-FCP-NEXT:    vpermd %ymm12, %ymm13, %ymm12
6324; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
6325; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm12[7]
6326; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm14[0],ymm6[1,2,3,4,5,6,7],ymm14[8],ymm6[9,10,11,12,13,14,15]
6327; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm6[4,5,6,7]
6328; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm6, %ymm19
6329; AVX512DQ-FCP-NEXT:    vmovdqa %ymm11, %ymm6
6330; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm3 ^ (ymm6 & (ymm2 ^ ymm3))
6331; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm12
6332; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u]
6333; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u]
6334; AVX512DQ-FCP-NEXT:    vpor %xmm6, %xmm12, %xmm6
6335; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
6336; AVX512DQ-FCP-NEXT:    vmovdqa 208(%rdi), %xmm14
6337; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm14[5,12]
6338; AVX512DQ-FCP-NEXT:    vmovdqa 192(%rdi), %xmm12
6339; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
6340; AVX512DQ-FCP-NEXT:    vpor %xmm13, %xmm15, %xmm13
6341; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
6342; AVX512DQ-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
6343; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm16 & (ymm13 ^ ymm6))
6344; AVX512DQ-FCP-NEXT:    vmovdqa %ymm9, %ymm6
6345; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm5 ^ (ymm6 & (ymm4 ^ ymm5))
6346; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm6[3,10],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[u,u,u,u,u,u,u]
6347; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm6
6348; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,11,u,u,u,u,u,u,u]
6349; AVX512DQ-FCP-NEXT:    vpor %xmm6, %xmm15, %xmm6
6350; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm15
6351; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm7 ^ (ymm15 & (ymm1 ^ ymm7))
6352; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm15 = ymm8[0],ymm15[1,2],ymm8[3],ymm15[4,5,6],ymm8[7,8],ymm15[9,10],ymm8[11],ymm15[12,13,14],ymm8[15]
6353; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6354; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm6 & ymm17)
6355; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm15[0],ymm13[1,2,3,4,5,6,7],ymm15[8],ymm13[9,10,11,12,13,14,15]
6356; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm6[4,5,6,7]
6357; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm6, %ymm20
6358; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm6
6359; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm3 ^ (ymm6 & (ymm2 ^ ymm3))
6360; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm6[u,u,2,9],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[u,u,u,u,u]
6361; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm6
6362; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10,u,u,u,u,u]
6363; AVX512DQ-FCP-NEXT:    vpor %xmm6, %xmm13, %xmm6
6364; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
6365; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm14[6,13]
6366; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
6367; AVX512DQ-FCP-NEXT:    vpor %xmm13, %xmm15, %xmm13
6368; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
6369; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm16 & (ymm13 ^ ymm6))
6370; AVX512DQ-FCP-NEXT:    vmovdqa %ymm11, %ymm6
6371; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm4 ^ (ymm6 & (ymm5 ^ ymm4))
6372; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm15
6373; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u,u,u,u,u,u]
6374; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u,u,u]
6375; AVX512DQ-FCP-NEXT:    vpor %xmm6, %xmm15, %xmm6
6376; AVX512DQ-FCP-NEXT:    vmovdqa %ymm9, %ymm15
6377; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm7 ^ (ymm15 & (ymm1 ^ ymm7))
6378; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm15 = ymm8[0],ymm15[1,2,3],ymm8[4],ymm15[5,6],ymm8[7,8],ymm15[9,10,11],ymm8[12],ymm15[13,14],ymm8[15]
6379; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6380; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm6 & ymm17)
6381; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm15[0],ymm13[1,2,3,4,5,6,7],ymm15[8],ymm13[9,10,11,12,13,14,15]
6382; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm6[4,5,6,7]
6383; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm6, %ymm21
6384; AVX512DQ-FCP-NEXT:    vmovdqa %ymm9, %ymm6
6385; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm3 ^ (ymm6 & (ymm2 ^ ymm3))
6386; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm6[u,u,3,10],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[u,u,u,u,u]
6387; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm6
6388; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,11,u,u,u,u,u]
6389; AVX512DQ-FCP-NEXT:    vpor %xmm6, %xmm15, %xmm6
6390; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
6391; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
6392; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm14[0,7,14]
6393; AVX512DQ-FCP-NEXT:    vpor %xmm15, %xmm13, %xmm13
6394; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
6395; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm16 & (ymm13 ^ ymm6))
6396; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm6
6397; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm4 ^ (ymm6 & (ymm5 ^ ymm4))
6398; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm15
6399; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u]
6400; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[5,12],zero,zero,xmm6[1,8,15],zero,zero,xmm6[u,u,u,u,u,u,u]
6401; AVX512DQ-FCP-NEXT:    vpor %xmm6, %xmm15, %xmm6
6402; AVX512DQ-FCP-NEXT:    vmovdqa %ymm11, %ymm15
6403; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm1 ^ (ymm15 & (ymm7 ^ ymm1))
6404; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm15 = ymm15[0],ymm8[1],ymm15[2,3],ymm8[4],ymm15[5,6,7,8],ymm8[9],ymm15[10,11],ymm8[12],ymm15[13,14,15]
6405; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6406; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm6 & ymm17)
6407; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm15[0],ymm13[1,2,3,4,5,6,7],ymm15[8],ymm13[9,10,11,12,13,14,15]
6408; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm6[4,5,6,7]
6409; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm2 ^ (ymm11 & (ymm3 ^ ymm2))
6410; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm2
6411; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u]
6412; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm11[u,u,4,11],zero,zero,xmm11[0,7,14],zero,zero,xmm11[u,u,u,u,u]
6413; AVX512DQ-FCP-NEXT:    vpor %xmm2, %xmm3, %xmm2
6414; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
6415; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm12[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
6416; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm14[1,8,15]
6417; AVX512DQ-FCP-NEXT:    vpor %xmm3, %xmm11, %xmm3
6418; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
6419; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm16 & (ymm3 ^ ymm2))
6420; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm4 ^ (ymm9 & (ymm5 ^ ymm4))
6421; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm9[6,13],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u]
6422; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm4
6423; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[4,11],zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u]
6424; AVX512DQ-FCP-NEXT:    vpor %xmm2, %xmm4, %xmm2
6425; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm1 ^ (ymm0 & (ymm7 ^ ymm1))
6426; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7,8],ymm8[9],ymm0[10,11,12],ymm8[13],ymm0[14,15]
6427; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6428; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm2 & ymm17)
6429; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm3[1,2,3,4,5,6,7],ymm0[8],ymm3[9,10,11,12,13,14,15]
6430; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6431; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, (%rsi)
6432; AVX512DQ-FCP-NEXT:    vmovdqa %ymm10, (%rdx)
6433; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, (%rcx)
6434; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, (%r8)
6435; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm21, (%r9)
6436; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, (%r10)
6437; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, (%rax)
6438; AVX512DQ-FCP-NEXT:    vzeroupper
6439; AVX512DQ-FCP-NEXT:    retq
6440;
6441; AVX512BW-LABEL: load_i8_stride7_vf32:
6442; AVX512BW:       # %bb.0:
6443; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
6444; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
6445; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [16,9,2,19,20,13,6,23,24,0,26,27,28,0,30,31]
6446; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm1
6447; AVX512BW-NEXT:    vpermw %zmm1, %zmm0, %zmm0
6448; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [16,9,2,19,12,5,22,23,24,0,26,27,0,29,30,31]
6449; AVX512BW-NEXT:    vpermw %zmm1, %zmm2, %zmm4
6450; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [8,1,2,19,12,5,22,15,0,9,26,11,0,29,14,0]
6451; AVX512BW-NEXT:    vpermw %zmm1, %zmm2, %zmm5
6452; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [8,1,18,11,4,5,22,15,0,25,10,0,12,29,14,0]
6453; AVX512BW-NEXT:    vpermw %zmm1, %zmm2, %zmm11
6454; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [0,1,18,11,4,21,14,7,8,25,10,0,28,13,0,15]
6455; AVX512BW-NEXT:    vpermw %zmm1, %zmm2, %zmm12
6456; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [0,17,10,3,4,21,14,7,24,9,0,11,28,13,0,31]
6457; AVX512BW-NEXT:    vpermw %zmm1, %zmm2, %zmm10
6458; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [16,17,10,3,20,13,6,23,24,25,0,27,28,0,30,31]
6459; AVX512BW-NEXT:    vpermw %zmm1, %zmm2, %zmm6
6460; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm3
6461; AVX512BW-NEXT:    vmovdqa 32(%rdi), %ymm2
6462; AVX512BW-NEXT:    movw $-28382, %r11w # imm = 0x9122
6463; AVX512BW-NEXT:    kmovd %r11d, %k5
6464; AVX512BW-NEXT:    vpblendmw %ymm2, %ymm3, %ymm1 {%k5}
6465; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm7
6466; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u,u,u]
6467; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u]
6468; AVX512BW-NEXT:    vpor %xmm7, %xmm1, %xmm1
6469; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
6470; AVX512BW-NEXT:    movw $992, %r11w # imm = 0x3E0
6471; AVX512BW-NEXT:    kmovd %r11d, %k1
6472; AVX512BW-NEXT:    vmovdqu16 %ymm6, %ymm1 {%k1}
6473; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm7
6474; AVX512BW-NEXT:    vmovdqa 160(%rdi), %ymm6
6475; AVX512BW-NEXT:    movw $8772, %r11w # imm = 0x2244
6476; AVX512BW-NEXT:    kmovd %r11d, %k1
6477; AVX512BW-NEXT:    vpblendmw %ymm7, %ymm6, %ymm8 {%k1}
6478; AVX512BW-NEXT:    vextracti128 $1, %ymm8, %xmm9
6479; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u],zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u,u,u]
6480; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,5,12],zero,zero,xmm8[1,8,15],zero,zero,xmm8[u,u,u,u]
6481; AVX512BW-NEXT:    vpor %xmm9, %xmm8, %xmm8
6482; AVX512BW-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm13
6483; AVX512BW-NEXT:    vmovdqa 192(%rdi), %xmm8
6484; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm14 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
6485; AVX512BW-NEXT:    vpshufb %xmm14, %xmm8, %xmm15
6486; AVX512BW-NEXT:    vmovdqa 208(%rdi), %xmm9
6487; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm16 = xmm9[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
6488; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm16[0],xmm15[1],xmm16[1],xmm15[2],xmm16[2],xmm15[3],xmm16[3]
6489; AVX512BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
6490; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7]
6491; AVX512BW-NEXT:    movl $-524288, %edi # imm = 0xFFF80000
6492; AVX512BW-NEXT:    kmovd %edi, %k4
6493; AVX512BW-NEXT:    vmovdqu8 %ymm13, %ymm1 {%k4}
6494; AVX512BW-NEXT:    movw $4644, %di # imm = 0x1224
6495; AVX512BW-NEXT:    kmovd %edi, %k2
6496; AVX512BW-NEXT:    vpblendmw %ymm2, %ymm3, %ymm13 {%k2}
6497; AVX512BW-NEXT:    vextracti128 $1, %ymm13, %xmm15
6498; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u,u,u]
6499; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u,u,u]
6500; AVX512BW-NEXT:    vpor %xmm15, %xmm13, %xmm13
6501; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
6502; AVX512BW-NEXT:    movl $511, %edi # imm = 0x1FF
6503; AVX512BW-NEXT:    kmovd %edi, %k3
6504; AVX512BW-NEXT:    vmovdqu8 %ymm13, %ymm10 {%k3}
6505; AVX512BW-NEXT:    movw $9288, %di # imm = 0x2448
6506; AVX512BW-NEXT:    kmovd %edi, %k3
6507; AVX512BW-NEXT:    vpblendmw %ymm7, %ymm6, %ymm13 {%k3}
6508; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm13[u,u,u,6,13],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[u,u,u,u]
6509; AVX512BW-NEXT:    vextracti128 $1, %ymm13, %xmm13
6510; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,u],zero,zero,xmm13[4,11],zero,zero,xmm13[0,7,14,u,u,u,u]
6511; AVX512BW-NEXT:    vpor %xmm15, %xmm13, %xmm13
6512; AVX512BW-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
6513; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm9[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
6514; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm16 = xmm8[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
6515; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3]
6516; AVX512BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
6517; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7]
6518; AVX512BW-NEXT:    vmovdqu8 %ymm13, %ymm10 {%k4}
6519; AVX512BW-NEXT:    vpblendmw %ymm2, %ymm3, %ymm13 {%k1}
6520; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm13[2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u,u,u]
6521; AVX512BW-NEXT:    vextracti128 $1, %ymm13, %xmm13
6522; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u,u,u]
6523; AVX512BW-NEXT:    vpor %xmm15, %xmm13, %xmm13
6524; AVX512BW-NEXT:    movl $261632, %edi # imm = 0x3FE00
6525; AVX512BW-NEXT:    kmovd %edi, %k4
6526; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm13 {%k4} = ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
6527; AVX512BW-NEXT:    vpblendmw %ymm6, %ymm7, %ymm12 {%k5}
6528; AVX512BW-NEXT:    vextracti128 $1, %ymm12, %xmm15
6529; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[5,12],zero,zero,xmm15[1,8,15,u,u,u,u]
6530; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u]
6531; AVX512BW-NEXT:    vpor %xmm15, %xmm12, %xmm12
6532; AVX512BW-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
6533; AVX512BW-NEXT:    vpshufb %xmm14, %xmm9, %xmm14
6534; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
6535; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
6536; AVX512BW-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
6537; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm14[7]
6538; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4,5,6,7],ymm13[8],ymm12[9,10,11,12,13,14,15]
6539; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
6540; AVX512BW-NEXT:    vpblendmw %ymm6, %ymm7, %ymm13 {%k2}
6541; AVX512BW-NEXT:    vextracti128 $1, %ymm13, %xmm14
6542; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[2,9,u,u,u,u,u]
6543; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u]
6544; AVX512BW-NEXT:    vpor %xmm14, %xmm13, %xmm13
6545; AVX512BW-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
6546; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[5,12]
6547; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
6548; AVX512BW-NEXT:    vpor %xmm14, %xmm15, %xmm14
6549; AVX512BW-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
6550; AVX512BW-NEXT:    movl $-134217728, %edi # imm = 0xF8000000
6551; AVX512BW-NEXT:    kmovd %edi, %k5
6552; AVX512BW-NEXT:    vmovdqu8 %ymm14, %ymm13 {%k5}
6553; AVX512BW-NEXT:    vpblendmw %ymm2, %ymm3, %ymm14 {%k3}
6554; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u]
6555; AVX512BW-NEXT:    vextracti128 $1, %ymm14, %xmm14
6556; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u]
6557; AVX512BW-NEXT:    vpor %xmm15, %xmm14, %xmm14
6558; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm14 {%k4} = ymm11[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
6559; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm11 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
6560; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7]
6561; AVX512BW-NEXT:    vpblendmw %ymm6, %ymm7, %ymm13 {%k1}
6562; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm13[u,u,2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u]
6563; AVX512BW-NEXT:    vextracti128 $1, %ymm13, %xmm13
6564; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u]
6565; AVX512BW-NEXT:    vpor %xmm14, %xmm13, %xmm13
6566; AVX512BW-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
6567; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[6,13]
6568; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
6569; AVX512BW-NEXT:    vpor %xmm14, %xmm15, %xmm14
6570; AVX512BW-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
6571; AVX512BW-NEXT:    vmovdqu8 %ymm14, %ymm13 {%k5}
6572; AVX512BW-NEXT:    vpblendmw %ymm3, %ymm2, %ymm14 {%k2}
6573; AVX512BW-NEXT:    vextracti128 $1, %ymm14, %xmm15
6574; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u,u,u,u,u,u]
6575; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u,u,u]
6576; AVX512BW-NEXT:    vpor %xmm15, %xmm14, %xmm14
6577; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm14 {%k4} = ymm5[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
6578; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm5 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
6579; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7]
6580; AVX512BW-NEXT:    vpblendmw %ymm6, %ymm7, %ymm13 {%k3}
6581; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u]
6582; AVX512BW-NEXT:    vextracti128 $1, %ymm13, %xmm13
6583; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u]
6584; AVX512BW-NEXT:    vpor %xmm14, %xmm13, %xmm13
6585; AVX512BW-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
6586; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm8[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
6587; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[0,7,14]
6588; AVX512BW-NEXT:    vpor %xmm14, %xmm15, %xmm14
6589; AVX512BW-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
6590; AVX512BW-NEXT:    vmovdqu8 %ymm14, %ymm13 {%k5}
6591; AVX512BW-NEXT:    vpblendmw %ymm3, %ymm2, %ymm14 {%k1}
6592; AVX512BW-NEXT:    vextracti128 $1, %ymm14, %xmm15
6593; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u]
6594; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u,u,u,u,u,u]
6595; AVX512BW-NEXT:    vpor %xmm15, %xmm14, %xmm14
6596; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm14 {%k4} = ymm4[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
6597; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm4 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
6598; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm4[4,5,6,7]
6599; AVX512BW-NEXT:    vmovdqu16 %ymm7, %ymm6 {%k2}
6600; AVX512BW-NEXT:    vextracti128 $1, %ymm6, %xmm7
6601; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u,u,u,u]
6602; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u]
6603; AVX512BW-NEXT:    vpor %xmm7, %xmm6, %xmm6
6604; AVX512BW-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
6605; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
6606; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm8 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[1,8,15]
6607; AVX512BW-NEXT:    vpor %xmm7, %xmm8, %xmm7
6608; AVX512BW-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
6609; AVX512BW-NEXT:    vmovdqu8 %ymm7, %ymm6 {%k5}
6610; AVX512BW-NEXT:    vmovdqu16 %ymm3, %ymm2 {%k3}
6611; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u]
6612; AVX512BW-NEXT:    vextracti128 $1, %ymm2, %xmm2
6613; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u]
6614; AVX512BW-NEXT:    vpor %xmm3, %xmm2, %xmm2
6615; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm2 {%k4} = ymm0[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
6616; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0],ymm6[1,2,3,4,5,6,7],ymm2[8],ymm6[9,10,11,12,13,14,15]
6617; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
6618; AVX512BW-NEXT:    vmovdqa %ymm1, (%rsi)
6619; AVX512BW-NEXT:    vmovdqa %ymm10, (%rdx)
6620; AVX512BW-NEXT:    vmovdqa %ymm12, (%rcx)
6621; AVX512BW-NEXT:    vmovdqa %ymm11, (%r8)
6622; AVX512BW-NEXT:    vmovdqa %ymm5, (%r9)
6623; AVX512BW-NEXT:    vmovdqa %ymm4, (%r10)
6624; AVX512BW-NEXT:    vmovdqa %ymm0, (%rax)
6625; AVX512BW-NEXT:    vzeroupper
6626; AVX512BW-NEXT:    retq
6627;
6628; AVX512BW-FCP-LABEL: load_i8_stride7_vf32:
6629; AVX512BW-FCP:       # %bb.0:
6630; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
6631; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
6632; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31]
6633; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
6634; AVX512BW-FCP-NEXT:    vpermw %zmm1, %zmm0, %zmm0
6635; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31]
6636; AVX512BW-FCP-NEXT:    vpermw %zmm1, %zmm2, %zmm4
6637; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7]
6638; AVX512BW-FCP-NEXT:    vpermw %zmm1, %zmm2, %zmm5
6639; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7]
6640; AVX512BW-FCP-NEXT:    vpermw %zmm1, %zmm2, %zmm9
6641; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15]
6642; AVX512BW-FCP-NEXT:    vpermw %zmm1, %zmm2, %zmm10
6643; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31]
6644; AVX512BW-FCP-NEXT:    vpermw %zmm1, %zmm2, %zmm8
6645; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31]
6646; AVX512BW-FCP-NEXT:    vpermw %zmm1, %zmm2, %zmm6
6647; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %ymm3
6648; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
6649; AVX512BW-FCP-NEXT:    movw $-28382, %r11w # imm = 0x9122
6650; AVX512BW-FCP-NEXT:    kmovd %r11d, %k5
6651; AVX512BW-FCP-NEXT:    vpblendmw %ymm2, %ymm3, %ymm1 {%k5}
6652; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm7
6653; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u,u,u]
6654; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u]
6655; AVX512BW-FCP-NEXT:    vpor %xmm7, %xmm1, %xmm1
6656; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
6657; AVX512BW-FCP-NEXT:    movw $992, %r11w # imm = 0x3E0
6658; AVX512BW-FCP-NEXT:    kmovd %r11d, %k1
6659; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm6, %ymm1 {%k1}
6660; AVX512BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm7
6661; AVX512BW-FCP-NEXT:    vmovdqa 160(%rdi), %ymm6
6662; AVX512BW-FCP-NEXT:    movw $8772, %r11w # imm = 0x2244
6663; AVX512BW-FCP-NEXT:    kmovd %r11d, %k1
6664; AVX512BW-FCP-NEXT:    vpblendmw %ymm7, %ymm6, %ymm11 {%k1}
6665; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm12
6666; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u]
6667; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,5,12],zero,zero,xmm11[1,8,15],zero,zero,xmm11[u,u,u,u]
6668; AVX512BW-FCP-NEXT:    vpor %xmm12, %xmm11, %xmm11
6669; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
6670; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,1,2,4,6]
6671; AVX512BW-FCP-NEXT:    vmovdqa 192(%rdi), %ymm13
6672; AVX512BW-FCP-NEXT:    vpermd %ymm13, %ymm12, %ymm12
6673; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
6674; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7]
6675; AVX512BW-FCP-NEXT:    movl $-524288, %r11d # imm = 0xFFF80000
6676; AVX512BW-FCP-NEXT:    kmovd %r11d, %k4
6677; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm11, %ymm1 {%k4}
6678; AVX512BW-FCP-NEXT:    movw $4644, %r11w # imm = 0x1224
6679; AVX512BW-FCP-NEXT:    kmovd %r11d, %k2
6680; AVX512BW-FCP-NEXT:    vpblendmw %ymm2, %ymm3, %ymm11 {%k2}
6681; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm12
6682; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u,u,u]
6683; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[1,8,15],zero,zero,xmm11[4,11],zero,zero,xmm11[u,u,u,u,u,u,u]
6684; AVX512BW-FCP-NEXT:    vpor %xmm12, %xmm11, %xmm11
6685; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
6686; AVX512BW-FCP-NEXT:    movl $511, %r11d # imm = 0x1FF
6687; AVX512BW-FCP-NEXT:    kmovd %r11d, %k3
6688; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm11, %ymm8 {%k3}
6689; AVX512BW-FCP-NEXT:    movw $9288, %r11w # imm = 0x2448
6690; AVX512BW-FCP-NEXT:    kmovd %r11d, %k3
6691; AVX512BW-FCP-NEXT:    vpblendmw %ymm7, %ymm6, %ymm11 {%k3}
6692; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u]
6693; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm11
6694; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u],zero,zero,xmm11[4,11],zero,zero,xmm11[0,7,14,u,u,u,u]
6695; AVX512BW-FCP-NEXT:    vpor %xmm12, %xmm11, %xmm11
6696; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
6697; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,1,3,4,6]
6698; AVX512BW-FCP-NEXT:    vpermd %ymm13, %ymm12, %ymm12
6699; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
6700; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7]
6701; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm11, %ymm8 {%k4}
6702; AVX512BW-FCP-NEXT:    vpblendmw %ymm2, %ymm3, %ymm11 {%k1}
6703; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm11[2,9],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[u,u,u,u,u,u,u]
6704; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm11
6705; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[0,7,14],zero,zero,xmm11[3,10,u,u,u,u,u,u,u]
6706; AVX512BW-FCP-NEXT:    vpor %xmm12, %xmm11, %xmm11
6707; AVX512BW-FCP-NEXT:    movl $261632, %r11d # imm = 0x3FE00
6708; AVX512BW-FCP-NEXT:    kmovd %r11d, %k4
6709; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm11 {%k4} = ymm10[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
6710; AVX512BW-FCP-NEXT:    vpblendmw %ymm6, %ymm7, %ymm10 {%k5}
6711; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm12
6712; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[1,8,15,u,u,u,u]
6713; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u,0,7,14],zero,zero,xmm10[3,10],zero,zero,zero,xmm10[u,u,u,u]
6714; AVX512BW-FCP-NEXT:    vpor %xmm12, %xmm10, %xmm10
6715; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
6716; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,1,3,5,6]
6717; AVX512BW-FCP-NEXT:    vpermd %ymm13, %ymm12, %ymm12
6718; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
6719; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm12[7]
6720; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1,2,3,4,5,6,7],ymm11[8],ymm10[9,10,11,12,13,14,15]
6721; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
6722; AVX512BW-FCP-NEXT:    vpblendmw %ymm6, %ymm7, %ymm11 {%k2}
6723; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm12
6724; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u]
6725; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,1,8,15],zero,zero,xmm11[4,11],zero,zero,xmm11[u,u,u,u,u]
6726; AVX512BW-FCP-NEXT:    vpor %xmm12, %xmm11, %xmm11
6727; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm13
6728; AVX512BW-FCP-NEXT:    vmovdqa 208(%rdi), %xmm11
6729; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[5,12]
6730; AVX512BW-FCP-NEXT:    vmovdqa 192(%rdi), %xmm12
6731; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
6732; AVX512BW-FCP-NEXT:    vpor %xmm14, %xmm15, %xmm14
6733; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
6734; AVX512BW-FCP-NEXT:    movl $-134217728, %edi # imm = 0xF8000000
6735; AVX512BW-FCP-NEXT:    kmovd %edi, %k5
6736; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm14, %ymm13 {%k5}
6737; AVX512BW-FCP-NEXT:    vpblendmw %ymm2, %ymm3, %ymm14 {%k3}
6738; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u]
6739; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm14
6740; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u]
6741; AVX512BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
6742; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
6743; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
6744; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7]
6745; AVX512BW-FCP-NEXT:    vpblendmw %ymm6, %ymm7, %ymm13 {%k1}
6746; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm13[u,u,2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u]
6747; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm13
6748; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u]
6749; AVX512BW-FCP-NEXT:    vpor %xmm14, %xmm13, %xmm13
6750; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
6751; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[6,13]
6752; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
6753; AVX512BW-FCP-NEXT:    vpor %xmm14, %xmm15, %xmm14
6754; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
6755; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm14, %ymm13 {%k5}
6756; AVX512BW-FCP-NEXT:    vpblendmw %ymm3, %ymm2, %ymm14 {%k2}
6757; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm15
6758; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u,u,u,u,u,u]
6759; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u,u,u]
6760; AVX512BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
6761; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 {%k4} = ymm5[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
6762; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
6763; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7]
6764; AVX512BW-FCP-NEXT:    vpblendmw %ymm6, %ymm7, %ymm13 {%k3}
6765; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u]
6766; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm13
6767; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u]
6768; AVX512BW-FCP-NEXT:    vpor %xmm14, %xmm13, %xmm13
6769; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
6770; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm12[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
6771; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm11[0,7,14]
6772; AVX512BW-FCP-NEXT:    vpor %xmm14, %xmm15, %xmm14
6773; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
6774; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm14, %ymm13 {%k5}
6775; AVX512BW-FCP-NEXT:    vpblendmw %ymm3, %ymm2, %ymm14 {%k1}
6776; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm15
6777; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u]
6778; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u,u,u,u,u,u]
6779; AVX512BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
6780; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 {%k4} = ymm4[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
6781; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
6782; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm4[4,5,6,7]
6783; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm7, %ymm6 {%k2}
6784; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm7
6785; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u,u,u,u]
6786; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u]
6787; AVX512BW-FCP-NEXT:    vpor %xmm7, %xmm6, %xmm6
6788; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
6789; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm12[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
6790; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm11[1,8,15]
6791; AVX512BW-FCP-NEXT:    vpor %xmm7, %xmm11, %xmm7
6792; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
6793; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm7, %ymm6 {%k5}
6794; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm3, %ymm2 {%k3}
6795; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u]
6796; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm2
6797; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u]
6798; AVX512BW-FCP-NEXT:    vpor %xmm3, %xmm2, %xmm2
6799; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm2 {%k4} = ymm0[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
6800; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0],ymm6[1,2,3,4,5,6,7],ymm2[8],ymm6[9,10,11,12,13,14,15]
6801; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
6802; AVX512BW-FCP-NEXT:    vmovdqa %ymm1, (%rsi)
6803; AVX512BW-FCP-NEXT:    vmovdqa %ymm8, (%rdx)
6804; AVX512BW-FCP-NEXT:    vmovdqa %ymm10, (%rcx)
6805; AVX512BW-FCP-NEXT:    vmovdqa %ymm9, (%r8)
6806; AVX512BW-FCP-NEXT:    vmovdqa %ymm5, (%r9)
6807; AVX512BW-FCP-NEXT:    vmovdqa %ymm4, (%r10)
6808; AVX512BW-FCP-NEXT:    vmovdqa %ymm0, (%rax)
6809; AVX512BW-FCP-NEXT:    vzeroupper
6810; AVX512BW-FCP-NEXT:    retq
6811;
6812; AVX512DQ-BW-LABEL: load_i8_stride7_vf32:
6813; AVX512DQ-BW:       # %bb.0:
6814; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
6815; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
6816; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [16,9,2,19,20,13,6,23,24,0,26,27,28,0,30,31]
6817; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm1
6818; AVX512DQ-BW-NEXT:    vpermw %zmm1, %zmm0, %zmm0
6819; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [16,9,2,19,12,5,22,23,24,0,26,27,0,29,30,31]
6820; AVX512DQ-BW-NEXT:    vpermw %zmm1, %zmm2, %zmm4
6821; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [8,1,2,19,12,5,22,15,0,9,26,11,0,29,14,0]
6822; AVX512DQ-BW-NEXT:    vpermw %zmm1, %zmm2, %zmm5
6823; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [8,1,18,11,4,5,22,15,0,25,10,0,12,29,14,0]
6824; AVX512DQ-BW-NEXT:    vpermw %zmm1, %zmm2, %zmm11
6825; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [0,1,18,11,4,21,14,7,8,25,10,0,28,13,0,15]
6826; AVX512DQ-BW-NEXT:    vpermw %zmm1, %zmm2, %zmm12
6827; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [0,17,10,3,4,21,14,7,24,9,0,11,28,13,0,31]
6828; AVX512DQ-BW-NEXT:    vpermw %zmm1, %zmm2, %zmm10
6829; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [16,17,10,3,20,13,6,23,24,25,0,27,28,0,30,31]
6830; AVX512DQ-BW-NEXT:    vpermw %zmm1, %zmm2, %zmm6
6831; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %ymm3
6832; AVX512DQ-BW-NEXT:    vmovdqa 32(%rdi), %ymm2
6833; AVX512DQ-BW-NEXT:    movw $-28382, %r11w # imm = 0x9122
6834; AVX512DQ-BW-NEXT:    kmovd %r11d, %k5
6835; AVX512DQ-BW-NEXT:    vpblendmw %ymm2, %ymm3, %ymm1 {%k5}
6836; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm1, %xmm7
6837; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u,u,u]
6838; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u]
6839; AVX512DQ-BW-NEXT:    vpor %xmm7, %xmm1, %xmm1
6840; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
6841; AVX512DQ-BW-NEXT:    movw $992, %r11w # imm = 0x3E0
6842; AVX512DQ-BW-NEXT:    kmovd %r11d, %k1
6843; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm6, %ymm1 {%k1}
6844; AVX512DQ-BW-NEXT:    vmovdqa 128(%rdi), %ymm7
6845; AVX512DQ-BW-NEXT:    vmovdqa 160(%rdi), %ymm6
6846; AVX512DQ-BW-NEXT:    movw $8772, %r11w # imm = 0x2244
6847; AVX512DQ-BW-NEXT:    kmovd %r11d, %k1
6848; AVX512DQ-BW-NEXT:    vpblendmw %ymm7, %ymm6, %ymm8 {%k1}
6849; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm8, %xmm9
6850; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u],zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u,u,u]
6851; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,5,12],zero,zero,xmm8[1,8,15],zero,zero,xmm8[u,u,u,u]
6852; AVX512DQ-BW-NEXT:    vpor %xmm9, %xmm8, %xmm8
6853; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm13
6854; AVX512DQ-BW-NEXT:    vmovdqa 192(%rdi), %xmm8
6855; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm14 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
6856; AVX512DQ-BW-NEXT:    vpshufb %xmm14, %xmm8, %xmm15
6857; AVX512DQ-BW-NEXT:    vmovdqa 208(%rdi), %xmm9
6858; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm16 = xmm9[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
6859; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm16[0],xmm15[1],xmm16[1],xmm15[2],xmm16[2],xmm15[3],xmm16[3]
6860; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
6861; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7]
6862; AVX512DQ-BW-NEXT:    movl $-524288, %edi # imm = 0xFFF80000
6863; AVX512DQ-BW-NEXT:    kmovd %edi, %k4
6864; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm13, %ymm1 {%k4}
6865; AVX512DQ-BW-NEXT:    movw $4644, %di # imm = 0x1224
6866; AVX512DQ-BW-NEXT:    kmovd %edi, %k2
6867; AVX512DQ-BW-NEXT:    vpblendmw %ymm2, %ymm3, %ymm13 {%k2}
6868; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm13, %xmm15
6869; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u,u,u]
6870; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u,u,u]
6871; AVX512DQ-BW-NEXT:    vpor %xmm15, %xmm13, %xmm13
6872; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
6873; AVX512DQ-BW-NEXT:    movl $511, %edi # imm = 0x1FF
6874; AVX512DQ-BW-NEXT:    kmovd %edi, %k3
6875; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm13, %ymm10 {%k3}
6876; AVX512DQ-BW-NEXT:    movw $9288, %di # imm = 0x2448
6877; AVX512DQ-BW-NEXT:    kmovd %edi, %k3
6878; AVX512DQ-BW-NEXT:    vpblendmw %ymm7, %ymm6, %ymm13 {%k3}
6879; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm13[u,u,u,6,13],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[u,u,u,u]
6880; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm13, %xmm13
6881; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,u],zero,zero,xmm13[4,11],zero,zero,xmm13[0,7,14,u,u,u,u]
6882; AVX512DQ-BW-NEXT:    vpor %xmm15, %xmm13, %xmm13
6883; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
6884; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm9[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
6885; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm16 = xmm8[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
6886; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3]
6887; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
6888; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7]
6889; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm13, %ymm10 {%k4}
6890; AVX512DQ-BW-NEXT:    vpblendmw %ymm2, %ymm3, %ymm13 {%k1}
6891; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm13[2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u,u,u]
6892; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm13, %xmm13
6893; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u,u,u]
6894; AVX512DQ-BW-NEXT:    vpor %xmm15, %xmm13, %xmm13
6895; AVX512DQ-BW-NEXT:    movl $261632, %edi # imm = 0x3FE00
6896; AVX512DQ-BW-NEXT:    kmovd %edi, %k4
6897; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm13 {%k4} = ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
6898; AVX512DQ-BW-NEXT:    vpblendmw %ymm6, %ymm7, %ymm12 {%k5}
6899; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm12, %xmm15
6900; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[5,12],zero,zero,xmm15[1,8,15,u,u,u,u]
6901; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u]
6902; AVX512DQ-BW-NEXT:    vpor %xmm15, %xmm12, %xmm12
6903; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
6904; AVX512DQ-BW-NEXT:    vpshufb %xmm14, %xmm9, %xmm14
6905; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
6906; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
6907; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
6908; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm14[7]
6909; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4,5,6,7],ymm13[8],ymm12[9,10,11,12,13,14,15]
6910; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
6911; AVX512DQ-BW-NEXT:    vpblendmw %ymm6, %ymm7, %ymm13 {%k2}
6912; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm13, %xmm14
6913; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[2,9,u,u,u,u,u]
6914; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u]
6915; AVX512DQ-BW-NEXT:    vpor %xmm14, %xmm13, %xmm13
6916; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
6917; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[5,12]
6918; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
6919; AVX512DQ-BW-NEXT:    vpor %xmm14, %xmm15, %xmm14
6920; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
6921; AVX512DQ-BW-NEXT:    movl $-134217728, %edi # imm = 0xF8000000
6922; AVX512DQ-BW-NEXT:    kmovd %edi, %k5
6923; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm14, %ymm13 {%k5}
6924; AVX512DQ-BW-NEXT:    vpblendmw %ymm2, %ymm3, %ymm14 {%k3}
6925; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u]
6926; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm14, %xmm14
6927; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u]
6928; AVX512DQ-BW-NEXT:    vpor %xmm15, %xmm14, %xmm14
6929; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm14 {%k4} = ymm11[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
6930; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm11 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
6931; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7]
6932; AVX512DQ-BW-NEXT:    vpblendmw %ymm6, %ymm7, %ymm13 {%k1}
6933; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm13[u,u,2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u]
6934; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm13, %xmm13
6935; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u]
6936; AVX512DQ-BW-NEXT:    vpor %xmm14, %xmm13, %xmm13
6937; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
6938; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[6,13]
6939; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
6940; AVX512DQ-BW-NEXT:    vpor %xmm14, %xmm15, %xmm14
6941; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
6942; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm14, %ymm13 {%k5}
6943; AVX512DQ-BW-NEXT:    vpblendmw %ymm3, %ymm2, %ymm14 {%k2}
6944; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm14, %xmm15
6945; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u,u,u,u,u,u]
6946; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u,u,u]
6947; AVX512DQ-BW-NEXT:    vpor %xmm15, %xmm14, %xmm14
6948; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm14 {%k4} = ymm5[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
6949; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm5 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
6950; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7]
6951; AVX512DQ-BW-NEXT:    vpblendmw %ymm6, %ymm7, %ymm13 {%k3}
6952; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u]
6953; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm13, %xmm13
6954; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u]
6955; AVX512DQ-BW-NEXT:    vpor %xmm14, %xmm13, %xmm13
6956; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
6957; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm8[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
6958; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[0,7,14]
6959; AVX512DQ-BW-NEXT:    vpor %xmm14, %xmm15, %xmm14
6960; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
6961; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm14, %ymm13 {%k5}
6962; AVX512DQ-BW-NEXT:    vpblendmw %ymm3, %ymm2, %ymm14 {%k1}
6963; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm14, %xmm15
6964; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u]
6965; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u,u,u,u,u,u]
6966; AVX512DQ-BW-NEXT:    vpor %xmm15, %xmm14, %xmm14
6967; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm14 {%k4} = ymm4[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
6968; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm4 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
6969; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm4[4,5,6,7]
6970; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm7, %ymm6 {%k2}
6971; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm6, %xmm7
6972; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u,u,u,u]
6973; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u]
6974; AVX512DQ-BW-NEXT:    vpor %xmm7, %xmm6, %xmm6
6975; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
6976; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
6977; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm8 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[1,8,15]
6978; AVX512DQ-BW-NEXT:    vpor %xmm7, %xmm8, %xmm7
6979; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
6980; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm7, %ymm6 {%k5}
6981; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm3, %ymm2 {%k3}
6982; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u]
6983; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm2, %xmm2
6984; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u]
6985; AVX512DQ-BW-NEXT:    vpor %xmm3, %xmm2, %xmm2
6986; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm2 {%k4} = ymm0[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
6987; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0],ymm6[1,2,3,4,5,6,7],ymm2[8],ymm6[9,10,11,12,13,14,15]
6988; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
6989; AVX512DQ-BW-NEXT:    vmovdqa %ymm1, (%rsi)
6990; AVX512DQ-BW-NEXT:    vmovdqa %ymm10, (%rdx)
6991; AVX512DQ-BW-NEXT:    vmovdqa %ymm12, (%rcx)
6992; AVX512DQ-BW-NEXT:    vmovdqa %ymm11, (%r8)
6993; AVX512DQ-BW-NEXT:    vmovdqa %ymm5, (%r9)
6994; AVX512DQ-BW-NEXT:    vmovdqa %ymm4, (%r10)
6995; AVX512DQ-BW-NEXT:    vmovdqa %ymm0, (%rax)
6996; AVX512DQ-BW-NEXT:    vzeroupper
6997; AVX512DQ-BW-NEXT:    retq
6998;
6999; AVX512DQ-BW-FCP-LABEL: load_i8_stride7_vf32:
7000; AVX512DQ-BW-FCP:       # %bb.0:
7001; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
7002; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
7003; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31]
7004; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm1
7005; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm1, %zmm0, %zmm0
7006; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31]
7007; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm1, %zmm2, %zmm4
7008; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7]
7009; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm1, %zmm2, %zmm5
7010; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7]
7011; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm1, %zmm2, %zmm9
7012; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15]
7013; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm1, %zmm2, %zmm10
7014; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31]
7015; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm1, %zmm2, %zmm8
7016; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31]
7017; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm1, %zmm2, %zmm6
7018; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %ymm3
7019; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
7020; AVX512DQ-BW-FCP-NEXT:    movw $-28382, %r11w # imm = 0x9122
7021; AVX512DQ-BW-FCP-NEXT:    kmovd %r11d, %k5
7022; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm2, %ymm3, %ymm1 {%k5}
7023; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm7
7024; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u,u,u]
7025; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u]
7026; AVX512DQ-BW-FCP-NEXT:    vpor %xmm7, %xmm1, %xmm1
7027; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
7028; AVX512DQ-BW-FCP-NEXT:    movw $992, %r11w # imm = 0x3E0
7029; AVX512DQ-BW-FCP-NEXT:    kmovd %r11d, %k1
7030; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm6, %ymm1 {%k1}
7031; AVX512DQ-BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm7
7032; AVX512DQ-BW-FCP-NEXT:    vmovdqa 160(%rdi), %ymm6
7033; AVX512DQ-BW-FCP-NEXT:    movw $8772, %r11w # imm = 0x2244
7034; AVX512DQ-BW-FCP-NEXT:    kmovd %r11d, %k1
7035; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm7, %ymm6, %ymm11 {%k1}
7036; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm12
7037; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u]
7038; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,5,12],zero,zero,xmm11[1,8,15],zero,zero,xmm11[u,u,u,u]
7039; AVX512DQ-BW-FCP-NEXT:    vpor %xmm12, %xmm11, %xmm11
7040; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
7041; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,1,2,4,6]
7042; AVX512DQ-BW-FCP-NEXT:    vmovdqa 192(%rdi), %ymm13
7043; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm13, %ymm12, %ymm12
7044; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
7045; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7]
7046; AVX512DQ-BW-FCP-NEXT:    movl $-524288, %r11d # imm = 0xFFF80000
7047; AVX512DQ-BW-FCP-NEXT:    kmovd %r11d, %k4
7048; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm11, %ymm1 {%k4}
7049; AVX512DQ-BW-FCP-NEXT:    movw $4644, %r11w # imm = 0x1224
7050; AVX512DQ-BW-FCP-NEXT:    kmovd %r11d, %k2
7051; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm2, %ymm3, %ymm11 {%k2}
7052; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm12
7053; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u,u,u]
7054; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[1,8,15],zero,zero,xmm11[4,11],zero,zero,xmm11[u,u,u,u,u,u,u]
7055; AVX512DQ-BW-FCP-NEXT:    vpor %xmm12, %xmm11, %xmm11
7056; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
7057; AVX512DQ-BW-FCP-NEXT:    movl $511, %r11d # imm = 0x1FF
7058; AVX512DQ-BW-FCP-NEXT:    kmovd %r11d, %k3
7059; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm11, %ymm8 {%k3}
7060; AVX512DQ-BW-FCP-NEXT:    movw $9288, %r11w # imm = 0x2448
7061; AVX512DQ-BW-FCP-NEXT:    kmovd %r11d, %k3
7062; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm7, %ymm6, %ymm11 {%k3}
7063; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u]
7064; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm11
7065; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u],zero,zero,xmm11[4,11],zero,zero,xmm11[0,7,14,u,u,u,u]
7066; AVX512DQ-BW-FCP-NEXT:    vpor %xmm12, %xmm11, %xmm11
7067; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
7068; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,1,3,4,6]
7069; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm13, %ymm12, %ymm12
7070; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
7071; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7]
7072; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm11, %ymm8 {%k4}
7073; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm2, %ymm3, %ymm11 {%k1}
7074; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm11[2,9],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[u,u,u,u,u,u,u]
7075; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm11
7076; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[0,7,14],zero,zero,xmm11[3,10,u,u,u,u,u,u,u]
7077; AVX512DQ-BW-FCP-NEXT:    vpor %xmm12, %xmm11, %xmm11
7078; AVX512DQ-BW-FCP-NEXT:    movl $261632, %r11d # imm = 0x3FE00
7079; AVX512DQ-BW-FCP-NEXT:    kmovd %r11d, %k4
7080; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm11 {%k4} = ymm10[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
7081; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm6, %ymm7, %ymm10 {%k5}
7082; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm12
7083; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[1,8,15,u,u,u,u]
7084; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u,0,7,14],zero,zero,xmm10[3,10],zero,zero,zero,xmm10[u,u,u,u]
7085; AVX512DQ-BW-FCP-NEXT:    vpor %xmm12, %xmm10, %xmm10
7086; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
7087; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,1,3,5,6]
7088; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm13, %ymm12, %ymm12
7089; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
7090; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm12[7]
7091; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1,2,3,4,5,6,7],ymm11[8],ymm10[9,10,11,12,13,14,15]
7092; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
7093; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm6, %ymm7, %ymm11 {%k2}
7094; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm12
7095; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u]
7096; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,1,8,15],zero,zero,xmm11[4,11],zero,zero,xmm11[u,u,u,u,u]
7097; AVX512DQ-BW-FCP-NEXT:    vpor %xmm12, %xmm11, %xmm11
7098; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm13
7099; AVX512DQ-BW-FCP-NEXT:    vmovdqa 208(%rdi), %xmm11
7100; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[5,12]
7101; AVX512DQ-BW-FCP-NEXT:    vmovdqa 192(%rdi), %xmm12
7102; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
7103; AVX512DQ-BW-FCP-NEXT:    vpor %xmm14, %xmm15, %xmm14
7104; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
7105; AVX512DQ-BW-FCP-NEXT:    movl $-134217728, %edi # imm = 0xF8000000
7106; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k5
7107; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm14, %ymm13 {%k5}
7108; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm2, %ymm3, %ymm14 {%k3}
7109; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u]
7110; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm14
7111; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u]
7112; AVX512DQ-BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
7113; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
7114; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
7115; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7]
7116; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm6, %ymm7, %ymm13 {%k1}
7117; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm13[u,u,2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u]
7118; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm13
7119; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u]
7120; AVX512DQ-BW-FCP-NEXT:    vpor %xmm14, %xmm13, %xmm13
7121; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
7122; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[6,13]
7123; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
7124; AVX512DQ-BW-FCP-NEXT:    vpor %xmm14, %xmm15, %xmm14
7125; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
7126; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm14, %ymm13 {%k5}
7127; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm3, %ymm2, %ymm14 {%k2}
7128; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm15
7129; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u,u,u,u,u,u]
7130; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u,u,u]
7131; AVX512DQ-BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
7132; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 {%k4} = ymm5[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
7133; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
7134; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7]
7135; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm6, %ymm7, %ymm13 {%k3}
7136; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u]
7137; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm13
7138; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u]
7139; AVX512DQ-BW-FCP-NEXT:    vpor %xmm14, %xmm13, %xmm13
7140; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
7141; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm12[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
7142; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm11[0,7,14]
7143; AVX512DQ-BW-FCP-NEXT:    vpor %xmm14, %xmm15, %xmm14
7144; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
7145; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm14, %ymm13 {%k5}
7146; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm3, %ymm2, %ymm14 {%k1}
7147; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm15
7148; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u]
7149; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u,u,u,u,u,u]
7150; AVX512DQ-BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
7151; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 {%k4} = ymm4[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
7152; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
7153; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm4[4,5,6,7]
7154; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm7, %ymm6 {%k2}
7155; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm7
7156; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u,u,u,u]
7157; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u]
7158; AVX512DQ-BW-FCP-NEXT:    vpor %xmm7, %xmm6, %xmm6
7159; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
7160; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm12[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
7161; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm11[1,8,15]
7162; AVX512DQ-BW-FCP-NEXT:    vpor %xmm7, %xmm11, %xmm7
7163; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
7164; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm7, %ymm6 {%k5}
7165; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm3, %ymm2 {%k3}
7166; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u]
7167; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm2
7168; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u]
7169; AVX512DQ-BW-FCP-NEXT:    vpor %xmm3, %xmm2, %xmm2
7170; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm2 {%k4} = ymm0[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
7171; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0],ymm6[1,2,3,4,5,6,7],ymm2[8],ymm6[9,10,11,12,13,14,15]
7172; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
7173; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm1, (%rsi)
7174; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm8, (%rdx)
7175; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm10, (%rcx)
7176; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm9, (%r8)
7177; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm5, (%r9)
7178; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm4, (%r10)
7179; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm0, (%rax)
7180; AVX512DQ-BW-FCP-NEXT:    vzeroupper
7181; AVX512DQ-BW-FCP-NEXT:    retq
7182  %wide.vec = load <224 x i8>, ptr %in.vec, align 64
7183  %strided.vec0 = shufflevector <224 x i8> %wide.vec, <224 x i8> poison, <32 x i32> <i32 0, i32 7, i32 14, i32 21, i32 28, i32 35, i32 42, i32 49, i32 56, i32 63, i32 70, i32 77, i32 84, i32 91, i32 98, i32 105, i32 112, i32 119, i32 126, i32 133, i32 140, i32 147, i32 154, i32 161, i32 168, i32 175, i32 182, i32 189, i32 196, i32 203, i32 210, i32 217>
7184  %strided.vec1 = shufflevector <224 x i8> %wide.vec, <224 x i8> poison, <32 x i32> <i32 1, i32 8, i32 15, i32 22, i32 29, i32 36, i32 43, i32 50, i32 57, i32 64, i32 71, i32 78, i32 85, i32 92, i32 99, i32 106, i32 113, i32 120, i32 127, i32 134, i32 141, i32 148, i32 155, i32 162, i32 169, i32 176, i32 183, i32 190, i32 197, i32 204, i32 211, i32 218>
7185  %strided.vec2 = shufflevector <224 x i8> %wide.vec, <224 x i8> poison, <32 x i32> <i32 2, i32 9, i32 16, i32 23, i32 30, i32 37, i32 44, i32 51, i32 58, i32 65, i32 72, i32 79, i32 86, i32 93, i32 100, i32 107, i32 114, i32 121, i32 128, i32 135, i32 142, i32 149, i32 156, i32 163, i32 170, i32 177, i32 184, i32 191, i32 198, i32 205, i32 212, i32 219>
7186  %strided.vec3 = shufflevector <224 x i8> %wide.vec, <224 x i8> poison, <32 x i32> <i32 3, i32 10, i32 17, i32 24, i32 31, i32 38, i32 45, i32 52, i32 59, i32 66, i32 73, i32 80, i32 87, i32 94, i32 101, i32 108, i32 115, i32 122, i32 129, i32 136, i32 143, i32 150, i32 157, i32 164, i32 171, i32 178, i32 185, i32 192, i32 199, i32 206, i32 213, i32 220>
7187  %strided.vec4 = shufflevector <224 x i8> %wide.vec, <224 x i8> poison, <32 x i32> <i32 4, i32 11, i32 18, i32 25, i32 32, i32 39, i32 46, i32 53, i32 60, i32 67, i32 74, i32 81, i32 88, i32 95, i32 102, i32 109, i32 116, i32 123, i32 130, i32 137, i32 144, i32 151, i32 158, i32 165, i32 172, i32 179, i32 186, i32 193, i32 200, i32 207, i32 214, i32 221>
7188  %strided.vec5 = shufflevector <224 x i8> %wide.vec, <224 x i8> poison, <32 x i32> <i32 5, i32 12, i32 19, i32 26, i32 33, i32 40, i32 47, i32 54, i32 61, i32 68, i32 75, i32 82, i32 89, i32 96, i32 103, i32 110, i32 117, i32 124, i32 131, i32 138, i32 145, i32 152, i32 159, i32 166, i32 173, i32 180, i32 187, i32 194, i32 201, i32 208, i32 215, i32 222>
7189  %strided.vec6 = shufflevector <224 x i8> %wide.vec, <224 x i8> poison, <32 x i32> <i32 6, i32 13, i32 20, i32 27, i32 34, i32 41, i32 48, i32 55, i32 62, i32 69, i32 76, i32 83, i32 90, i32 97, i32 104, i32 111, i32 118, i32 125, i32 132, i32 139, i32 146, i32 153, i32 160, i32 167, i32 174, i32 181, i32 188, i32 195, i32 202, i32 209, i32 216, i32 223>
7190  store <32 x i8> %strided.vec0, ptr %out.vec0, align 64
7191  store <32 x i8> %strided.vec1, ptr %out.vec1, align 64
7192  store <32 x i8> %strided.vec2, ptr %out.vec2, align 64
7193  store <32 x i8> %strided.vec3, ptr %out.vec3, align 64
7194  store <32 x i8> %strided.vec4, ptr %out.vec4, align 64
7195  store <32 x i8> %strided.vec5, ptr %out.vec5, align 64
7196  store <32 x i8> %strided.vec6, ptr %out.vec6, align 64
7197  ret void
7198}
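; Illustrative note (not part of the generated checks): the seven shufflevector
; masks above each select every 7th byte of the <224 x i8> block, starting at
; offsets 0 through 6. A scalar sketch of that access pattern, assuming
; hypothetical in/out arrays, would be roughly:
;   for (int i = 0; i < 32; ++i)
;     out[k][i] = in[7*i + k];   // k = 0..6, one output vector per k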
7199
7200define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
7201; SSE-LABEL: load_i8_stride7_vf64:
7202; SSE:       # %bb.0:
7203; SSE-NEXT:    subq $1528, %rsp # imm = 0x5F8
7204; SSE-NEXT:    movdqa 208(%rdi), %xmm12
7205; SSE-NEXT:    movdqa 192(%rdi), %xmm5
7206; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7207; SSE-NEXT:    movdqa 176(%rdi), %xmm8
7208; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7209; SSE-NEXT:    movdqa 112(%rdi), %xmm4
7210; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7211; SSE-NEXT:    movdqa 128(%rdi), %xmm3
7212; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7213; SSE-NEXT:    movdqa 160(%rdi), %xmm6
7214; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7215; SSE-NEXT:    movdqa 144(%rdi), %xmm1
7216; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7217; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,0,65535,65535]
7218; SSE-NEXT:    movdqa %xmm2, %xmm0
7219; SSE-NEXT:    pandn %xmm1, %xmm0
7220; SSE-NEXT:    movdqa %xmm6, %xmm1
7221; SSE-NEXT:    pand %xmm2, %xmm1
7222; SSE-NEXT:    movdqa %xmm2, %xmm7
7223; SSE-NEXT:    por %xmm0, %xmm1
7224; SSE-NEXT:    pxor %xmm6, %xmm6
7225; SSE-NEXT:    movdqa %xmm1, %xmm0
7226; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15]
7227; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
7228; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
7229; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
7230; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
7231; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,5,6]
7232; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
7233; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
7234; SSE-NEXT:    packuswb %xmm0, %xmm2
7235; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255]
7236; SSE-NEXT:    movdqa {{.*#+}} xmm11 = [65535,65535,0,65535,65535,65535,0,65535]
7237; SSE-NEXT:    movdqa %xmm11, %xmm1
7238; SSE-NEXT:    pandn %xmm3, %xmm1
7239; SSE-NEXT:    movdqa %xmm4, %xmm3
7240; SSE-NEXT:    pand %xmm11, %xmm3
7241; SSE-NEXT:    por %xmm1, %xmm3
7242; SSE-NEXT:    movdqa %xmm3, %xmm1
7243; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15]
7244; SSE-NEXT:    movdqa {{.*#+}} xmm10 = [65535,65535,65535,65535,0,65535,0,65535]
7245; SSE-NEXT:    movdqa %xmm10, %xmm4
7246; SSE-NEXT:    pandn %xmm1, %xmm4
7247; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
7248; SSE-NEXT:    pand %xmm10, %xmm3
7249; SSE-NEXT:    por %xmm4, %xmm3
7250; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[0,2,1,3]
7251; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
7252; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,3,1,1]
7253; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
7254; SSE-NEXT:    packuswb %xmm1, %xmm1
7255; SSE-NEXT:    pand %xmm0, %xmm1
7256; SSE-NEXT:    movdqa %xmm0, %xmm3
7257; SSE-NEXT:    pandn %xmm2, %xmm3
7258; SSE-NEXT:    por %xmm3, %xmm1
7259; SSE-NEXT:    movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,65535,65535,0,65535]
7260; SSE-NEXT:    movdqa %xmm9, %xmm2
7261; SSE-NEXT:    pandn %xmm8, %xmm2
7262; SSE-NEXT:    movdqa %xmm5, %xmm3
7263; SSE-NEXT:    pand %xmm9, %xmm3
7264; SSE-NEXT:    por %xmm2, %xmm3
7265; SSE-NEXT:    movdqa %xmm3, %xmm2
7266; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
7267; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7]
7268; SSE-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15]
7269; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
7270; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7]
7271; SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
7272; SSE-NEXT:    movdqa %xmm12, %xmm3
7273; SSE-NEXT:    movdqa %xmm12, %xmm4
7274; SSE-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15]
7275; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7276; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
7277; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7278; SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
7279; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
7280; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
7281; SSE-NEXT:    packuswb %xmm3, %xmm3
7282; SSE-NEXT:    movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,65535,65535,0]
7283; SSE-NEXT:    movdqa %xmm12, %xmm4
7284; SSE-NEXT:    pandn %xmm3, %xmm4
7285; SSE-NEXT:    packuswb %xmm2, %xmm2
7286; SSE-NEXT:    pand %xmm12, %xmm2
7287; SSE-NEXT:    por %xmm2, %xmm4
7288; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,0,0,0]
7289; SSE-NEXT:    movdqa %xmm8, %xmm2
7290; SSE-NEXT:    pandn %xmm4, %xmm2
7291; SSE-NEXT:    pand %xmm8, %xmm1
7292; SSE-NEXT:    por %xmm1, %xmm2
7293; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7294; SSE-NEXT:    movdqa 256(%rdi), %xmm2
7295; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7296; SSE-NEXT:    movdqa %xmm7, %xmm1
7297; SSE-NEXT:    pandn %xmm2, %xmm1
7298; SSE-NEXT:    movdqa 272(%rdi), %xmm2
7299; SSE-NEXT:    movdqa %xmm2, (%rsp) # 16-byte Spill
7300; SSE-NEXT:    pand %xmm7, %xmm2
7301; SSE-NEXT:    por %xmm1, %xmm2
7302; SSE-NEXT:    movdqa %xmm2, %xmm1
7303; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15]
7304; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
7305; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
7306; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
7307; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
7308; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6]
7309; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
7310; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
7311; SSE-NEXT:    packuswb %xmm1, %xmm2
7312; SSE-NEXT:    movdqa %xmm0, %xmm3
7313; SSE-NEXT:    pandn %xmm2, %xmm3
7314; SSE-NEXT:    movdqa 240(%rdi), %xmm2
7315; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7316; SSE-NEXT:    movdqa %xmm11, %xmm1
7317; SSE-NEXT:    pandn %xmm2, %xmm1
7318; SSE-NEXT:    movdqa 224(%rdi), %xmm2
7319; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7320; SSE-NEXT:    pand %xmm11, %xmm2
7321; SSE-NEXT:    por %xmm1, %xmm2
7322; SSE-NEXT:    movdqa %xmm2, %xmm1
7323; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15]
7324; SSE-NEXT:    movdqa %xmm10, %xmm4
7325; SSE-NEXT:    pandn %xmm1, %xmm4
7326; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
7327; SSE-NEXT:    pand %xmm10, %xmm2
7328; SSE-NEXT:    por %xmm4, %xmm2
7329; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,1,3]
7330; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
7331; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,3,1,1]
7332; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
7333; SSE-NEXT:    packuswb %xmm1, %xmm1
7334; SSE-NEXT:    pand %xmm0, %xmm1
7335; SSE-NEXT:    por %xmm3, %xmm1
7336; SSE-NEXT:    movdqa 288(%rdi), %xmm3
7337; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7338; SSE-NEXT:    movdqa %xmm9, %xmm2
7339; SSE-NEXT:    pandn %xmm3, %xmm2
7340; SSE-NEXT:    movdqa 304(%rdi), %xmm3
7341; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7342; SSE-NEXT:    pand %xmm9, %xmm3
7343; SSE-NEXT:    por %xmm2, %xmm3
7344; SSE-NEXT:    movdqa %xmm3, %xmm2
7345; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
7346; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7]
7347; SSE-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15]
7348; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
7349; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7]
7350; SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
7351; SSE-NEXT:    movdqa 320(%rdi), %xmm3
7352; SSE-NEXT:    movdqa %xmm3, %xmm4
7353; SSE-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15]
7354; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7355; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
7356; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7357; SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
7358; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
7359; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
7360; SSE-NEXT:    packuswb %xmm3, %xmm3
7361; SSE-NEXT:    movdqa %xmm12, %xmm4
7362; SSE-NEXT:    pandn %xmm3, %xmm4
7363; SSE-NEXT:    packuswb %xmm2, %xmm2
7364; SSE-NEXT:    pand %xmm12, %xmm2
7365; SSE-NEXT:    por %xmm2, %xmm4
7366; SSE-NEXT:    movdqa %xmm8, %xmm2
7367; SSE-NEXT:    pandn %xmm4, %xmm2
7368; SSE-NEXT:    pand %xmm8, %xmm1
7369; SSE-NEXT:    por %xmm1, %xmm2
7370; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7371; SSE-NEXT:    movdqa 368(%rdi), %xmm2
7372; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7373; SSE-NEXT:    movdqa %xmm7, %xmm1
7374; SSE-NEXT:    pandn %xmm2, %xmm1
7375; SSE-NEXT:    movdqa 384(%rdi), %xmm2
7376; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7377; SSE-NEXT:    pand %xmm7, %xmm2
7378; SSE-NEXT:    por %xmm1, %xmm2
7379; SSE-NEXT:    movdqa %xmm2, %xmm1
7380; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15]
7381; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
7382; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
7383; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
7384; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
7385; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6]
7386; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
7387; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
7388; SSE-NEXT:    packuswb %xmm1, %xmm2
7389; SSE-NEXT:    movdqa %xmm0, %xmm3
7390; SSE-NEXT:    pandn %xmm2, %xmm3
7391; SSE-NEXT:    movdqa 352(%rdi), %xmm2
7392; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7393; SSE-NEXT:    movdqa %xmm11, %xmm1
7394; SSE-NEXT:    pandn %xmm2, %xmm1
7395; SSE-NEXT:    movdqa 336(%rdi), %xmm2
7396; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7397; SSE-NEXT:    pand %xmm11, %xmm2
7398; SSE-NEXT:    por %xmm1, %xmm2
7399; SSE-NEXT:    movdqa %xmm2, %xmm1
7400; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15]
7401; SSE-NEXT:    movdqa %xmm10, %xmm4
7402; SSE-NEXT:    pandn %xmm1, %xmm4
7403; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
7404; SSE-NEXT:    pand %xmm10, %xmm2
7405; SSE-NEXT:    por %xmm4, %xmm2
7406; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,1,3]
7407; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
7408; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,3,1,1]
7409; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
7410; SSE-NEXT:    packuswb %xmm1, %xmm1
7411; SSE-NEXT:    pand %xmm0, %xmm1
7412; SSE-NEXT:    por %xmm3, %xmm1
7413; SSE-NEXT:    movdqa 400(%rdi), %xmm3
7414; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7415; SSE-NEXT:    movdqa %xmm9, %xmm2
7416; SSE-NEXT:    pandn %xmm3, %xmm2
7417; SSE-NEXT:    movdqa 416(%rdi), %xmm14
7418; SSE-NEXT:    movdqa %xmm14, %xmm3
7419; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7420; SSE-NEXT:    pand %xmm9, %xmm3
7421; SSE-NEXT:    por %xmm2, %xmm3
7422; SSE-NEXT:    movdqa %xmm3, %xmm2
7423; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
7424; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7]
7425; SSE-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15]
7426; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
7427; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7]
7428; SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
7429; SSE-NEXT:    movdqa 432(%rdi), %xmm3
7430; SSE-NEXT:    movdqa %xmm3, %xmm4
7431; SSE-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15]
7432; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7433; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
7434; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7435; SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
7436; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
7437; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
7438; SSE-NEXT:    packuswb %xmm3, %xmm3
7439; SSE-NEXT:    movdqa %xmm12, %xmm4
7440; SSE-NEXT:    pandn %xmm3, %xmm4
7441; SSE-NEXT:    packuswb %xmm2, %xmm2
7442; SSE-NEXT:    pand %xmm12, %xmm2
7443; SSE-NEXT:    por %xmm2, %xmm4
7444; SSE-NEXT:    movdqa %xmm8, %xmm2
7445; SSE-NEXT:    pandn %xmm4, %xmm2
7446; SSE-NEXT:    pand %xmm8, %xmm1
7447; SSE-NEXT:    por %xmm1, %xmm2
7448; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7449; SSE-NEXT:    movdqa 32(%rdi), %xmm15
7450; SSE-NEXT:    movdqa %xmm7, %xmm1
7451; SSE-NEXT:    pandn %xmm15, %xmm1
7452; SSE-NEXT:    movdqa 48(%rdi), %xmm2
7453; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7454; SSE-NEXT:    pand %xmm7, %xmm2
7455; SSE-NEXT:    por %xmm1, %xmm2
7456; SSE-NEXT:    movdqa %xmm2, %xmm1
7457; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15]
7458; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
7459; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
7460; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
7461; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
7462; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6]
7463; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
7464; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
7465; SSE-NEXT:    packuswb %xmm1, %xmm2
7466; SSE-NEXT:    movdqa 16(%rdi), %xmm3
7467; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7468; SSE-NEXT:    movdqa %xmm11, %xmm1
7469; SSE-NEXT:    pandn %xmm3, %xmm1
7470; SSE-NEXT:    movdqa (%rdi), %xmm4
7471; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7472; SSE-NEXT:    pand %xmm11, %xmm4
7473; SSE-NEXT:    por %xmm1, %xmm4
7474; SSE-NEXT:    movdqa %xmm4, %xmm1
7475; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15]
7476; SSE-NEXT:    movdqa %xmm10, %xmm5
7477; SSE-NEXT:    pandn %xmm1, %xmm5
7478; SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
7479; SSE-NEXT:    pand %xmm10, %xmm4
7480; SSE-NEXT:    por %xmm5, %xmm4
7481; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[0,2,1,3]
7482; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
7483; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,3,1,1]
7484; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
7485; SSE-NEXT:    packuswb %xmm1, %xmm1
7486; SSE-NEXT:    pand %xmm0, %xmm1
7487; SSE-NEXT:    pandn %xmm2, %xmm0
7488; SSE-NEXT:    por %xmm0, %xmm1
7489; SSE-NEXT:    movdqa 64(%rdi), %xmm2
7490; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7491; SSE-NEXT:    movdqa %xmm9, %xmm0
7492; SSE-NEXT:    pandn %xmm2, %xmm0
7493; SSE-NEXT:    movdqa 80(%rdi), %xmm2
7494; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7495; SSE-NEXT:    pand %xmm9, %xmm2
7496; SSE-NEXT:    por %xmm0, %xmm2
7497; SSE-NEXT:    movdqa %xmm2, %xmm0
7498; SSE-NEXT:    pxor %xmm5, %xmm5
7499; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
7500; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
7501; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
7502; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
7503; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7]
7504; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
7505; SSE-NEXT:    movdqa 96(%rdi), %xmm2
7506; SSE-NEXT:    movdqa %xmm2, %xmm3
7507; SSE-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
7508; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7509; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
7510; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7511; SSE-NEXT:    pxor %xmm7, %xmm7
7512; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
7513; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
7514; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
7515; SSE-NEXT:    packuswb %xmm2, %xmm2
7516; SSE-NEXT:    movdqa %xmm12, %xmm4
7517; SSE-NEXT:    pandn %xmm2, %xmm4
7518; SSE-NEXT:    packuswb %xmm0, %xmm0
7519; SSE-NEXT:    pand %xmm12, %xmm0
7520; SSE-NEXT:    por %xmm0, %xmm4
7521; SSE-NEXT:    pand %xmm8, %xmm1
7522; SSE-NEXT:    pandn %xmm4, %xmm8
7523; SSE-NEXT:    por %xmm1, %xmm8
7524; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7525; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,0,65535,65535]
7526; SSE-NEXT:    movdqa %xmm2, %xmm0
7527; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
7528; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7529; SSE-NEXT:    pand %xmm2, %xmm1
7530; SSE-NEXT:    movdqa %xmm2, %xmm13
7531; SSE-NEXT:    por %xmm0, %xmm1
7532; SSE-NEXT:    movdqa %xmm1, %xmm2
7533; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15]
7534; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,65535,65535]
7535; SSE-NEXT:    movdqa %xmm0, %xmm4
7536; SSE-NEXT:    pandn %xmm2, %xmm4
7537; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
7538; SSE-NEXT:    pand %xmm0, %xmm1
7539; SSE-NEXT:    por %xmm4, %xmm1
7540; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
7541; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6]
7542; SSE-NEXT:    psrld $16, %xmm2
7543; SSE-NEXT:    packuswb %xmm2, %xmm1
7544; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255]
7545; SSE-NEXT:    movdqa %xmm4, %xmm2
7546; SSE-NEXT:    movdqa %xmm4, %xmm8
7547; SSE-NEXT:    pandn %xmm1, %xmm2
7548; SSE-NEXT:    movdqa %xmm9, %xmm1
7549; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7550; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
7551; SSE-NEXT:    pand %xmm9, %xmm4
7552; SSE-NEXT:    por %xmm1, %xmm4
7553; SSE-NEXT:    movdqa %xmm4, %xmm1
7554; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
7555; SSE-NEXT:    movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,65535,65535,0,65535]
7556; SSE-NEXT:    movdqa %xmm6, %xmm5
7557; SSE-NEXT:    pandn %xmm1, %xmm5
7558; SSE-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15]
7559; SSE-NEXT:    pand %xmm6, %xmm4
7560; SSE-NEXT:    por %xmm5, %xmm4
7561; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[0,3,2,3]
7562; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
7563; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
7564; SSE-NEXT:    packuswb %xmm1, %xmm1
7565; SSE-NEXT:    pand %xmm8, %xmm1
7566; SSE-NEXT:    por %xmm2, %xmm1
7567; SSE-NEXT:    movdqa %xmm11, %xmm2
7568; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7569; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
7570; SSE-NEXT:    pand %xmm11, %xmm4
7571; SSE-NEXT:    por %xmm2, %xmm4
7572; SSE-NEXT:    movdqa %xmm4, %xmm2
7573; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15]
7574; SSE-NEXT:    movdqa %xmm10, %xmm5
7575; SSE-NEXT:    pandn %xmm2, %xmm5
7576; SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7]
7577; SSE-NEXT:    pand %xmm10, %xmm4
7578; SSE-NEXT:    por %xmm5, %xmm4
7579; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7580; SSE-NEXT:    pslld $16, %xmm2
7581; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
7582; SSE-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
7583; SSE-NEXT:    packuswb %xmm5, %xmm2
7584; SSE-NEXT:    movdqa %xmm12, %xmm5
7585; SSE-NEXT:    pandn %xmm2, %xmm5
7586; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3]
7587; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,3,2,4,5,6,7]
7588; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7]
7589; SSE-NEXT:    packuswb %xmm2, %xmm2
7590; SSE-NEXT:    pand %xmm12, %xmm2
7591; SSE-NEXT:    por %xmm2, %xmm5
7592; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0]
7593; SSE-NEXT:    movdqa %xmm4, %xmm2
7594; SSE-NEXT:    pandn %xmm5, %xmm2
7595; SSE-NEXT:    pand %xmm4, %xmm1
7596; SSE-NEXT:    movdqa %xmm4, %xmm3
7597; SSE-NEXT:    por %xmm1, %xmm2
7598; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7599; SSE-NEXT:    movdqa %xmm13, %xmm1
7600; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7601; SSE-NEXT:    movdqa (%rsp), %xmm2 # 16-byte Reload
7602; SSE-NEXT:    pand %xmm13, %xmm2
7603; SSE-NEXT:    por %xmm1, %xmm2
7604; SSE-NEXT:    movdqa %xmm2, %xmm1
7605; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15]
7606; SSE-NEXT:    movdqa %xmm0, %xmm4
7607; SSE-NEXT:    pandn %xmm1, %xmm4
7608; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7]
7609; SSE-NEXT:    pand %xmm0, %xmm2
7610; SSE-NEXT:    por %xmm4, %xmm2
7611; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
7612; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6]
7613; SSE-NEXT:    psrld $16, %xmm1
7614; SSE-NEXT:    packuswb %xmm1, %xmm2
7615; SSE-NEXT:    movdqa %xmm8, %xmm4
7616; SSE-NEXT:    pandn %xmm2, %xmm4
7617; SSE-NEXT:    movdqa %xmm9, %xmm1
7618; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7619; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7620; SSE-NEXT:    pand %xmm9, %xmm2
7621; SSE-NEXT:    por %xmm1, %xmm2
7622; SSE-NEXT:    movdqa %xmm2, %xmm1
7623; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
7624; SSE-NEXT:    movdqa %xmm6, %xmm5
7625; SSE-NEXT:    pandn %xmm1, %xmm5
7626; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15]
7627; SSE-NEXT:    pand %xmm6, %xmm2
7628; SSE-NEXT:    por %xmm5, %xmm2
7629; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,3,2,3]
7630; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
7631; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
7632; SSE-NEXT:    packuswb %xmm1, %xmm1
7633; SSE-NEXT:    pand %xmm8, %xmm1
7634; SSE-NEXT:    por %xmm4, %xmm1
7635; SSE-NEXT:    movdqa %xmm11, %xmm2
7636; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7637; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
7638; SSE-NEXT:    pand %xmm11, %xmm4
7639; SSE-NEXT:    por %xmm2, %xmm4
7640; SSE-NEXT:    movdqa %xmm4, %xmm2
7641; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15]
7642; SSE-NEXT:    movdqa %xmm10, %xmm5
7643; SSE-NEXT:    pandn %xmm2, %xmm5
7644; SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7]
7645; SSE-NEXT:    pand %xmm10, %xmm4
7646; SSE-NEXT:    por %xmm5, %xmm4
7647; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7648; SSE-NEXT:    pslld $16, %xmm2
7649; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
7650; SSE-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
7651; SSE-NEXT:    packuswb %xmm5, %xmm2
7652; SSE-NEXT:    movdqa %xmm12, %xmm5
7653; SSE-NEXT:    pandn %xmm2, %xmm5
7654; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3]
7655; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,3,2,4,5,6,7]
7656; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7]
7657; SSE-NEXT:    packuswb %xmm2, %xmm2
7658; SSE-NEXT:    pand %xmm12, %xmm2
7659; SSE-NEXT:    por %xmm2, %xmm5
7660; SSE-NEXT:    movdqa %xmm3, %xmm2
7661; SSE-NEXT:    pandn %xmm5, %xmm2
7662; SSE-NEXT:    pand %xmm3, %xmm1
7663; SSE-NEXT:    por %xmm1, %xmm2
7664; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7665; SSE-NEXT:    movdqa %xmm13, %xmm1
7666; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7667; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7668; SSE-NEXT:    pand %xmm13, %xmm2
7669; SSE-NEXT:    por %xmm1, %xmm2
7670; SSE-NEXT:    movdqa %xmm2, %xmm1
7671; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15]
7672; SSE-NEXT:    movdqa %xmm0, %xmm4
7673; SSE-NEXT:    pandn %xmm1, %xmm4
7674; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7]
7675; SSE-NEXT:    pand %xmm0, %xmm2
7676; SSE-NEXT:    por %xmm4, %xmm2
7677; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
7678; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6]
7679; SSE-NEXT:    psrld $16, %xmm1
7680; SSE-NEXT:    packuswb %xmm1, %xmm2
7681; SSE-NEXT:    movdqa %xmm8, %xmm4
7682; SSE-NEXT:    pandn %xmm2, %xmm4
7683; SSE-NEXT:    movdqa %xmm9, %xmm1
7684; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7685; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7686; SSE-NEXT:    pand %xmm9, %xmm2
7687; SSE-NEXT:    por %xmm1, %xmm2
7688; SSE-NEXT:    movdqa %xmm2, %xmm1
7689; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
7690; SSE-NEXT:    movdqa %xmm6, %xmm5
7691; SSE-NEXT:    pandn %xmm1, %xmm5
7692; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15]
7693; SSE-NEXT:    pand %xmm6, %xmm2
7694; SSE-NEXT:    por %xmm5, %xmm2
7695; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,3,2,3]
7696; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
7697; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
7698; SSE-NEXT:    packuswb %xmm1, %xmm1
7699; SSE-NEXT:    pand %xmm8, %xmm1
7700; SSE-NEXT:    por %xmm4, %xmm1
7701; SSE-NEXT:    movdqa %xmm11, %xmm2
7702; SSE-NEXT:    pandn %xmm14, %xmm2
7703; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
7704; SSE-NEXT:    pand %xmm11, %xmm4
7705; SSE-NEXT:    por %xmm2, %xmm4
7706; SSE-NEXT:    movdqa %xmm4, %xmm2
7707; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15]
7708; SSE-NEXT:    movdqa %xmm10, %xmm5
7709; SSE-NEXT:    pandn %xmm2, %xmm5
7710; SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7]
7711; SSE-NEXT:    pand %xmm10, %xmm4
7712; SSE-NEXT:    por %xmm5, %xmm4
7713; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7714; SSE-NEXT:    pslld $16, %xmm2
7715; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
7716; SSE-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
7717; SSE-NEXT:    packuswb %xmm5, %xmm2
7718; SSE-NEXT:    movdqa %xmm12, %xmm5
7719; SSE-NEXT:    pandn %xmm2, %xmm5
7720; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3]
7721; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,3,2,4,5,6,7]
7722; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7]
7723; SSE-NEXT:    packuswb %xmm2, %xmm2
7724; SSE-NEXT:    pand %xmm12, %xmm2
7725; SSE-NEXT:    por %xmm2, %xmm5
7726; SSE-NEXT:    movdqa %xmm3, %xmm2
7727; SSE-NEXT:    pandn %xmm5, %xmm2
7728; SSE-NEXT:    pand %xmm3, %xmm1
7729; SSE-NEXT:    por %xmm1, %xmm2
7730; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7731; SSE-NEXT:    movdqa %xmm13, %xmm1
7732; SSE-NEXT:    pandn %xmm15, %xmm1
7733; SSE-NEXT:    movdqa %xmm15, %xmm3
7734; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
7735; SSE-NEXT:    movdqa %xmm15, %xmm2
7736; SSE-NEXT:    pand %xmm13, %xmm2
7737; SSE-NEXT:    por %xmm1, %xmm2
7738; SSE-NEXT:    movdqa %xmm2, %xmm1
7739; SSE-NEXT:    pxor %xmm4, %xmm4
7740; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
7741; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
7742; SSE-NEXT:    pxor %xmm5, %xmm5
7743; SSE-NEXT:    pand %xmm0, %xmm2
7744; SSE-NEXT:    pandn %xmm1, %xmm0
7745; SSE-NEXT:    por %xmm2, %xmm0
7746; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
7747; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,7,6]
7748; SSE-NEXT:    psrld $16, %xmm1
7749; SSE-NEXT:    packuswb %xmm1, %xmm0
7750; SSE-NEXT:    movdqa %xmm8, %xmm1
7751; SSE-NEXT:    pandn %xmm0, %xmm1
7752; SSE-NEXT:    movdqa %xmm9, %xmm0
7753; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
7754; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7755; SSE-NEXT:    pand %xmm9, %xmm2
7756; SSE-NEXT:    movdqa %xmm9, %xmm12
7757; SSE-NEXT:    por %xmm0, %xmm2
7758; SSE-NEXT:    movdqa %xmm2, %xmm0
7759; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
7760; SSE-NEXT:    movdqa %xmm6, %xmm4
7761; SSE-NEXT:    pandn %xmm0, %xmm4
7762; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
7763; SSE-NEXT:    pand %xmm6, %xmm2
7764; SSE-NEXT:    por %xmm4, %xmm2
7765; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,3,2,3]
7766; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
7767; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
7768; SSE-NEXT:    packuswb %xmm0, %xmm0
7769; SSE-NEXT:    pand %xmm8, %xmm0
7770; SSE-NEXT:    por %xmm1, %xmm0
7771; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7772; SSE-NEXT:    movdqa %xmm11, %xmm0
7773; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
7774; SSE-NEXT:    pandn %xmm14, %xmm0
7775; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7776; SSE-NEXT:    pand %xmm11, %xmm1
7777; SSE-NEXT:    por %xmm0, %xmm1
7778; SSE-NEXT:    movdqa %xmm1, %xmm0
7779; SSE-NEXT:    pxor %xmm2, %xmm2
7780; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
7781; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
7782; SSE-NEXT:    pand %xmm10, %xmm1
7783; SSE-NEXT:    pandn %xmm0, %xmm10
7784; SSE-NEXT:    por %xmm1, %xmm10
7785; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7786; SSE-NEXT:    movdqa %xmm11, %xmm0
7787; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
7788; SSE-NEXT:    pandn %xmm7, %xmm0
7789; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7790; SSE-NEXT:    movdqa %xmm2, %xmm1
7791; SSE-NEXT:    pand %xmm11, %xmm1
7792; SSE-NEXT:    por %xmm0, %xmm1
7793; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7794; SSE-NEXT:    movdqa %xmm11, %xmm0
7795; SSE-NEXT:    movdqa (%rsp), %xmm8 # 16-byte Reload
7796; SSE-NEXT:    pandn %xmm8, %xmm0
7797; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
7798; SSE-NEXT:    movdqa %xmm5, %xmm1
7799; SSE-NEXT:    pand %xmm11, %xmm1
7800; SSE-NEXT:    por %xmm0, %xmm1
7801; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7802; SSE-NEXT:    movdqa %xmm11, %xmm0
7803; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
7804; SSE-NEXT:    pandn %xmm9, %xmm0
7805; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7806; SSE-NEXT:    movdqa %xmm1, %xmm4
7807; SSE-NEXT:    pand %xmm11, %xmm4
7808; SSE-NEXT:    por %xmm0, %xmm4
7809; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7810; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7811; SSE-NEXT:    movdqa %xmm3, %xmm0
7812; SSE-NEXT:    pand %xmm11, %xmm0
7813; SSE-NEXT:    movdqa %xmm15, %xmm6
7814; SSE-NEXT:    pandn %xmm15, %xmm11
7815; SSE-NEXT:    por %xmm0, %xmm11
7816; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7817; SSE-NEXT:    movdqa {{.*#+}} xmm15 = [65535,0,65535,65535,0,65535,65535,65535]
7818; SSE-NEXT:    movdqa %xmm15, %xmm0
7819; SSE-NEXT:    pandn %xmm2, %xmm0
7820; SSE-NEXT:    movdqa %xmm12, %xmm2
7821; SSE-NEXT:    movdqa %xmm7, %xmm4
7822; SSE-NEXT:    pandn %xmm7, %xmm2
7823; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7824; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3]
7825; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7826; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7827; SSE-NEXT:    pand %xmm15, %xmm4
7828; SSE-NEXT:    por %xmm0, %xmm4
7829; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7830; SSE-NEXT:    movdqa %xmm15, %xmm0
7831; SSE-NEXT:    pandn %xmm5, %xmm0
7832; SSE-NEXT:    movdqa %xmm12, %xmm2
7833; SSE-NEXT:    movdqa %xmm8, %xmm4
7834; SSE-NEXT:    pandn %xmm8, %xmm2
7835; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7836; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm8[0,2,2,3]
7837; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7838; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7839; SSE-NEXT:    pand %xmm15, %xmm4
7840; SSE-NEXT:    por %xmm0, %xmm4
7841; SSE-NEXT:    movdqa %xmm4, (%rsp) # 16-byte Spill
7842; SSE-NEXT:    movdqa %xmm15, %xmm0
7843; SSE-NEXT:    pandn %xmm1, %xmm0
7844; SSE-NEXT:    movdqa %xmm12, %xmm2
7845; SSE-NEXT:    movdqa %xmm12, %xmm1
7846; SSE-NEXT:    movdqa %xmm9, %xmm4
7847; SSE-NEXT:    pandn %xmm9, %xmm1
7848; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7849; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm9[0,2,2,3]
7850; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7851; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7852; SSE-NEXT:    pand %xmm15, %xmm4
7853; SSE-NEXT:    por %xmm0, %xmm4
7854; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7855; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
7856; SSE-NEXT:    pand %xmm15, %xmm9
7857; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
7858; SSE-NEXT:    pand %xmm15, %xmm12
7859; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7860; SSE-NEXT:    pand %xmm15, %xmm0
7861; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7862; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7863; SSE-NEXT:    pand %xmm15, %xmm0
7864; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7865; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7866; SSE-NEXT:    pand %xmm15, %xmm0
7867; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7868; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7869; SSE-NEXT:    pand %xmm15, %xmm0
7870; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7871; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7872; SSE-NEXT:    pand %xmm15, %xmm0
7873; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7874; SSE-NEXT:    movdqa %xmm14, %xmm0
7875; SSE-NEXT:    pand %xmm15, %xmm0
7876; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7877; SSE-NEXT:    movdqa %xmm2, %xmm4
7878; SSE-NEXT:    movdqa %xmm6, %xmm0
7879; SSE-NEXT:    pandn %xmm6, %xmm4
7880; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7881; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm6[0,2,2,3]
7882; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7883; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7884; SSE-NEXT:    pand %xmm15, %xmm0
7885; SSE-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7886; SSE-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7887; SSE-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7888; SSE-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7889; SSE-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7890; SSE-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7891; SSE-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7892; SSE-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7893; SSE-NEXT:    pandn %xmm3, %xmm15
7894; SSE-NEXT:    por %xmm0, %xmm15
7895; SSE-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7896; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535]
7897; SSE-NEXT:    movdqa %xmm1, %xmm2
7898; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
7899; SSE-NEXT:    pandn %xmm7, %xmm2
7900; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7901; SSE-NEXT:    movdqa %xmm7, %xmm10
7902; SSE-NEXT:    movdqa %xmm7, %xmm4
7903; SSE-NEXT:    movdqa %xmm1, %xmm2
7904; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
7905; SSE-NEXT:    pandn %xmm6, %xmm2
7906; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7907; SSE-NEXT:    movdqa %xmm6, %xmm8
7908; SSE-NEXT:    movdqa %xmm1, %xmm2
7909; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
7910; SSE-NEXT:    pandn %xmm5, %xmm2
7911; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7912; SSE-NEXT:    movdqa %xmm5, %xmm1
7913; SSE-NEXT:    movdqa %xmm5, %xmm11
7914; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
7915; SSE-NEXT:    movdqa %xmm3, %xmm13
7916; SSE-NEXT:    pslld $16, %xmm13
7917; SSE-NEXT:    psrldq {{.*#+}} xmm10 = xmm10[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
7918; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
7919; SSE-NEXT:    punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3]
7920; SSE-NEXT:    movdqa %xmm6, %xmm0
7921; SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
7922; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
7923; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
7924; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7925; SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
7926; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7927; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
7928; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7929; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7930; SSE-NEXT:    movdqa %xmm1, %xmm2
7931; SSE-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
7932; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
7933; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7934; SSE-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7]
7935; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7936; SSE-NEXT:    movdqa %xmm14, %xmm4
7937; SSE-NEXT:    punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7]
7938; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7939; SSE-NEXT:    movdqa %xmm15, %xmm8
7940; SSE-NEXT:    punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7]
7941; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7942; SSE-NEXT:    movdqa %xmm0, %xmm11
7943; SSE-NEXT:    movdqa %xmm1, %xmm0
7944; SSE-NEXT:    movdqa %xmm1, %xmm2
7945; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
7946; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7947; SSE-NEXT:    movdqa %xmm3, %xmm1
7948; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
7949; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7950; SSE-NEXT:    pxor %xmm0, %xmm0
7951; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
7952; SSE-NEXT:    pshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,7,5,6,7]
7953; SSE-NEXT:    pshufd {{.*#+}} xmm14 = xmm14[0,1,2,0]
7954; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm14[0,1,2,3,6,4,6,5]
7955; SSE-NEXT:    movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,65535,0,65535]
7956; SSE-NEXT:    pand %xmm14, %xmm3
7957; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7958; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
7959; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7960; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
7961; SSE-NEXT:    pshufhw {{.*#+}} xmm15 = xmm3[0,1,2,3,7,5,6,7]
7962; SSE-NEXT:    pshufd {{.*#+}} xmm15 = xmm15[0,1,2,0]
7963; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,6,4,6,5]
7964; SSE-NEXT:    pand %xmm14, %xmm3
7965; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7966; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
7967; SSE-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7968; SSE-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
7969; SSE-NEXT:    pxor %xmm3, %xmm3
7970; SSE-NEXT:    pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,5,6,7]
7971; SSE-NEXT:    pshufd {{.*#+}} xmm15 = xmm15[0,1,2,0]
7972; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,6,4,6,5]
7973; SSE-NEXT:    pand %xmm14, %xmm0
7974; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7975; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7976; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7977; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
7978; SSE-NEXT:    pshufhw {{.*#+}} xmm15 = xmm0[0,1,2,3,7,5,6,7]
7979; SSE-NEXT:    pshufd {{.*#+}} xmm15 = xmm15[0,1,2,0]
7980; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,6,4,6,5]
7981; SSE-NEXT:    movdqa %xmm14, %xmm0
7982; SSE-NEXT:    pand %xmm14, %xmm3
7983; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7984; SSE-NEXT:    movdqa %xmm14, %xmm3
7985; SSE-NEXT:    pandn %xmm4, %xmm3
7986; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7987; SSE-NEXT:    pand %xmm14, %xmm7
7988; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7989; SSE-NEXT:    movdqa %xmm14, %xmm3
7990; SSE-NEXT:    pandn %xmm8, %xmm3
7991; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7992; SSE-NEXT:    pand %xmm14, %xmm6
7993; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7994; SSE-NEXT:    movdqa %xmm14, %xmm3
7995; SSE-NEXT:    movdqa %xmm11, %xmm6
7996; SSE-NEXT:    pandn %xmm11, %xmm3
7997; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7998; SSE-NEXT:    pand %xmm14, %xmm5
7999; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8000; SSE-NEXT:    movdqa %xmm2, %xmm3
8001; SSE-NEXT:    pand %xmm14, %xmm3
8002; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8003; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8004; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8005; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8006; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8007; SSE-NEXT:    pandn %xmm1, %xmm0
8008; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8009; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8010; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8011; SSE-NEXT:    pxor %xmm0, %xmm0
8012; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
8013; SSE-NEXT:    movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,65535,65535,65535,65535]
8014; SSE-NEXT:    pand %xmm11, %xmm3
8015; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8016; SSE-NEXT:    pand %xmm11, %xmm4
8017; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8018; SSE-NEXT:    movdqa (%rsp), %xmm3 # 16-byte Reload
8019; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8020; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
8021; SSE-NEXT:    pand %xmm11, %xmm3
8022; SSE-NEXT:    movdqa %xmm3, (%rsp) # 16-byte Spill
8023; SSE-NEXT:    pand %xmm11, %xmm8
8024; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8025; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8026; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8027; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
8028; SSE-NEXT:    pxor %xmm8, %xmm8
8029; SSE-NEXT:    pand %xmm11, %xmm3
8030; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8031; SSE-NEXT:    pand %xmm11, %xmm6
8032; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8033; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8034; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8035; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
8036; SSE-NEXT:    pand %xmm11, %xmm0
8037; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8038; SSE-NEXT:    pand %xmm11, %xmm1
8039; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8040; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8041; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8042; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8043; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8044; SSE-NEXT:    pandn %xmm2, %xmm11
8045; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8046; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3]
8047; SSE-NEXT:    packuswb %xmm2, %xmm3
8048; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,65535,0]
8049; SSE-NEXT:    movdqa %xmm0, %xmm15
8050; SSE-NEXT:    pandn %xmm3, %xmm15
8051; SSE-NEXT:    pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
8052; SSE-NEXT:    # xmm3 = mem[0,3,2,3]
8053; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,0,3,2,4,5,6,7]
8054; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,6,7]
8055; SSE-NEXT:    packuswb %xmm3, %xmm3
8056; SSE-NEXT:    pand %xmm0, %xmm3
8057; SSE-NEXT:    movdqa %xmm0, %xmm4
8058; SSE-NEXT:    por %xmm3, %xmm15
8059; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0]
8060; SSE-NEXT:    movdqa %xmm0, %xmm3
8061; SSE-NEXT:    pandn %xmm15, %xmm3
8062; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8063; SSE-NEXT:    pand %xmm0, %xmm2
8064; SSE-NEXT:    movdqa %xmm0, %xmm13
8065; SSE-NEXT:    por %xmm2, %xmm3
8066; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8067; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,0,65535]
8068; SSE-NEXT:    movdqa %xmm0, %xmm3
8069; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
8070; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8071; SSE-NEXT:    pand %xmm0, %xmm15
8072; SSE-NEXT:    movdqa %xmm0, %xmm5
8073; SSE-NEXT:    por %xmm3, %xmm15
8074; SSE-NEXT:    movdqa %xmm15, %xmm3
8075; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
8076; SSE-NEXT:    movdqa {{.*#+}} xmm14 = [65535,0,65535,65535,65535,65535,0,65535]
8077; SSE-NEXT:    movdqa %xmm14, %xmm0
8078; SSE-NEXT:    pandn %xmm3, %xmm0
8079; SSE-NEXT:    punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm8[8],xmm15[9],xmm8[9],xmm15[10],xmm8[10],xmm15[11],xmm8[11],xmm15[12],xmm8[12],xmm15[13],xmm8[13],xmm15[14],xmm8[14],xmm15[15],xmm8[15]
8080; SSE-NEXT:    pand %xmm14, %xmm15
8081; SSE-NEXT:    por %xmm0, %xmm15
8082; SSE-NEXT:    packuswb %xmm10, %xmm0
8083; SSE-NEXT:    movdqa %xmm4, %xmm2
8084; SSE-NEXT:    pandn %xmm0, %xmm2
8085; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm15[0,3,2,3]
8086; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7]
8087; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
8088; SSE-NEXT:    packuswb %xmm0, %xmm0
8089; SSE-NEXT:    pand %xmm4, %xmm0
8090; SSE-NEXT:    por %xmm0, %xmm2
8091; SSE-NEXT:    movdqa %xmm13, %xmm3
8092; SSE-NEXT:    pandn %xmm2, %xmm3
8093; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8094; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8095; SSE-NEXT:    por %xmm0, %xmm9
8096; SSE-NEXT:    movdqa %xmm9, %xmm0
8097; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
8098; SSE-NEXT:    movdqa %xmm14, %xmm2
8099; SSE-NEXT:    pandn %xmm0, %xmm2
8100; SSE-NEXT:    punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
8101; SSE-NEXT:    pand %xmm14, %xmm9
8102; SSE-NEXT:    por %xmm2, %xmm9
8103; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
8104; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,3,2,3]
8105; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8106; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8107; SSE-NEXT:    movdqa %xmm1, %xmm2
8108; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15]
8109; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535]
8110; SSE-NEXT:    movdqa %xmm0, %xmm15
8111; SSE-NEXT:    pandn %xmm2, %xmm15
8112; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
8113; SSE-NEXT:    pand %xmm0, %xmm1
8114; SSE-NEXT:    por %xmm15, %xmm1
8115; SSE-NEXT:    pshufd {{.*#+}} xmm11 = xmm1[0,1,2,1]
8116; SSE-NEXT:    pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,4,7]
8117; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
8118; SSE-NEXT:    packuswb %xmm2, %xmm11
8119; SSE-NEXT:    movdqa {{.*#+}} xmm6 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255]
8120; SSE-NEXT:    movdqa %xmm6, %xmm2
8121; SSE-NEXT:    pandn %xmm11, %xmm2
8122; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm9[0,2,1,3]
8123; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
8124; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,3,3]
8125; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7]
8126; SSE-NEXT:    packuswb %xmm1, %xmm1
8127; SSE-NEXT:    pand %xmm6, %xmm1
8128; SSE-NEXT:    por %xmm1, %xmm2
8129; SSE-NEXT:    pand %xmm13, %xmm2
8130; SSE-NEXT:    por %xmm3, %xmm2
8131; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8132; SSE-NEXT:    movdqa %xmm5, %xmm15
8133; SSE-NEXT:    movdqa %xmm5, %xmm1
8134; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8135; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8136; SSE-NEXT:    pand %xmm5, %xmm2
8137; SSE-NEXT:    por %xmm1, %xmm2
8138; SSE-NEXT:    movdqa %xmm2, %xmm1
8139; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
8140; SSE-NEXT:    movdqa %xmm14, %xmm3
8141; SSE-NEXT:    pandn %xmm1, %xmm3
8142; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15]
8143; SSE-NEXT:    pand %xmm14, %xmm2
8144; SSE-NEXT:    por %xmm3, %xmm2
8145; SSE-NEXT:    packuswb {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8146; SSE-NEXT:    movdqa %xmm4, %xmm3
8147; SSE-NEXT:    pandn %xmm1, %xmm3
8148; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,3,2,3]
8149; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
8150; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
8151; SSE-NEXT:    packuswb %xmm1, %xmm1
8152; SSE-NEXT:    pand %xmm4, %xmm1
8153; SSE-NEXT:    movdqa %xmm4, %xmm10
8154; SSE-NEXT:    por %xmm1, %xmm3
8155; SSE-NEXT:    movdqa %xmm13, %xmm1
8156; SSE-NEXT:    pandn %xmm3, %xmm1
8157; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8158; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8159; SSE-NEXT:    por %xmm2, %xmm12
8160; SSE-NEXT:    movdqa %xmm12, %xmm2
8161; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15]
8162; SSE-NEXT:    movdqa %xmm14, %xmm3
8163; SSE-NEXT:    pandn %xmm2, %xmm3
8164; SSE-NEXT:    punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7]
8165; SSE-NEXT:    pand %xmm14, %xmm12
8166; SSE-NEXT:    por %xmm3, %xmm12
8167; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
8168; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm11[1,3,2,3]
8169; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8170; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
8171; SSE-NEXT:    movdqa %xmm4, %xmm2
8172; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15]
8173; SSE-NEXT:    movdqa %xmm0, %xmm3
8174; SSE-NEXT:    pandn %xmm2, %xmm3
8175; SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
8176; SSE-NEXT:    pand %xmm0, %xmm4
8177; SSE-NEXT:    por %xmm3, %xmm4
8178; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[0,1,2,1]
8179; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
8180; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
8181; SSE-NEXT:    packuswb %xmm2, %xmm3
8182; SSE-NEXT:    movdqa %xmm6, %xmm4
8183; SSE-NEXT:    pandn %xmm3, %xmm4
8184; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm12[0,2,1,3]
8185; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
8186; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,3,3]
8187; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7]
8188; SSE-NEXT:    packuswb %xmm2, %xmm2
8189; SSE-NEXT:    pand %xmm6, %xmm2
8190; SSE-NEXT:    por %xmm2, %xmm4
8191; SSE-NEXT:    pand %xmm13, %xmm4
8192; SSE-NEXT:    por %xmm1, %xmm4
8193; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8194; SSE-NEXT:    movdqa %xmm5, %xmm1
8195; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8196; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8197; SSE-NEXT:    pand %xmm5, %xmm2
8198; SSE-NEXT:    por %xmm1, %xmm2
8199; SSE-NEXT:    movdqa %xmm2, %xmm1
8200; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
8201; SSE-NEXT:    movdqa %xmm14, %xmm3
8202; SSE-NEXT:    pandn %xmm1, %xmm3
8203; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15]
8204; SSE-NEXT:    pand %xmm14, %xmm2
8205; SSE-NEXT:    por %xmm3, %xmm2
8206; SSE-NEXT:    packuswb {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8207; SSE-NEXT:    movdqa %xmm10, %xmm3
8208; SSE-NEXT:    pandn %xmm1, %xmm3
8209; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,3,2,3]
8210; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
8211; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
8212; SSE-NEXT:    packuswb %xmm1, %xmm1
8213; SSE-NEXT:    pand %xmm10, %xmm1
8214; SSE-NEXT:    por %xmm1, %xmm3
8215; SSE-NEXT:    movdqa %xmm13, %xmm1
8216; SSE-NEXT:    pandn %xmm3, %xmm1
8217; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8218; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8219; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8220; SSE-NEXT:    por %xmm2, %xmm4
8221; SSE-NEXT:    movdqa %xmm4, %xmm2
8222; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15]
8223; SSE-NEXT:    movdqa %xmm14, %xmm3
8224; SSE-NEXT:    pandn %xmm2, %xmm3
8225; SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
8226; SSE-NEXT:    pand %xmm14, %xmm4
8227; SSE-NEXT:    por %xmm3, %xmm4
8228; SSE-NEXT:    movdqa %xmm4, %xmm5
8229; SSE-NEXT:    pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8230; SSE-NEXT:    # xmm2 = mem[1,3,2,3]
8231; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8232; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
8233; SSE-NEXT:    movdqa %xmm4, %xmm2
8234; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15]
8235; SSE-NEXT:    movdqa %xmm0, %xmm3
8236; SSE-NEXT:    pandn %xmm2, %xmm3
8237; SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
8238; SSE-NEXT:    pand %xmm0, %xmm4
8239; SSE-NEXT:    por %xmm3, %xmm4
8240; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[0,1,2,1]
8241; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
8242; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
8243; SSE-NEXT:    packuswb %xmm2, %xmm3
8244; SSE-NEXT:    movdqa %xmm6, %xmm4
8245; SSE-NEXT:    pandn %xmm3, %xmm4
8246; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm5[0,2,1,3]
8247; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
8248; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,3,3]
8249; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7]
8250; SSE-NEXT:    packuswb %xmm2, %xmm2
8251; SSE-NEXT:    pand %xmm6, %xmm2
8252; SSE-NEXT:    por %xmm2, %xmm4
8253; SSE-NEXT:    pand %xmm13, %xmm4
8254; SSE-NEXT:    por %xmm1, %xmm4
8255; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8256; SSE-NEXT:    movdqa %xmm15, %xmm1
8257; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8258; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8259; SSE-NEXT:    pand %xmm15, %xmm2
8260; SSE-NEXT:    por %xmm1, %xmm2
8261; SSE-NEXT:    movdqa %xmm2, %xmm1
8262; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
8263; SSE-NEXT:    movdqa %xmm14, %xmm3
8264; SSE-NEXT:    pandn %xmm1, %xmm3
8265; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15]
8266; SSE-NEXT:    pand %xmm14, %xmm2
8267; SSE-NEXT:    por %xmm3, %xmm2
8268; SSE-NEXT:    packuswb {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8269; SSE-NEXT:    movdqa %xmm10, %xmm3
8270; SSE-NEXT:    pandn %xmm1, %xmm3
8271; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,3,2,3]
8272; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
8273; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
8274; SSE-NEXT:    packuswb %xmm1, %xmm1
8275; SSE-NEXT:    pand %xmm10, %xmm1
8276; SSE-NEXT:    por %xmm1, %xmm3
8277; SSE-NEXT:    movdqa %xmm13, %xmm1
8278; SSE-NEXT:    pandn %xmm3, %xmm1
8279; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8280; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8281; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8282; SSE-NEXT:    por %xmm2, %xmm4
8283; SSE-NEXT:    movdqa %xmm4, %xmm2
8284; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15]
8285; SSE-NEXT:    movdqa %xmm14, %xmm3
8286; SSE-NEXT:    pandn %xmm2, %xmm3
8287; SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
8288; SSE-NEXT:    pand %xmm14, %xmm4
8289; SSE-NEXT:    por %xmm3, %xmm4
8290; SSE-NEXT:    pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8291; SSE-NEXT:    # xmm2 = mem[1,3,2,3]
8292; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8293; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
8294; SSE-NEXT:    movdqa %xmm3, %xmm2
8295; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15]
8296; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
8297; SSE-NEXT:    pand %xmm0, %xmm3
8298; SSE-NEXT:    pandn %xmm2, %xmm0
8299; SSE-NEXT:    por %xmm3, %xmm0
8300; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
8301; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
8302; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
8303; SSE-NEXT:    packuswb %xmm2, %xmm0
8304; SSE-NEXT:    movdqa %xmm6, %xmm2
8305; SSE-NEXT:    pandn %xmm0, %xmm2
8306; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,2,1,3]
8307; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
8308; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,3,3]
8309; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
8310; SSE-NEXT:    packuswb %xmm0, %xmm0
8311; SSE-NEXT:    pand %xmm6, %xmm0
8312; SSE-NEXT:    por %xmm0, %xmm2
8313; SSE-NEXT:    pand %xmm13, %xmm2
8314; SSE-NEXT:    por %xmm1, %xmm2
8315; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8316; SSE-NEXT:    movdqa %xmm15, %xmm9
8317; SSE-NEXT:    movdqa %xmm15, %xmm0
8318; SSE-NEXT:    pandn %xmm7, %xmm0
8319; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8320; SSE-NEXT:    pand %xmm15, %xmm1
8321; SSE-NEXT:    por %xmm0, %xmm1
8322; SSE-NEXT:    movdqa %xmm1, %xmm0
8323; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
8324; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
8325; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
8326; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
8327; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
8328; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6]
8329; SSE-NEXT:    psrlq $48, %xmm0
8330; SSE-NEXT:    packuswb %xmm0, %xmm1
8331; SSE-NEXT:    movdqa %xmm6, %xmm0
8332; SSE-NEXT:    pandn %xmm1, %xmm0
8333; SSE-NEXT:    movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,65535,0,65535,65535]
8334; SSE-NEXT:    movdqa %xmm12, %xmm1
8335; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8336; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8337; SSE-NEXT:    pand %xmm12, %xmm2
8338; SSE-NEXT:    por %xmm1, %xmm2
8339; SSE-NEXT:    movdqa %xmm2, %xmm1
8340; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
8341; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [65535,0,65535,0,65535,65535,65535,65535]
8342; SSE-NEXT:    movdqa %xmm5, %xmm3
8343; SSE-NEXT:    pandn %xmm1, %xmm3
8344; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15]
8345; SSE-NEXT:    pand %xmm5, %xmm2
8346; SSE-NEXT:    por %xmm3, %xmm2
8347; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[3,2,1,0,4,5,6,7]
8348; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
8349; SSE-NEXT:    packuswb %xmm1, %xmm1
8350; SSE-NEXT:    pand %xmm6, %xmm1
8351; SSE-NEXT:    por %xmm0, %xmm1
8352; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8353; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8354; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8355; SSE-NEXT:    por %xmm0, %xmm3
8356; SSE-NEXT:    movdqa %xmm3, %xmm0
8357; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
8358; SSE-NEXT:    movdqa %xmm14, %xmm2
8359; SSE-NEXT:    pandn %xmm0, %xmm2
8360; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
8361; SSE-NEXT:    pand %xmm14, %xmm3
8362; SSE-NEXT:    por %xmm2, %xmm3
8363; SSE-NEXT:    pshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8364; SSE-NEXT:    # xmm0 = mem[2,1,2,3,4,5,6,7]
8365; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
8366; SSE-NEXT:    packuswb %xmm0, %xmm0
8367; SSE-NEXT:    movdqa %xmm10, %xmm7
8368; SSE-NEXT:    movdqa %xmm10, %xmm2
8369; SSE-NEXT:    pandn %xmm0, %xmm2
8370; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm3[0,2,1,0,4,5,6,7]
8371; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
8372; SSE-NEXT:    packuswb %xmm0, %xmm0
8373; SSE-NEXT:    pand %xmm10, %xmm0
8374; SSE-NEXT:    por %xmm0, %xmm2
8375; SSE-NEXT:    movdqa %xmm13, %xmm10
8376; SSE-NEXT:    movdqa %xmm13, %xmm0
8377; SSE-NEXT:    pandn %xmm2, %xmm0
8378; SSE-NEXT:    pand %xmm13, %xmm1
8379; SSE-NEXT:    por %xmm1, %xmm0
8380; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8381; SSE-NEXT:    movdqa %xmm15, %xmm0
8382; SSE-NEXT:    pandn %xmm11, %xmm0
8383; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8384; SSE-NEXT:    pand %xmm15, %xmm1
8385; SSE-NEXT:    por %xmm0, %xmm1
8386; SSE-NEXT:    movdqa %xmm1, %xmm0
8387; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
8388; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
8389; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
8390; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
8391; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
8392; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6]
8393; SSE-NEXT:    psrlq $48, %xmm0
8394; SSE-NEXT:    packuswb %xmm0, %xmm1
8395; SSE-NEXT:    movdqa %xmm6, %xmm0
8396; SSE-NEXT:    pandn %xmm1, %xmm0
8397; SSE-NEXT:    movdqa %xmm12, %xmm1
8398; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8399; SSE-NEXT:    pandn %xmm15, %xmm1
8400; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8401; SSE-NEXT:    movdqa %xmm4, %xmm2
8402; SSE-NEXT:    pand %xmm12, %xmm2
8403; SSE-NEXT:    por %xmm1, %xmm2
8404; SSE-NEXT:    movdqa %xmm2, %xmm1
8405; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
8406; SSE-NEXT:    movdqa %xmm5, %xmm3
8407; SSE-NEXT:    pandn %xmm1, %xmm3
8408; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15]
8409; SSE-NEXT:    pand %xmm5, %xmm2
8410; SSE-NEXT:    por %xmm3, %xmm2
8411; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[3,2,1,0,4,5,6,7]
8412; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
8413; SSE-NEXT:    packuswb %xmm1, %xmm1
8414; SSE-NEXT:    pand %xmm6, %xmm1
8415; SSE-NEXT:    por %xmm0, %xmm1
8416; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8417; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8418; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8419; SSE-NEXT:    por %xmm0, %xmm3
8420; SSE-NEXT:    movdqa %xmm3, %xmm0
8421; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
8422; SSE-NEXT:    movdqa %xmm14, %xmm2
8423; SSE-NEXT:    pandn %xmm0, %xmm2
8424; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
8425; SSE-NEXT:    pand %xmm14, %xmm3
8426; SSE-NEXT:    por %xmm2, %xmm3
8427; SSE-NEXT:    pshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8428; SSE-NEXT:    # xmm0 = mem[2,1,2,3,4,5,6,7]
8429; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
8430; SSE-NEXT:    packuswb %xmm0, %xmm0
8431; SSE-NEXT:    movdqa %xmm7, %xmm2
8432; SSE-NEXT:    pandn %xmm0, %xmm2
8433; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm3[0,2,1,0,4,5,6,7]
8434; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
8435; SSE-NEXT:    packuswb %xmm0, %xmm0
8436; SSE-NEXT:    pand %xmm7, %xmm0
8437; SSE-NEXT:    por %xmm0, %xmm2
8438; SSE-NEXT:    movdqa %xmm13, %xmm0
8439; SSE-NEXT:    pandn %xmm2, %xmm0
8440; SSE-NEXT:    pand %xmm13, %xmm1
8441; SSE-NEXT:    por %xmm1, %xmm0
8442; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8443; SSE-NEXT:    movdqa %xmm9, %xmm0
8444; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8445; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8446; SSE-NEXT:    pand %xmm9, %xmm1
8447; SSE-NEXT:    movdqa %xmm9, %xmm13
8448; SSE-NEXT:    por %xmm0, %xmm1
8449; SSE-NEXT:    movdqa %xmm1, %xmm0
8450; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
8451; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
8452; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
8453; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
8454; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
8455; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6]
8456; SSE-NEXT:    psrlq $48, %xmm0
8457; SSE-NEXT:    packuswb %xmm0, %xmm1
8458; SSE-NEXT:    movdqa %xmm6, %xmm0
8459; SSE-NEXT:    pandn %xmm1, %xmm0
8460; SSE-NEXT:    movdqa %xmm12, %xmm1
8461; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8462; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
8463; SSE-NEXT:    movdqa %xmm9, %xmm2
8464; SSE-NEXT:    pand %xmm12, %xmm2
8465; SSE-NEXT:    por %xmm1, %xmm2
8466; SSE-NEXT:    movdqa %xmm2, %xmm1
8467; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
8468; SSE-NEXT:    movdqa %xmm5, %xmm3
8469; SSE-NEXT:    pandn %xmm1, %xmm3
8470; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15]
8471; SSE-NEXT:    pand %xmm5, %xmm2
8472; SSE-NEXT:    por %xmm3, %xmm2
8473; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[3,2,1,0,4,5,6,7]
8474; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
8475; SSE-NEXT:    packuswb %xmm1, %xmm1
8476; SSE-NEXT:    pand %xmm6, %xmm1
8477; SSE-NEXT:    por %xmm0, %xmm1
8478; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
8479; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8480; SSE-NEXT:    pandn %xmm11, %xmm0
8481; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8482; SSE-NEXT:    por %xmm0, %xmm3
8483; SSE-NEXT:    movdqa %xmm3, %xmm0
8484; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
8485; SSE-NEXT:    movdqa %xmm14, %xmm2
8486; SSE-NEXT:    pandn %xmm0, %xmm2
8487; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
8488; SSE-NEXT:    pand %xmm14, %xmm3
8489; SSE-NEXT:    por %xmm2, %xmm3
8490; SSE-NEXT:    pshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8491; SSE-NEXT:    # xmm0 = mem[2,1,2,3,4,5,6,7]
8492; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
8493; SSE-NEXT:    packuswb %xmm0, %xmm0
8494; SSE-NEXT:    movdqa %xmm7, %xmm2
8495; SSE-NEXT:    pandn %xmm0, %xmm2
8496; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm3[0,2,1,0,4,5,6,7]
8497; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
8498; SSE-NEXT:    packuswb %xmm0, %xmm0
8499; SSE-NEXT:    pand %xmm7, %xmm0
8500; SSE-NEXT:    por %xmm0, %xmm2
8501; SSE-NEXT:    movdqa %xmm10, %xmm0
8502; SSE-NEXT:    pandn %xmm2, %xmm0
8503; SSE-NEXT:    pand %xmm10, %xmm1
8504; SSE-NEXT:    por %xmm1, %xmm0
8505; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8506; SSE-NEXT:    movdqa %xmm13, %xmm0
8507; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8508; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8509; SSE-NEXT:    pand %xmm13, %xmm2
8510; SSE-NEXT:    por %xmm0, %xmm2
8511; SSE-NEXT:    movdqa %xmm2, %xmm0
8512; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
8513; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
8514; SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
8515; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,3,2,3,4,5,6,7]
8516; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
8517; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6]
8518; SSE-NEXT:    psrlq $48, %xmm0
8519; SSE-NEXT:    packuswb %xmm0, %xmm1
8520; SSE-NEXT:    movdqa %xmm12, %xmm0
8521; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
8522; SSE-NEXT:    pandn %xmm13, %xmm0
8523; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8524; SSE-NEXT:    pand %xmm12, %xmm2
8525; SSE-NEXT:    por %xmm0, %xmm2
8526; SSE-NEXT:    movdqa %xmm2, %xmm0
8527; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
8528; SSE-NEXT:    movdqa %xmm5, %xmm3
8529; SSE-NEXT:    pandn %xmm0, %xmm3
8530; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15]
8531; SSE-NEXT:    pand %xmm5, %xmm2
8532; SSE-NEXT:    por %xmm3, %xmm2
8533; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm2[3,2,1,0,4,5,6,7]
8534; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
8535; SSE-NEXT:    packuswb %xmm0, %xmm0
8536; SSE-NEXT:    pand %xmm6, %xmm0
8537; SSE-NEXT:    pandn %xmm1, %xmm6
8538; SSE-NEXT:    por %xmm6, %xmm0
8539; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8540; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8541; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8542; SSE-NEXT:    por %xmm1, %xmm3
8543; SSE-NEXT:    movdqa %xmm3, %xmm1
8544; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15]
8545; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
8546; SSE-NEXT:    pand %xmm14, %xmm3
8547; SSE-NEXT:    pandn %xmm1, %xmm14
8548; SSE-NEXT:    por %xmm3, %xmm14
8549; SSE-NEXT:    pshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8550; SSE-NEXT:    # xmm1 = mem[2,1,2,3,4,5,6,7]
8551; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
8552; SSE-NEXT:    packuswb %xmm1, %xmm1
8553; SSE-NEXT:    movdqa %xmm7, %xmm2
8554; SSE-NEXT:    pandn %xmm1, %xmm2
8555; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm14[0,2,1,0,4,5,6,7]
8556; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,6,7]
8557; SSE-NEXT:    packuswb %xmm1, %xmm1
8558; SSE-NEXT:    pand %xmm7, %xmm1
8559; SSE-NEXT:    por %xmm1, %xmm2
8560; SSE-NEXT:    movdqa %xmm10, %xmm1
8561; SSE-NEXT:    pandn %xmm2, %xmm1
8562; SSE-NEXT:    pand %xmm10, %xmm0
8563; SSE-NEXT:    por %xmm0, %xmm1
8564; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8565; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [65535,65535,0,65535,65535,0,65535,65535]
8566; SSE-NEXT:    movdqa %xmm8, %xmm0
8567; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8568; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8569; SSE-NEXT:    pand %xmm8, %xmm1
8570; SSE-NEXT:    por %xmm0, %xmm1
8571; SSE-NEXT:    movdqa %xmm1, %xmm0
8572; SSE-NEXT:    pxor %xmm2, %xmm2
8573; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
8574; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
8575; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
8576; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
8577; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
8578; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8579; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8580; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
8581; SSE-NEXT:    pxor %xmm6, %xmm6
8582; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8583; SSE-NEXT:    pandn %xmm0, %xmm3
8584; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8585; SSE-NEXT:    por %xmm3, %xmm2
8586; SSE-NEXT:    packuswb %xmm0, %xmm2
8587; SSE-NEXT:    packuswb %xmm1, %xmm1
8588; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,3,3]
8589; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
8590; SSE-NEXT:    movdqa %xmm12, %xmm1
8591; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8592; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
8593; SSE-NEXT:    movdqa %xmm14, %xmm2
8594; SSE-NEXT:    pand %xmm12, %xmm2
8595; SSE-NEXT:    por %xmm1, %xmm2
8596; SSE-NEXT:    movdqa %xmm2, %xmm1
8597; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
8598; SSE-NEXT:    movdqa %xmm5, %xmm3
8599; SSE-NEXT:    pandn %xmm1, %xmm3
8600; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15]
8601; SSE-NEXT:    pand %xmm5, %xmm2
8602; SSE-NEXT:    por %xmm3, %xmm2
8603; SSE-NEXT:    pshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8604; SSE-NEXT:    # xmm1 = mem[0,1,2,1]
8605; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
8606; SSE-NEXT:    packuswb %xmm1, %xmm1
8607; SSE-NEXT:    movdqa %xmm7, %xmm3
8608; SSE-NEXT:    pandn %xmm1, %xmm3
8609; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3]
8610; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
8611; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
8612; SSE-NEXT:    packuswb %xmm1, %xmm1
8613; SSE-NEXT:    pand %xmm7, %xmm1
8614; SSE-NEXT:    por %xmm1, %xmm3
8615; SSE-NEXT:    movdqa %xmm10, %xmm1
8616; SSE-NEXT:    pandn %xmm3, %xmm1
8617; SSE-NEXT:    andps %xmm10, %xmm0
8618; SSE-NEXT:    por %xmm0, %xmm1
8619; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8620; SSE-NEXT:    movdqa %xmm8, %xmm0
8621; SSE-NEXT:    pandn %xmm15, %xmm0
8622; SSE-NEXT:    pand %xmm8, %xmm4
8623; SSE-NEXT:    por %xmm0, %xmm4
8624; SSE-NEXT:    movdqa %xmm4, %xmm0
8625; SSE-NEXT:    pxor %xmm1, %xmm1
8626; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
8627; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
8628; SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
8629; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[2,1,2,3]
8630; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
8631; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
8632; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8633; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
8634; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8635; SSE-NEXT:    pandn %xmm0, %xmm4
8636; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8637; SSE-NEXT:    por %xmm4, %xmm3
8638; SSE-NEXT:    packuswb %xmm0, %xmm3
8639; SSE-NEXT:    packuswb %xmm2, %xmm2
8640; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,1,3,3]
8641; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
8642; SSE-NEXT:    movdqa %xmm12, %xmm2
8643; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8644; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8645; SSE-NEXT:    movdqa %xmm15, %xmm3
8646; SSE-NEXT:    pand %xmm12, %xmm3
8647; SSE-NEXT:    por %xmm2, %xmm3
8648; SSE-NEXT:    movdqa %xmm3, %xmm2
8649; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
8650; SSE-NEXT:    movdqa %xmm5, %xmm4
8651; SSE-NEXT:    pandn %xmm2, %xmm4
8652; SSE-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
8653; SSE-NEXT:    pand %xmm5, %xmm3
8654; SSE-NEXT:    por %xmm4, %xmm3
8655; SSE-NEXT:    pshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8656; SSE-NEXT:    # xmm2 = mem[0,1,2,1]
8657; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
8658; SSE-NEXT:    packuswb %xmm2, %xmm2
8659; SSE-NEXT:    movdqa %xmm7, %xmm4
8660; SSE-NEXT:    pandn %xmm2, %xmm4
8661; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,1,0,3]
8662; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7]
8663; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
8664; SSE-NEXT:    packuswb %xmm2, %xmm2
8665; SSE-NEXT:    pand %xmm7, %xmm2
8666; SSE-NEXT:    por %xmm2, %xmm4
8667; SSE-NEXT:    movdqa %xmm10, %xmm1
8668; SSE-NEXT:    pandn %xmm4, %xmm1
8669; SSE-NEXT:    andps %xmm10, %xmm0
8670; SSE-NEXT:    por %xmm0, %xmm1
8671; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8672; SSE-NEXT:    movdqa %xmm8, %xmm0
8673; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8674; SSE-NEXT:    pand %xmm8, %xmm9
8675; SSE-NEXT:    por %xmm0, %xmm9
8676; SSE-NEXT:    movdqa %xmm9, %xmm0
8677; SSE-NEXT:    pxor %xmm1, %xmm1
8678; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
8679; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
8680; SSE-NEXT:    punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3],xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7]
8681; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm9[2,1,2,3]
8682; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
8683; SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
8684; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8685; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
8686; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8687; SSE-NEXT:    pandn %xmm0, %xmm4
8688; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8689; SSE-NEXT:    por %xmm4, %xmm2
8690; SSE-NEXT:    packuswb %xmm0, %xmm2
8691; SSE-NEXT:    packuswb %xmm3, %xmm3
8692; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,3,3]
8693; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3]
8694; SSE-NEXT:    movdqa %xmm12, %xmm3
8695; SSE-NEXT:    pandn %xmm11, %xmm3
8696; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8697; SSE-NEXT:    movdqa %xmm2, %xmm4
8698; SSE-NEXT:    pand %xmm12, %xmm4
8699; SSE-NEXT:    por %xmm3, %xmm4
8700; SSE-NEXT:    movdqa %xmm4, %xmm3
8701; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
8702; SSE-NEXT:    movdqa %xmm5, %xmm6
8703; SSE-NEXT:    pandn %xmm3, %xmm6
8704; SSE-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
8705; SSE-NEXT:    pand %xmm5, %xmm4
8706; SSE-NEXT:    por %xmm6, %xmm4
8707; SSE-NEXT:    pshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
8708; SSE-NEXT:    # xmm3 = mem[0,1,2,1]
8709; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
8710; SSE-NEXT:    packuswb %xmm3, %xmm3
8711; SSE-NEXT:    movdqa %xmm7, %xmm6
8712; SSE-NEXT:    pandn %xmm3, %xmm6
8713; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[0,1,0,3]
8714; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7]
8715; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7]
8716; SSE-NEXT:    packuswb %xmm3, %xmm3
8717; SSE-NEXT:    pand %xmm7, %xmm3
8718; SSE-NEXT:    por %xmm3, %xmm6
8719; SSE-NEXT:    movdqa %xmm10, %xmm1
8720; SSE-NEXT:    pandn %xmm6, %xmm1
8721; SSE-NEXT:    andps %xmm10, %xmm0
8722; SSE-NEXT:    por %xmm0, %xmm1
8723; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8724; SSE-NEXT:    movdqa %xmm8, %xmm0
8725; SSE-NEXT:    pandn %xmm13, %xmm0
8726; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8727; SSE-NEXT:    pand %xmm8, %xmm4
8728; SSE-NEXT:    por %xmm0, %xmm4
8729; SSE-NEXT:    movdqa %xmm4, %xmm0
8730; SSE-NEXT:    pxor %xmm1, %xmm1
8731; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
8732; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
8733; SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
8734; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
8735; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
8736; SSE-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
8737; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8738; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
8739; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
8740; SSE-NEXT:    pandn %xmm0, %xmm6
8741; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8742; SSE-NEXT:    por %xmm6, %xmm3
8743; SSE-NEXT:    packuswb %xmm0, %xmm3
8744; SSE-NEXT:    packuswb %xmm4, %xmm4
8745; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,1,3,3]
8746; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3]
8747; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8748; SSE-NEXT:    movdqa %xmm12, %xmm3
8749; SSE-NEXT:    pand %xmm12, %xmm4
8750; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
8751; SSE-NEXT:    por %xmm4, %xmm3
8752; SSE-NEXT:    movdqa %xmm3, %xmm4
8753; SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
8754; SSE-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
8755; SSE-NEXT:    pxor %xmm12, %xmm12
8756; SSE-NEXT:    pand %xmm5, %xmm3
8757; SSE-NEXT:    pandn %xmm4, %xmm5
8758; SSE-NEXT:    por %xmm3, %xmm5
8759; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[0,1,0,3]
8760; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7]
8761; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7]
8762; SSE-NEXT:    packuswb %xmm4, %xmm4
8763; SSE-NEXT:    pand %xmm7, %xmm4
8764; SSE-NEXT:    pshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
8765; SSE-NEXT:    # xmm5 = mem[0,1,2,1]
8766; SSE-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7]
8767; SSE-NEXT:    packuswb %xmm5, %xmm5
8768; SSE-NEXT:    pandn %xmm5, %xmm7
8769; SSE-NEXT:    por %xmm4, %xmm7
8770; SSE-NEXT:    movdqa %xmm10, %xmm3
8771; SSE-NEXT:    pandn %xmm7, %xmm3
8772; SSE-NEXT:    andps %xmm10, %xmm0
8773; SSE-NEXT:    por %xmm0, %xmm3
8774; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8775; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8776; SSE-NEXT:    movdqa {{.*#+}} xmm13 = [65535,65535,65535,0,65535,65535,0,65535]
8777; SSE-NEXT:    pand %xmm13, %xmm4
8778; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
8779; SSE-NEXT:    movdqa %xmm4, %xmm6
8780; SSE-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm12[8],xmm6[9],xmm12[9],xmm6[10],xmm12[10],xmm6[11],xmm12[11],xmm6[12],xmm12[12],xmm6[13],xmm12[13],xmm6[14],xmm12[14],xmm6[15],xmm12[15]
8781; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,65535,0]
8782; SSE-NEXT:    movdqa %xmm0, %xmm7
8783; SSE-NEXT:    pandn %xmm6, %xmm7
8784; SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7]
8785; SSE-NEXT:    pand %xmm0, %xmm4
8786; SSE-NEXT:    por %xmm7, %xmm4
8787; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3]
8788; SSE-NEXT:    pshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,5,4,7,6]
8789; SSE-NEXT:    psrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
8790; SSE-NEXT:    packuswb %xmm6, %xmm7
8791; SSE-NEXT:    movdqa %xmm13, %xmm3
8792; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8793; SSE-NEXT:    pandn %xmm1, %xmm3
8794; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8795; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,3,2,3]
8796; SSE-NEXT:    pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
8797; SSE-NEXT:    # xmm6 = mem[0,2,2,3]
8798; SSE-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
8799; SSE-NEXT:    movdqa %xmm6, %xmm4
8800; SSE-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm12[8],xmm4[9],xmm12[9],xmm4[10],xmm12[10],xmm4[11],xmm12[11],xmm4[12],xmm12[12],xmm4[13],xmm12[13],xmm4[14],xmm12[14],xmm4[15],xmm12[15]
8801; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
8802; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
8803; SSE-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7]
8804; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[2,1,2,3]
8805; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7]
8806; SSE-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
8807; SSE-NEXT:    packuswb %xmm6, %xmm6
8808; SSE-NEXT:    movss {{.*#+}} xmm7 = xmm6[0],xmm7[1,2,3]
8809; SSE-NEXT:    movdqa %xmm8, %xmm1
8810; SSE-NEXT:    movdqa %xmm8, %xmm4
8811; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
8812; SSE-NEXT:    movdqa %xmm14, %xmm8
8813; SSE-NEXT:    pand %xmm1, %xmm8
8814; SSE-NEXT:    movdqa %xmm1, %xmm14
8815; SSE-NEXT:    por %xmm4, %xmm8
8816; SSE-NEXT:    movdqa %xmm8, %xmm4
8817; SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7]
8818; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,0,65535,0,65535,65535,65535]
8819; SSE-NEXT:    movdqa %xmm1, %xmm6
8820; SSE-NEXT:    pandn %xmm4, %xmm6
8821; SSE-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm12[8],xmm8[9],xmm12[9],xmm8[10],xmm12[10],xmm8[11],xmm12[11],xmm8[12],xmm12[12],xmm8[13],xmm12[13],xmm8[14],xmm12[14],xmm8[15],xmm12[15]
8822; SSE-NEXT:    pand %xmm1, %xmm8
8823; SSE-NEXT:    por %xmm6, %xmm8
8824; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8825; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
8826; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,1,0,3]
8827; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,7,6]
8828; SSE-NEXT:    packuswb %xmm4, %xmm4
8829; SSE-NEXT:    movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0]
8830; SSE-NEXT:    movdqa %xmm6, %xmm9
8831; SSE-NEXT:    pandn %xmm4, %xmm9
8832; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm8[2,1,0,3]
8833; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,0,3,2,4,5,6,7]
8834; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
8835; SSE-NEXT:    packuswb %xmm4, %xmm4
8836; SSE-NEXT:    pand %xmm6, %xmm4
8837; SSE-NEXT:    por %xmm4, %xmm9
8838; SSE-NEXT:    movdqa %xmm10, %xmm3
8839; SSE-NEXT:    pandn %xmm9, %xmm3
8840; SSE-NEXT:    andps %xmm10, %xmm7
8841; SSE-NEXT:    movdqa %xmm10, %xmm5
8842; SSE-NEXT:    por %xmm7, %xmm3
8843; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8844; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
8845; SSE-NEXT:    movdqa %xmm13, %xmm10
8846; SSE-NEXT:    pand %xmm13, %xmm7
8847; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
8848; SSE-NEXT:    movdqa %xmm7, %xmm8
8849; SSE-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm12[8],xmm8[9],xmm12[9],xmm8[10],xmm12[10],xmm8[11],xmm12[11],xmm8[12],xmm12[12],xmm8[13],xmm12[13],xmm8[14],xmm12[14],xmm8[15],xmm12[15]
8850; SSE-NEXT:    movdqa %xmm0, %xmm9
8851; SSE-NEXT:    pandn %xmm8, %xmm9
8852; SSE-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3],xmm7[4],xmm12[4],xmm7[5],xmm12[5],xmm7[6],xmm12[6],xmm7[7],xmm12[7]
8853; SSE-NEXT:    pand %xmm0, %xmm7
8854; SSE-NEXT:    por %xmm9, %xmm7
8855; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
8856; SSE-NEXT:    pshufhw {{.*#+}} xmm9 = xmm7[0,1,2,3,5,4,7,6]
8857; SSE-NEXT:    psrldq {{.*#+}} xmm8 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
8858; SSE-NEXT:    packuswb %xmm8, %xmm9
8859; SSE-NEXT:    movdqa %xmm13, %xmm4
8860; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8861; SSE-NEXT:    pandn %xmm3, %xmm4
8862; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8863; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm3[1,3,2,3]
8864; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8865; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm4[0,2,2,3]
8866; SSE-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
8867; SSE-NEXT:    movdqa %xmm8, %xmm7
8868; SSE-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm12[8],xmm7[9],xmm12[9],xmm7[10],xmm12[10],xmm7[11],xmm12[11],xmm7[12],xmm12[12],xmm7[13],xmm12[13],xmm7[14],xmm12[14],xmm7[15],xmm12[15]
8869; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
8870; SSE-NEXT:    pshuflw {{.*#+}} xmm7 = xmm7[0,2,2,3,4,5,6,7]
8871; SSE-NEXT:    punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7]
8872; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[2,1,2,3]
8873; SSE-NEXT:    pshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7]
8874; SSE-NEXT:    punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
8875; SSE-NEXT:    packuswb %xmm8, %xmm8
8876; SSE-NEXT:    movss {{.*#+}} xmm9 = xmm8[0],xmm9[1,2,3]
8877; SSE-NEXT:    movdqa %xmm14, %xmm7
8878; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
8879; SSE-NEXT:    movdqa %xmm15, %xmm8
8880; SSE-NEXT:    pand %xmm14, %xmm8
8881; SSE-NEXT:    por %xmm7, %xmm8
8882; SSE-NEXT:    movdqa %xmm8, %xmm7
8883; SSE-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3],xmm7[4],xmm12[4],xmm7[5],xmm12[5],xmm7[6],xmm12[6],xmm7[7],xmm12[7]
8884; SSE-NEXT:    movdqa %xmm1, %xmm13
8885; SSE-NEXT:    pandn %xmm7, %xmm13
8886; SSE-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm12[8],xmm8[9],xmm12[9],xmm8[10],xmm12[10],xmm8[11],xmm12[11],xmm8[12],xmm12[12],xmm8[13],xmm12[13],xmm8[14],xmm12[14],xmm8[15],xmm12[15]
8887; SSE-NEXT:    pand %xmm1, %xmm8
8888; SSE-NEXT:    por %xmm13, %xmm8
8889; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8890; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
8891; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm3[0,1,0,3]
8892; SSE-NEXT:    pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,7,6]
8893; SSE-NEXT:    packuswb %xmm7, %xmm7
8894; SSE-NEXT:    movdqa %xmm6, %xmm13
8895; SSE-NEXT:    pandn %xmm7, %xmm13
8896; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm8[2,1,0,3]
8897; SSE-NEXT:    pshuflw {{.*#+}} xmm7 = xmm7[0,0,3,2,4,5,6,7]
8898; SSE-NEXT:    pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5]
8899; SSE-NEXT:    packuswb %xmm7, %xmm7
8900; SSE-NEXT:    pand %xmm6, %xmm7
8901; SSE-NEXT:    por %xmm7, %xmm13
8902; SSE-NEXT:    movdqa %xmm5, %xmm7
8903; SSE-NEXT:    pandn %xmm13, %xmm7
8904; SSE-NEXT:    andps %xmm5, %xmm9
8905; SSE-NEXT:    por %xmm9, %xmm7
8906; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
8907; SSE-NEXT:    pand %xmm10, %xmm8
8908; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
8909; SSE-NEXT:    movdqa %xmm8, %xmm9
8910; SSE-NEXT:    punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15]
8911; SSE-NEXT:    movdqa %xmm0, %xmm13
8912; SSE-NEXT:    pandn %xmm9, %xmm13
8913; SSE-NEXT:    punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7]
8914; SSE-NEXT:    pand %xmm0, %xmm8
8915; SSE-NEXT:    por %xmm13, %xmm8
8916; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3]
8917; SSE-NEXT:    pshufhw {{.*#+}} xmm15 = xmm8[0,1,2,3,5,4,7,6]
8918; SSE-NEXT:    psrldq {{.*#+}} xmm9 = xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
8919; SSE-NEXT:    packuswb %xmm9, %xmm15
8920; SSE-NEXT:    movdqa %xmm10, %xmm13
8921; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8922; SSE-NEXT:    pandn %xmm3, %xmm13
8923; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm3[1,3,2,3]
8924; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8925; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm3[0,2,2,3]
8926; SSE-NEXT:    punpckldq {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
8927; SSE-NEXT:    movdqa %xmm9, %xmm8
8928; SSE-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm12[8],xmm8[9],xmm12[9],xmm8[10],xmm12[10],xmm8[11],xmm12[11],xmm8[12],xmm12[12],xmm8[13],xmm12[13],xmm8[14],xmm12[14],xmm8[15],xmm12[15]
8929; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[2,1,2,3]
8930; SSE-NEXT:    pshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7]
8931; SSE-NEXT:    punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3],xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7]
8932; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[2,1,2,3]
8933; SSE-NEXT:    pshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7]
8934; SSE-NEXT:    punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
8935; SSE-NEXT:    packuswb %xmm9, %xmm9
8936; SSE-NEXT:    movss {{.*#+}} xmm15 = xmm9[0],xmm15[1,2,3]
8937; SSE-NEXT:    movdqa %xmm14, %xmm8
8938; SSE-NEXT:    pandn %xmm11, %xmm8
8939; SSE-NEXT:    movdqa %xmm2, %xmm9
8940; SSE-NEXT:    pand %xmm14, %xmm9
8941; SSE-NEXT:    por %xmm8, %xmm9
8942; SSE-NEXT:    movdqa %xmm9, %xmm8
8943; SSE-NEXT:    punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7]
8944; SSE-NEXT:    movdqa %xmm1, %xmm11
8945; SSE-NEXT:    pandn %xmm8, %xmm11
8946; SSE-NEXT:    punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15]
8947; SSE-NEXT:    pand %xmm1, %xmm9
8948; SSE-NEXT:    por %xmm11, %xmm9
8949; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8950; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8951; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm2[0,1,0,3]
8952; SSE-NEXT:    pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,7,6]
8953; SSE-NEXT:    packuswb %xmm8, %xmm8
8954; SSE-NEXT:    movdqa %xmm6, %xmm11
8955; SSE-NEXT:    pandn %xmm8, %xmm11
8956; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm9[2,1,0,3]
8957; SSE-NEXT:    pshuflw {{.*#+}} xmm8 = xmm8[0,0,3,2,4,5,6,7]
8958; SSE-NEXT:    pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5]
8959; SSE-NEXT:    packuswb %xmm8, %xmm8
8960; SSE-NEXT:    pand %xmm6, %xmm8
8961; SSE-NEXT:    por %xmm8, %xmm11
8962; SSE-NEXT:    movdqa %xmm5, %xmm9
8963; SSE-NEXT:    pandn %xmm11, %xmm9
8964; SSE-NEXT:    andps %xmm5, %xmm15
8965; SSE-NEXT:    por %xmm15, %xmm9
8966; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
8967; SSE-NEXT:    pand %xmm10, %xmm8
8968; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
8969; SSE-NEXT:    movdqa %xmm8, %xmm11
8970; SSE-NEXT:    punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15]
8971; SSE-NEXT:    punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7]
8972; SSE-NEXT:    pand %xmm0, %xmm8
8973; SSE-NEXT:    pandn %xmm11, %xmm0
8974; SSE-NEXT:    por %xmm8, %xmm0
8975; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
8976; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
8977; SSE-NEXT:    psrldq {{.*#+}} xmm11 = xmm11[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
8978; SSE-NEXT:    packuswb %xmm11, %xmm0
8979; SSE-NEXT:    movdqa %xmm10, %xmm2
8980; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8981; SSE-NEXT:    pand %xmm10, %xmm15
8982; SSE-NEXT:    pand %xmm10, %xmm4
8983; SSE-NEXT:    pand %xmm10, %xmm3
8984; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8985; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8986; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm3[0,2,2,3]
8987; SSE-NEXT:    pand %xmm10, %xmm3
8988; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8989; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8990; SSE-NEXT:    pandn %xmm3, %xmm2
8991; SSE-NEXT:    pshufd {{.*#+}} xmm11 = xmm3[1,3,2,3]
8992; SSE-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1]
8993; SSE-NEXT:    movdqa %xmm8, %xmm11
8994; SSE-NEXT:    punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15]
8995; SSE-NEXT:    pshufd {{.*#+}} xmm11 = xmm11[2,1,2,3]
8996; SSE-NEXT:    pshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7]
8997; SSE-NEXT:    punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7]
8998; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[2,1,2,3]
8999; SSE-NEXT:    pshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7]
9000; SSE-NEXT:    punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3]
9001; SSE-NEXT:    packuswb %xmm8, %xmm8
9002; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm8[0],xmm0[1,2,3]
9003; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
9004; SSE-NEXT:    movdqa %xmm14, %xmm3
9005; SSE-NEXT:    pand %xmm14, %xmm8
9006; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
9007; SSE-NEXT:    por %xmm8, %xmm3
9008; SSE-NEXT:    movdqa %xmm3, %xmm8
9009; SSE-NEXT:    punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7]
9010; SSE-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm12[8],xmm3[9],xmm12[9],xmm3[10],xmm12[10],xmm3[11],xmm12[11],xmm3[12],xmm12[12],xmm3[13],xmm12[13],xmm3[14],xmm12[14],xmm3[15],xmm12[15]
9011; SSE-NEXT:    pand %xmm1, %xmm3
9012; SSE-NEXT:    pandn %xmm8, %xmm1
9013; SSE-NEXT:    por %xmm3, %xmm1
9014; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9015; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
9016; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm3[0,1,0,3]
9017; SSE-NEXT:    pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,7,6]
9018; SSE-NEXT:    packuswb %xmm8, %xmm8
9019; SSE-NEXT:    movdqa %xmm6, %xmm14
9020; SSE-NEXT:    pandn %xmm8, %xmm14
9021; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3]
9022; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,3,2,4,5,6,7]
9023; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
9024; SSE-NEXT:    packuswb %xmm1, %xmm1
9025; SSE-NEXT:    pand %xmm6, %xmm1
9026; SSE-NEXT:    por %xmm1, %xmm14
9027; SSE-NEXT:    movdqa %xmm5, %xmm11
9028; SSE-NEXT:    pandn %xmm14, %xmm11
9029; SSE-NEXT:    andps %xmm5, %xmm0
9030; SSE-NEXT:    por %xmm0, %xmm11
9031; SSE-NEXT:    movdqa %xmm15, %xmm1
9032; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9033; SSE-NEXT:    movdqa %xmm1, %xmm0
9034; SSE-NEXT:    pxor %xmm3, %xmm3
9035; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
9036; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7]
9037; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
9038; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
9039; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7]
9040; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
9041; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9042; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
9043; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
9044; SSE-NEXT:    pandn %xmm1, %xmm10
9045; SSE-NEXT:    movdqa %xmm1, %xmm8
9046; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9047; SSE-NEXT:    por %xmm10, %xmm1
9048; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
9049; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
9050; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
9051; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
9052; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[3,3,3,3]
9053; SSE-NEXT:    packuswb %xmm8, %xmm1
9054; SSE-NEXT:    packuswb %xmm0, %xmm0
9055; SSE-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
9056; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9057; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9058; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
9059; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
9060; SSE-NEXT:    packuswb %xmm0, %xmm0
9061; SSE-NEXT:    movdqa %xmm6, %xmm8
9062; SSE-NEXT:    pandn %xmm0, %xmm8
9063; SSE-NEXT:    pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9064; SSE-NEXT:    # xmm0 = mem[1,3,2,3]
9065; SSE-NEXT:    pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
9066; SSE-NEXT:    # xmm14 = mem[0,2,2,3]
9067; SSE-NEXT:    punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1]
9068; SSE-NEXT:    movdqa %xmm14, %xmm0
9069; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
9070; SSE-NEXT:    movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,65535,0,65535,65535]
9071; SSE-NEXT:    movdqa %xmm10, %xmm15
9072; SSE-NEXT:    pandn %xmm0, %xmm15
9073; SSE-NEXT:    punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm3[8],xmm14[9],xmm3[9],xmm14[10],xmm3[10],xmm14[11],xmm3[11],xmm14[12],xmm3[12],xmm14[13],xmm3[13],xmm14[14],xmm3[14],xmm14[15],xmm3[15]
9074; SSE-NEXT:    pand %xmm10, %xmm14
9075; SSE-NEXT:    por %xmm15, %xmm14
9076; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm14[2,1,1,1]
9077; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7]
9078; SSE-NEXT:    packuswb %xmm0, %xmm14
9079; SSE-NEXT:    pand %xmm6, %xmm14
9080; SSE-NEXT:    por %xmm8, %xmm14
9081; SSE-NEXT:    movdqa %xmm5, %xmm3
9082; SSE-NEXT:    pandn %xmm14, %xmm3
9083; SSE-NEXT:    andps %xmm5, %xmm1
9084; SSE-NEXT:    por %xmm1, %xmm3
9085; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
9086; SSE-NEXT:    movdqa %xmm4, %xmm1
9087; SSE-NEXT:    pxor %xmm0, %xmm0
9088; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
9089; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7]
9090; SSE-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
9091; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm4[0,1,2,1]
9092; SSE-NEXT:    pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7]
9093; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
9094; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9095; SSE-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
9096; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
9097; SSE-NEXT:    pandn %xmm4, %xmm12
9098; SSE-NEXT:    movdqa (%rsp), %xmm8 # 16-byte Reload
9099; SSE-NEXT:    por %xmm12, %xmm8
9100; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,2,1,3]
9101; SSE-NEXT:    pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,7]
9102; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0]
9103; SSE-NEXT:    pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,5]
9104; SSE-NEXT:    pshufd {{.*#+}} xmm12 = xmm4[3,3,3,3]
9105; SSE-NEXT:    packuswb %xmm12, %xmm8
9106; SSE-NEXT:    packuswb %xmm1, %xmm1
9107; SSE-NEXT:    movss {{.*#+}} xmm8 = xmm1[0],xmm8[1,2,3]
9108; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9109; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9110; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
9111; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
9112; SSE-NEXT:    packuswb %xmm1, %xmm1
9113; SSE-NEXT:    movdqa %xmm6, %xmm12
9114; SSE-NEXT:    pandn %xmm1, %xmm12
9115; SSE-NEXT:    pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9116; SSE-NEXT:    # xmm1 = mem[1,3,2,3]
9117; SSE-NEXT:    pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
9118; SSE-NEXT:    # xmm14 = mem[0,2,2,3]
9119; SSE-NEXT:    punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1]
9120; SSE-NEXT:    movdqa %xmm14, %xmm1
9121; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
9122; SSE-NEXT:    movdqa %xmm10, %xmm15
9123; SSE-NEXT:    pandn %xmm1, %xmm15
9124; SSE-NEXT:    punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm0[8],xmm14[9],xmm0[9],xmm14[10],xmm0[10],xmm14[11],xmm0[11],xmm14[12],xmm0[12],xmm14[13],xmm0[13],xmm14[14],xmm0[14],xmm14[15],xmm0[15]
9125; SSE-NEXT:    pand %xmm10, %xmm14
9126; SSE-NEXT:    por %xmm15, %xmm14
9127; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm14[2,1,1,1]
9128; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
9129; SSE-NEXT:    packuswb %xmm1, %xmm1
9130; SSE-NEXT:    pand %xmm6, %xmm1
9131; SSE-NEXT:    por %xmm12, %xmm1
9132; SSE-NEXT:    movdqa %xmm5, %xmm12
9133; SSE-NEXT:    pandn %xmm1, %xmm12
9134; SSE-NEXT:    andps %xmm5, %xmm8
9135; SSE-NEXT:    movdqa %xmm5, %xmm4
9136; SSE-NEXT:    por %xmm8, %xmm12
9137; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9138; SSE-NEXT:    por %xmm13, %xmm0
9139; SSE-NEXT:    movdqa %xmm0, %xmm1
9140; SSE-NEXT:    pxor %xmm13, %xmm13
9141; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3],xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7]
9142; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7]
9143; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15]
9144; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm0[0,1,2,1]
9145; SSE-NEXT:    pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7]
9146; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
9147; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
9148; SSE-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm13[8],xmm5[9],xmm13[9],xmm5[10],xmm13[10],xmm5[11],xmm13[11],xmm5[12],xmm13[12],xmm5[13],xmm13[13],xmm5[14],xmm13[14],xmm5[15],xmm13[15]
9149; SSE-NEXT:    pxor %xmm0, %xmm0
9150; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
9151; SSE-NEXT:    pandn %xmm5, %xmm13
9152; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
9153; SSE-NEXT:    por %xmm13, %xmm8
9154; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,2,1,3]
9155; SSE-NEXT:    pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,7]
9156; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0]
9157; SSE-NEXT:    pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,5]
9158; SSE-NEXT:    pshufd {{.*#+}} xmm13 = xmm5[3,3,3,3]
9159; SSE-NEXT:    packuswb %xmm13, %xmm8
9160; SSE-NEXT:    packuswb %xmm1, %xmm1
9161; SSE-NEXT:    movss {{.*#+}} xmm8 = xmm1[0],xmm8[1,2,3]
9162; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9163; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9164; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
9165; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
9166; SSE-NEXT:    packuswb %xmm1, %xmm1
9167; SSE-NEXT:    movdqa %xmm6, %xmm13
9168; SSE-NEXT:    pandn %xmm1, %xmm13
9169; SSE-NEXT:    pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9170; SSE-NEXT:    # xmm1 = mem[1,3,2,3]
9171; SSE-NEXT:    pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
9172; SSE-NEXT:    # xmm14 = mem[0,2,2,3]
9173; SSE-NEXT:    punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1]
9174; SSE-NEXT:    movdqa %xmm14, %xmm1
9175; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
9176; SSE-NEXT:    movdqa %xmm10, %xmm15
9177; SSE-NEXT:    pandn %xmm1, %xmm15
9178; SSE-NEXT:    punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm0[8],xmm14[9],xmm0[9],xmm14[10],xmm0[10],xmm14[11],xmm0[11],xmm14[12],xmm0[12],xmm14[13],xmm0[13],xmm14[14],xmm0[14],xmm14[15],xmm0[15]
9179; SSE-NEXT:    pand %xmm10, %xmm14
9180; SSE-NEXT:    por %xmm15, %xmm14
9181; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm14[2,1,1,1]
9182; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
9183; SSE-NEXT:    packuswb %xmm1, %xmm1
9184; SSE-NEXT:    pand %xmm6, %xmm1
9185; SSE-NEXT:    por %xmm13, %xmm1
9186; SSE-NEXT:    movdqa %xmm4, %xmm0
9187; SSE-NEXT:    movdqa %xmm4, %xmm13
9188; SSE-NEXT:    pandn %xmm1, %xmm13
9189; SSE-NEXT:    andps %xmm4, %xmm8
9190; SSE-NEXT:    por %xmm8, %xmm13
9191; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9192; SSE-NEXT:    movdqa %xmm2, %xmm1
9193; SSE-NEXT:    pxor %xmm14, %xmm14
9194; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
9195; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7]
9196; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm14[8],xmm2[9],xmm14[9],xmm2[10],xmm14[10],xmm2[11],xmm14[11],xmm2[12],xmm14[12],xmm2[13],xmm14[13],xmm2[14],xmm14[14],xmm2[15],xmm14[15]
9197; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm2[0,1,2,1]
9198; SSE-NEXT:    pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7]
9199; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
9200; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9201; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm14[8],xmm2[9],xmm14[9],xmm2[10],xmm14[10],xmm2[11],xmm14[11],xmm2[12],xmm14[12],xmm2[13],xmm14[13],xmm2[14],xmm14[14],xmm2[15],xmm14[15]
9202; SSE-NEXT:    pxor %xmm15, %xmm15
9203; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
9204; SSE-NEXT:    pandn %xmm2, %xmm5
9205; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9206; SSE-NEXT:    por %xmm5, %xmm4
9207; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm4[0,2,1,3]
9208; SSE-NEXT:    pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,7]
9209; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0]
9210; SSE-NEXT:    pshufhw {{.*#+}} xmm14 = xmm8[0,1,2,3,4,7,6,5]
9211; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm2[3,3,3,3]
9212; SSE-NEXT:    packuswb %xmm8, %xmm14
9213; SSE-NEXT:    packuswb %xmm1, %xmm1
9214; SSE-NEXT:    movss {{.*#+}} xmm14 = xmm1[0],xmm14[1,2,3]
9215; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9216; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9217; SSE-NEXT:    pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9218; SSE-NEXT:    # xmm1 = mem[1,3,2,3]
9219; SSE-NEXT:    pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
9220; SSE-NEXT:    # xmm8 = mem[0,2,2,3]
9221; SSE-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1]
9222; SSE-NEXT:    movdqa %xmm8, %xmm1
9223; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7]
9224; SSE-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm15[8],xmm8[9],xmm15[9],xmm8[10],xmm15[10],xmm8[11],xmm15[11],xmm8[12],xmm15[12],xmm8[13],xmm15[13],xmm8[14],xmm15[14],xmm8[15],xmm15[15]
9225; SSE-NEXT:    pand %xmm10, %xmm8
9226; SSE-NEXT:    pandn %xmm1, %xmm10
9227; SSE-NEXT:    por %xmm8, %xmm10
9228; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm10[2,1,1,1]
9229; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
9230; SSE-NEXT:    packuswb %xmm1, %xmm1
9231; SSE-NEXT:    pand %xmm6, %xmm1
9232; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm2[0,1,0,3]
9233; SSE-NEXT:    pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,7]
9234; SSE-NEXT:    packuswb %xmm8, %xmm8
9235; SSE-NEXT:    pandn %xmm8, %xmm6
9236; SSE-NEXT:    por %xmm6, %xmm1
9237; SSE-NEXT:    andps %xmm0, %xmm14
9238; SSE-NEXT:    pandn %xmm1, %xmm0
9239; SSE-NEXT:    por %xmm14, %xmm0
9240; SSE-NEXT:    movdqa %xmm0, %xmm1
9241; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9242; SSE-NEXT:    movaps %xmm0, (%rsi)
9243; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9244; SSE-NEXT:    movaps %xmm0, 48(%rsi)
9245; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9246; SSE-NEXT:    movaps %xmm0, 32(%rsi)
9247; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9248; SSE-NEXT:    movaps %xmm0, 16(%rsi)
9249; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9250; SSE-NEXT:    movaps %xmm0, (%rdx)
9251; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9252; SSE-NEXT:    movaps %xmm0, 48(%rdx)
9253; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9254; SSE-NEXT:    movaps %xmm0, 32(%rdx)
9255; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9256; SSE-NEXT:    movaps %xmm0, 16(%rdx)
9257; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9258; SSE-NEXT:    movaps %xmm0, (%rcx)
9259; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9260; SSE-NEXT:    movaps %xmm0, 48(%rcx)
9261; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9262; SSE-NEXT:    movaps %xmm0, 32(%rcx)
9263; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9264; SSE-NEXT:    movaps %xmm0, 16(%rcx)
9265; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9266; SSE-NEXT:    movaps %xmm0, (%r8)
9267; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9268; SSE-NEXT:    movaps %xmm0, 48(%r8)
9269; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9270; SSE-NEXT:    movaps %xmm0, 32(%r8)
9271; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9272; SSE-NEXT:    movaps %xmm0, 16(%r8)
9273; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9274; SSE-NEXT:    movaps %xmm0, (%r9)
9275; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9276; SSE-NEXT:    movaps %xmm0, 48(%r9)
9277; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9278; SSE-NEXT:    movaps %xmm0, 32(%r9)
9279; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9280; SSE-NEXT:    movaps %xmm0, 16(%r9)
9281; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
9282; SSE-NEXT:    movdqa %xmm11, (%rax)
9283; SSE-NEXT:    movdqa %xmm9, 48(%rax)
9284; SSE-NEXT:    movdqa %xmm7, 32(%rax)
9285; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9286; SSE-NEXT:    movaps %xmm0, 16(%rax)
9287; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
9288; SSE-NEXT:    movdqa %xmm1, (%rax)
9289; SSE-NEXT:    movdqa %xmm13, 48(%rax)
9290; SSE-NEXT:    movdqa %xmm12, 32(%rax)
9291; SSE-NEXT:    movdqa %xmm3, 16(%rax)
9292; SSE-NEXT:    addq $1528, %rsp # imm = 0x5F8
9293; SSE-NEXT:    retq
9294;
9295; AVX-LABEL: load_i8_stride7_vf64:
9296; AVX:       # %bb.0:
9297; AVX-NEXT:    subq $744, %rsp # imm = 0x2E8
9298; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = [128,128,6,13,0,0,0,128,128,128,6,13,0,0,0,128]
9299; AVX-NEXT:    # xmm0 = mem[0,0]
9300; AVX-NEXT:    vmovdqa 16(%rdi), %xmm3
9301; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9302; AVX-NEXT:    vmovdqa 176(%rdi), %xmm10
9303; AVX-NEXT:    vpshufb %xmm0, %xmm10, %xmm1
9304; AVX-NEXT:    vmovdqa 48(%rdi), %xmm8
9305; AVX-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9306; AVX-NEXT:    vmovq {{.*#+}} xmm2 = [128,128,128,5,12,0,0,0,0,0,0,0,0,0,0,0]
9307; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm4
9308; AVX-NEXT:    vmovq {{.*#+}} xmm3 = [0,7,14,128,128,0,0,0,0,0,0,0,0,0,0,0]
9309; AVX-NEXT:    vmovdqa (%rdi), %xmm5
9310; AVX-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9311; AVX-NEXT:    vpshufb %xmm3, %xmm5, %xmm5
9312; AVX-NEXT:    vpor %xmm4, %xmm5, %xmm7
9313; AVX-NEXT:    vmovddup {{.*#+}} xmm4 = [128,128,0,0,0,3,10,128,128,128,0,0,0,3,10,128]
9314; AVX-NEXT:    # xmm4 = mem[0,0]
9315; AVX-NEXT:    vmovdqa 32(%rdi), %xmm5
9316; AVX-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9317; AVX-NEXT:    vpshufb %xmm4, %xmm5, %xmm6
9318; AVX-NEXT:    vmovddup {{.*#+}} xmm5 = [8,15,0,0,0,128,128,1,8,15,0,0,0,128,128,1]
9319; AVX-NEXT:    # xmm5 = mem[0,0]
9320; AVX-NEXT:    vpshufb %xmm5, %xmm8, %xmm8
9321; AVX-NEXT:    vpor %xmm6, %xmm8, %xmm8
9322; AVX-NEXT:    vmovq {{.*#+}} xmm6 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
9323; AVX-NEXT:    vpblendvb %xmm6, %xmm7, %xmm8, %xmm7
9324; AVX-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9325; AVX-NEXT:    vmovdqa 240(%rdi), %xmm7
9326; AVX-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9327; AVX-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
9328; AVX-NEXT:    vmovdqa 224(%rdi), %xmm7
9329; AVX-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9330; AVX-NEXT:    vpshufb %xmm3, %xmm7, %xmm3
9331; AVX-NEXT:    vpor %xmm2, %xmm3, %xmm2
9332; AVX-NEXT:    vmovdqa 256(%rdi), %xmm3
9333; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9334; AVX-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
9335; AVX-NEXT:    vmovdqa 272(%rdi), %xmm4
9336; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9337; AVX-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
9338; AVX-NEXT:    vpor %xmm3, %xmm4, %xmm3
9339; AVX-NEXT:    vmovddup {{.*#+}} xmm4 = [8,15,128,128,0,0,0,1,8,15,128,128,0,0,0,1]
9340; AVX-NEXT:    # xmm4 = mem[0,0]
9341; AVX-NEXT:    vpblendvb %xmm6, %xmm2, %xmm3, %xmm2
9342; AVX-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9343; AVX-NEXT:    vmovdqa 160(%rdi), %xmm7
9344; AVX-NEXT:    vpshufb %xmm4, %xmm7, %xmm2
9345; AVX-NEXT:    vpor %xmm1, %xmm2, %xmm2
9346; AVX-NEXT:    vbroadcastss {{.*#+}} xmm3 = [128,3,10,128,128,3,10,128,128,3,10,128,128,3,10,128]
9347; AVX-NEXT:    vmovdqa 144(%rdi), %xmm12
9348; AVX-NEXT:    vpshufb %xmm3, %xmm12, %xmm1
9349; AVX-NEXT:    vbroadcastss {{.*#+}} xmm5 = [12,128,128,5,12,128,128,5,12,128,128,5,12,128,128,5]
9350; AVX-NEXT:    vmovdqa 128(%rdi), %xmm14
9351; AVX-NEXT:    vpshufb %xmm5, %xmm14, %xmm9
9352; AVX-NEXT:    vpor %xmm1, %xmm9, %xmm9
9353; AVX-NEXT:    vmovdqa {{.*#+}} xmm15 = [u,u,u,0,0,0,0,255,255,255,255,255,u,u,u,u]
9354; AVX-NEXT:    vpblendvb %xmm15, %xmm2, %xmm9, %xmm2
9355; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9356; AVX-NEXT:    vmovdqa 400(%rdi), %xmm9
9357; AVX-NEXT:    vpshufb %xmm0, %xmm9, %xmm0
9358; AVX-NEXT:    vmovdqa 384(%rdi), %xmm6
9359; AVX-NEXT:    vpshufb %xmm4, %xmm6, %xmm2
9360; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
9361; AVX-NEXT:    vmovdqa 368(%rdi), %xmm8
9362; AVX-NEXT:    vpshufb %xmm3, %xmm8, %xmm2
9363; AVX-NEXT:    vmovdqa 352(%rdi), %xmm11
9364; AVX-NEXT:    vpshufb %xmm5, %xmm11, %xmm3
9365; AVX-NEXT:    vpor %xmm2, %xmm3, %xmm2
9366; AVX-NEXT:    vpblendvb %xmm15, %xmm0, %xmm2, %xmm0
9367; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9368; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = [9,128,128,128,0,0,0,2,9,128,128,128,0,0,0,2]
9369; AVX-NEXT:    # xmm0 = mem[0,0]
9370; AVX-NEXT:    vpshufb %xmm0, %xmm7, %xmm3
9371; AVX-NEXT:    vmovdqa %xmm7, %xmm1
9372; AVX-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9373; AVX-NEXT:    vmovddup {{.*#+}} xmm2 = [128,0,7,14,0,0,0,128,128,0,7,14,0,0,0,128]
9374; AVX-NEXT:    # xmm2 = mem[0,0]
9375; AVX-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9376; AVX-NEXT:    vpshufb %xmm2, %xmm10, %xmm4
9377; AVX-NEXT:    vpor %xmm3, %xmm4, %xmm5
9378; AVX-NEXT:    vbroadcastss {{.*#+}} xmm3 = [128,4,11,128,128,4,11,128,128,4,11,128,128,4,11,128]
9379; AVX-NEXT:    vpshufb %xmm3, %xmm12, %xmm13
9380; AVX-NEXT:    vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9381; AVX-NEXT:    vbroadcastss {{.*#+}} xmm4 = [13,128,128,6,13,128,128,6,13,128,128,6,13,128,128,6]
9382; AVX-NEXT:    vmovdqa %xmm14, %xmm7
9383; AVX-NEXT:    vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9384; AVX-NEXT:    vpshufb %xmm4, %xmm14, %xmm14
9385; AVX-NEXT:    vpor %xmm13, %xmm14, %xmm13
9386; AVX-NEXT:    vpblendvb %xmm15, %xmm5, %xmm13, %xmm5
9387; AVX-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9388; AVX-NEXT:    vmovdqa %xmm6, (%rsp) # 16-byte Spill
9389; AVX-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
9390; AVX-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9391; AVX-NEXT:    vpshufb %xmm2, %xmm9, %xmm2
9392; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
9393; AVX-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9394; AVX-NEXT:    vpshufb %xmm3, %xmm8, %xmm2
9395; AVX-NEXT:    vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9396; AVX-NEXT:    vpshufb %xmm4, %xmm11, %xmm3
9397; AVX-NEXT:    vpor %xmm2, %xmm3, %xmm2
9398; AVX-NEXT:    vpblendvb %xmm15, %xmm0, %xmm2, %xmm0
9399; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9400; AVX-NEXT:    vmovq {{.*#+}} xmm0 = [0,0,128,128,128,5,12,0,0,0,0,0,0,0,0,0]
9401; AVX-NEXT:    vpshufb %xmm0, %xmm12, %xmm2
9402; AVX-NEXT:    vmovq {{.*#+}} xmm12 = [0,0,0,7,14,128,128,0,0,0,0,0,0,0,0,0]
9403; AVX-NEXT:    vpshufb %xmm12, %xmm7, %xmm3
9404; AVX-NEXT:    vpor %xmm2, %xmm3, %xmm5
9405; AVX-NEXT:    vmovddup {{.*#+}} xmm2 = [10,128,128,128,0,0,0,3,10,128,128,128,0,0,0,3]
9406; AVX-NEXT:    # xmm2 = mem[0,0]
9407; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm4
9408; AVX-NEXT:    vmovddup {{.*#+}} xmm3 = [128,1,8,15,0,0,0,128,128,1,8,15,0,0,0,128]
9409; AVX-NEXT:    # xmm3 = mem[0,0]
9410; AVX-NEXT:    vpshufb %xmm3, %xmm10, %xmm13
9411; AVX-NEXT:    vpor %xmm4, %xmm13, %xmm13
9412; AVX-NEXT:    vmovq {{.*#+}} xmm4 = [0,0,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
9413; AVX-NEXT:    vpblendvb %xmm4, %xmm5, %xmm13, %xmm5
9414; AVX-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9415; AVX-NEXT:    vpshufb %xmm0, %xmm8, %xmm0
9416; AVX-NEXT:    vpshufb %xmm12, %xmm11, %xmm1
9417; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
9418; AVX-NEXT:    vpshufb %xmm2, %xmm6, %xmm1
9419; AVX-NEXT:    vpshufb %xmm3, %xmm9, %xmm2
9420; AVX-NEXT:    vpor %xmm1, %xmm2, %xmm1
9421; AVX-NEXT:    vpblendvb %xmm4, %xmm0, %xmm1, %xmm0
9422; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9423; AVX-NEXT:    vmovq {{.*#+}} xmm14 = [128,128,128,6,13,0,0,0,0,0,0,0,0,0,0,0]
9424; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
9425; AVX-NEXT:    vpshufb %xmm14, %xmm11, %xmm0
9426; AVX-NEXT:    vmovq {{.*#+}} xmm2 = [1,8,15,128,128,0,0,0,0,0,0,0,0,0,0,0]
9427; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
9428; AVX-NEXT:    vpshufb %xmm2, %xmm10, %xmm3
9429; AVX-NEXT:    vpor %xmm0, %xmm3, %xmm3
9430; AVX-NEXT:    vbroadcastss {{.*#+}} xmm4 = [9,128,128,2,9,128,128,2,9,128,128,2,9,128,128,2]
9431; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
9432; AVX-NEXT:    vpshufb %xmm4, %xmm15, %xmm0
9433; AVX-NEXT:    vbroadcastss {{.*#+}} xmm5 = [128,4,11,128,128,4,11,128,128,4,11,128,128,4,11,128]
9434; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
9435; AVX-NEXT:    vpshufb %xmm5, %xmm7, %xmm13
9436; AVX-NEXT:    vpor %xmm0, %xmm13, %xmm13
9437; AVX-NEXT:    vmovq {{.*#+}} xmm12 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
9438; AVX-NEXT:    vpblendvb %xmm12, %xmm3, %xmm13, %xmm0
9439; AVX-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9440; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9441; AVX-NEXT:    vpshufb %xmm14, %xmm1, %xmm0
9442; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
9443; AVX-NEXT:    vpshufb %xmm2, %xmm13, %xmm2
9444; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm14
9445; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
9446; AVX-NEXT:    vpshufb %xmm4, %xmm8, %xmm2
9447; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9448; AVX-NEXT:    vpshufb %xmm5, %xmm0, %xmm3
9449; AVX-NEXT:    vpor %xmm2, %xmm3, %xmm2
9450; AVX-NEXT:    vpblendvb %xmm12, %xmm14, %xmm2, %xmm2
9451; AVX-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9452; AVX-NEXT:    vmovq {{.*#+}} xmm9 = [2,9,128,128,128,0,0,0,0,0,0,0,0,0,0,0]
9453; AVX-NEXT:    vpshufb %xmm9, %xmm10, %xmm2
9454; AVX-NEXT:    vmovq {{.*#+}} xmm3 = [128,128,0,7,14,0,0,0,0,0,0,0,0,0,0,0]
9455; AVX-NEXT:    vpshufb %xmm3, %xmm11, %xmm4
9456; AVX-NEXT:    vpor %xmm2, %xmm4, %xmm2
9457; AVX-NEXT:    vbroadcastss {{.*#+}} xmm4 = [10,128,128,3,10,128,128,3,10,128,128,3,10,128,128,3]
9458; AVX-NEXT:    vpshufb %xmm4, %xmm15, %xmm5
9459; AVX-NEXT:    vbroadcastss {{.*#+}} xmm6 = [128,5,12,128,128,5,12,128,128,5,12,128,128,5,12,128]
9460; AVX-NEXT:    vpshufb %xmm6, %xmm7, %xmm14
9461; AVX-NEXT:    vpor %xmm5, %xmm14, %xmm5
9462; AVX-NEXT:    vmovdqa %xmm12, %xmm14
9463; AVX-NEXT:    vpblendvb %xmm12, %xmm2, %xmm5, %xmm2
9464; AVX-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9465; AVX-NEXT:    vpshufb %xmm9, %xmm13, %xmm5
9466; AVX-NEXT:    vmovdqa %xmm13, %xmm9
9467; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm2
9468; AVX-NEXT:    vmovdqa %xmm1, %xmm12
9469; AVX-NEXT:    vpor %xmm5, %xmm2, %xmm1
9470; AVX-NEXT:    vpshufb %xmm4, %xmm8, %xmm2
9471; AVX-NEXT:    vpshufb %xmm6, %xmm0, %xmm3
9472; AVX-NEXT:    vpor %xmm2, %xmm3, %xmm2
9473; AVX-NEXT:    vpblendvb %xmm14, %xmm1, %xmm2, %xmm1
9474; AVX-NEXT:    vmovdqa %xmm14, %xmm6
9475; AVX-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9476; AVX-NEXT:    vmovq {{.*#+}} xmm1 = [3,10,128,128,128,0,0,0,0,0,0,0,0,0,0,0]
9477; AVX-NEXT:    vpshufb %xmm1, %xmm10, %xmm3
9478; AVX-NEXT:    vmovq {{.*#+}} xmm2 = [128,128,1,8,15,0,0,0,0,0,0,0,0,0,0,0]
9479; AVX-NEXT:    vpshufb %xmm2, %xmm11, %xmm4
9480; AVX-NEXT:    vpor %xmm3, %xmm4, %xmm5
9481; AVX-NEXT:    vbroadcastss {{.*#+}} xmm3 = [11,128,128,4,11,128,128,4,11,128,128,4,11,128,128,4]
9482; AVX-NEXT:    vpshufb %xmm3, %xmm15, %xmm13
9483; AVX-NEXT:    vbroadcastss {{.*#+}} xmm4 = [128,6,13,128,128,6,13,128,128,6,13,128,128,6,13,128]
9484; AVX-NEXT:    vpshufb %xmm4, %xmm7, %xmm14
9485; AVX-NEXT:    vpor %xmm13, %xmm14, %xmm13
9486; AVX-NEXT:    vpblendvb %xmm6, %xmm5, %xmm13, %xmm5
9487; AVX-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9488; AVX-NEXT:    vpshufb %xmm1, %xmm9, %xmm1
9489; AVX-NEXT:    vpshufb %xmm2, %xmm12, %xmm2
9490; AVX-NEXT:    vpor %xmm1, %xmm2, %xmm1
9491; AVX-NEXT:    vpshufb %xmm3, %xmm8, %xmm2
9492; AVX-NEXT:    vpshufb %xmm4, %xmm0, %xmm3
9493; AVX-NEXT:    vpor %xmm2, %xmm3, %xmm2
9494; AVX-NEXT:    vpblendvb %xmm6, %xmm1, %xmm2, %xmm0
9495; AVX-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9496; AVX-NEXT:    vmovq {{.*#+}} xmm0 = [0,0,128,128,128,6,13,0,0,0,0,0,0,0,0,0]
9497; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
9498; AVX-NEXT:    vpshufb %xmm0, %xmm11, %xmm1
9499; AVX-NEXT:    vmovq {{.*#+}} xmm3 = [0,0,1,8,15,128,128,0,0,0,0,0,0,0,0,0]
9500; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
9501; AVX-NEXT:    vpshufb %xmm3, %xmm15, %xmm2
9502; AVX-NEXT:    vpor %xmm1, %xmm2, %xmm1
9503; AVX-NEXT:    vbroadcastss {{.*#+}} xmm4 = [128,2,9,128,128,2,9,128,128,2,9,128,128,2,9,128]
9504; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
9505; AVX-NEXT:    vpshufb %xmm4, %xmm8, %xmm2
9506; AVX-NEXT:    vbroadcastss {{.*#+}} xmm5 = [11,128,128,4,11,128,128,4,11,128,128,4,11,128,128,4]
9507; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
9508; AVX-NEXT:    vpshufb %xmm5, %xmm12, %xmm13
9509; AVX-NEXT:    vpor %xmm2, %xmm13, %xmm13
9510; AVX-NEXT:    vmovq {{.*#+}} xmm2 = [0,0,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
9511; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm13, %xmm1
9512; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9513; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
9514; AVX-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
9515; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
9516; AVX-NEXT:    vpshufb %xmm3, %xmm10, %xmm1
9517; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
9518; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
9519; AVX-NEXT:    vpshufb %xmm4, %xmm9, %xmm1
9520; AVX-NEXT:    vmovdqa (%rsp), %xmm6 # 16-byte Reload
9521; AVX-NEXT:    vpshufb %xmm5, %xmm6, %xmm3
9522; AVX-NEXT:    vpor %xmm1, %xmm3, %xmm1
9523; AVX-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
9524; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9525; AVX-NEXT:    vmovq {{.*#+}} xmm1 = [0,0,2,9,128,128,128,0,0,0,0,0,0,0,0,0]
9526; AVX-NEXT:    vpshufb %xmm1, %xmm15, %xmm0
9527; AVX-NEXT:    vmovq {{.*#+}} xmm3 = [0,0,128,128,0,7,14,0,0,0,0,0,0,0,0,0]
9528; AVX-NEXT:    vpshufb %xmm3, %xmm11, %xmm4
9529; AVX-NEXT:    vpor %xmm0, %xmm4, %xmm5
9530; AVX-NEXT:    vbroadcastss {{.*#+}} xmm4 = [128,3,10,128,128,3,10,128,128,3,10,128,128,3,10,128]
9531; AVX-NEXT:    vpshufb %xmm4, %xmm8, %xmm13
9532; AVX-NEXT:    vmovdqa %xmm8, %xmm11
9533; AVX-NEXT:    vbroadcastss {{.*#+}} xmm0 = [12,128,128,5,12,128,128,5,12,128,128,5,12,128,128,5]
9534; AVX-NEXT:    vpshufb %xmm0, %xmm12, %xmm14
9535; AVX-NEXT:    vpor %xmm13, %xmm14, %xmm13
9536; AVX-NEXT:    vmovdqa %xmm2, %xmm8
9537; AVX-NEXT:    vpblendvb %xmm2, %xmm5, %xmm13, %xmm2
9538; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9539; AVX-NEXT:    vpshufb %xmm1, %xmm10, %xmm1
9540; AVX-NEXT:    vmovdqa %xmm10, %xmm2
9541; AVX-NEXT:    vpshufb %xmm3, %xmm7, %xmm3
9542; AVX-NEXT:    vmovdqa %xmm7, %xmm10
9543; AVX-NEXT:    vpor %xmm1, %xmm3, %xmm1
9544; AVX-NEXT:    vpshufb %xmm4, %xmm9, %xmm3
9545; AVX-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
9546; AVX-NEXT:    vpor %xmm3, %xmm0, %xmm0
9547; AVX-NEXT:    vpblendvb %xmm8, %xmm1, %xmm0, %xmm14
9548; AVX-NEXT:    vmovq {{.*#+}} xmm1 = [0,0,3,10,128,128,128,0,0,0,0,0,0,0,0,0]
9549; AVX-NEXT:    vpshufb %xmm1, %xmm15, %xmm0
9550; AVX-NEXT:    vmovq {{.*#+}} xmm3 = [0,0,128,128,1,8,15,0,0,0,0,0,0,0,0,0]
9551; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
9552; AVX-NEXT:    vpshufb %xmm3, %xmm7, %xmm4
9553; AVX-NEXT:    vpor %xmm0, %xmm4, %xmm4
9554; AVX-NEXT:    vbroadcastss {{.*#+}} xmm13 = [128,4,11,128,128,4,11,128,128,4,11,128,128,4,11,128]
9555; AVX-NEXT:    vpshufb %xmm13, %xmm11, %xmm5
9556; AVX-NEXT:    vbroadcastss {{.*#+}} xmm0 = [13,128,128,6,13,128,128,6,13,128,128,6,13,128,128,6]
9557; AVX-NEXT:    vpshufb %xmm0, %xmm12, %xmm11
9558; AVX-NEXT:    vpor %xmm5, %xmm11, %xmm5
9559; AVX-NEXT:    vmovdqa %xmm8, %xmm11
9560; AVX-NEXT:    vpblendvb %xmm8, %xmm4, %xmm5, %xmm8
9561; AVX-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
9562; AVX-NEXT:    vpshufb %xmm3, %xmm10, %xmm3
9563; AVX-NEXT:    vpor %xmm1, %xmm3, %xmm1
9564; AVX-NEXT:    vpshufb %xmm13, %xmm9, %xmm3
9565; AVX-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
9566; AVX-NEXT:    vmovdqa %xmm6, %xmm9
9567; AVX-NEXT:    vpor %xmm3, %xmm0, %xmm0
9568; AVX-NEXT:    vpblendvb %xmm11, %xmm1, %xmm0, %xmm2
9569; AVX-NEXT:    vmovdqa 208(%rdi), %xmm6
9570; AVX-NEXT:    vmovdqa 192(%rdi), %xmm5
9571; AVX-NEXT:    vmovddup {{.*#+}} xmm3 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12]
9572; AVX-NEXT:    # xmm3 = mem[0,0]
9573; AVX-NEXT:    vpshufb %xmm3, %xmm6, %xmm4
9574; AVX-NEXT:    vmovddup {{.*#+}} xmm11 = [0,0,0,0,7,14,128,128,0,0,0,0,7,14,128,128]
9575; AVX-NEXT:    # xmm11 = mem[0,0]
9576; AVX-NEXT:    vpshufb %xmm11, %xmm5, %xmm13
9577; AVX-NEXT:    vpor %xmm4, %xmm13, %xmm4
9578; AVX-NEXT:    vpmovsxdq {{.*#+}} xmm13 = [18446744073709486080,16777215]
9579; AVX-NEXT:    vpblendvb %xmm13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm0 # 16-byte Folded Reload
9580; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9581; AVX-NEXT:    vmovdqa 432(%rdi), %xmm4
9582; AVX-NEXT:    vpshufb %xmm3, %xmm4, %xmm0
9583; AVX-NEXT:    vmovdqa 416(%rdi), %xmm3
9584; AVX-NEXT:    vpshufb %xmm11, %xmm3, %xmm11
9585; AVX-NEXT:    vpor %xmm0, %xmm11, %xmm0
9586; AVX-NEXT:    vpblendvb %xmm13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
9587; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9588; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13]
9589; AVX-NEXT:    # xmm1 = mem[0,0]
9590; AVX-NEXT:    vpshufb %xmm1, %xmm6, %xmm11
9591; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128]
9592; AVX-NEXT:    # xmm0 = mem[0,0]
9593; AVX-NEXT:    vpshufb %xmm0, %xmm5, %xmm15
9594; AVX-NEXT:    vpor %xmm11, %xmm15, %xmm11
9595; AVX-NEXT:    vpblendvb %xmm13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm10 # 16-byte Folded Reload
9596; AVX-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9597; AVX-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
9598; AVX-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
9599; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
9600; AVX-NEXT:    vpblendvb %xmm13, %xmm14, %xmm0, %xmm0
9601; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9602; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128]
9603; AVX-NEXT:    # xmm0 = mem[0,0]
9604; AVX-NEXT:    vpshufb %xmm0, %xmm5, %xmm1
9605; AVX-NEXT:    vmovddup {{.*#+}} xmm11 = [0,0,0,128,128,0,7,14,0,0,0,128,128,0,7,14]
9606; AVX-NEXT:    # xmm11 = mem[0,0]
9607; AVX-NEXT:    vpshufb %xmm11, %xmm6, %xmm14
9608; AVX-NEXT:    vpor %xmm1, %xmm14, %xmm1
9609; AVX-NEXT:    vpblendvb %xmm13, %xmm8, %xmm1, %xmm1
9610; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9611; AVX-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
9612; AVX-NEXT:    vpshufb %xmm11, %xmm4, %xmm1
9613; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
9614; AVX-NEXT:    vpblendvb %xmm13, %xmm2, %xmm0, %xmm0
9615; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9616; AVX-NEXT:    vmovd {{.*#+}} xmm14 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
9617; AVX-NEXT:    vpshufb %xmm14, %xmm7, %xmm1
9618; AVX-NEXT:    vmovd {{.*#+}} xmm0 = [0,0,4,11,0,0,0,0,0,0,0,0,0,0,0,0]
9619; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9620; AVX-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
9621; AVX-NEXT:    vpunpckldq {{.*#+}} xmm7 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
9622; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = [128,5,12,0,0,0,128,128,128,5,12,0,0,0,128,128]
9623; AVX-NEXT:    # xmm1 = mem[0,0]
9624; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9625; AVX-NEXT:    vpshufb %xmm1, %xmm2, %xmm8
9626; AVX-NEXT:    vmovddup {{.*#+}} xmm2 = [14,128,128,0,0,0,0,7,14,128,128,0,0,0,0,7]
9627; AVX-NEXT:    # xmm2 = mem[0,0]
9628; AVX-NEXT:    vpshufb %xmm2, %xmm12, %xmm10
9629; AVX-NEXT:    vpor %xmm8, %xmm10, %xmm8
9630; AVX-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm7[1,2],xmm8[3,4,5,6,7]
9631; AVX-NEXT:    vmovddup {{.*#+}} xmm7 = [0,0,0,3,10,128,128,128,0,0,0,3,10,128,128,128]
9632; AVX-NEXT:    # xmm7 = mem[0,0]
9633; AVX-NEXT:    vpshufb %xmm7, %xmm5, %xmm10
9634; AVX-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9635; AVX-NEXT:    vmovddup {{.*#+}} xmm11 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15]
9636; AVX-NEXT:    # xmm11 = mem[0,0]
9637; AVX-NEXT:    vpshufb %xmm11, %xmm6, %xmm12
9638; AVX-NEXT:    vpor %xmm10, %xmm12, %xmm10
9639; AVX-NEXT:    vpblendvb %xmm13, %xmm8, %xmm10, %xmm8
9640; AVX-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9641; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
9642; AVX-NEXT:    vpshufb %xmm0, %xmm8, %xmm0
9643; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
9644; AVX-NEXT:    vpshufb %xmm14, %xmm8, %xmm8
9645; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
9646; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
9647; AVX-NEXT:    vpshufb %xmm1, %xmm8, %xmm1
9648; AVX-NEXT:    vpshufb %xmm2, %xmm9, %xmm2
9649; AVX-NEXT:    vpor %xmm1, %xmm2, %xmm1
9650; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3,4,5,6,7]
9651; AVX-NEXT:    vpshufb %xmm7, %xmm3, %xmm1
9652; AVX-NEXT:    vmovdqa %xmm4, %xmm8
9653; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9654; AVX-NEXT:    vpshufb %xmm11, %xmm4, %xmm2
9655; AVX-NEXT:    vpor %xmm1, %xmm2, %xmm1
9656; AVX-NEXT:    vpblendvb %xmm13, %xmm0, %xmm1, %xmm0
9657; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
9658; AVX-NEXT:    vbroadcastss {{.*#+}} xmm14 = [0,0,2,9,0,0,2,9,0,0,2,9,0,0,2,9]
9659; AVX-NEXT:    vmovdqa %xmm6, %xmm13
9660; AVX-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9661; AVX-NEXT:    vpshufb %xmm14, %xmm6, %xmm1
9662; AVX-NEXT:    vbroadcastss {{.*#+}} xmm0 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11]
9663; AVX-NEXT:    vpshufb %xmm0, %xmm5, %xmm2
9664; AVX-NEXT:    vmovdqa %xmm0, %xmm5
9665; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
9666; AVX-NEXT:    vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
9667; AVX-NEXT:    # xmm1 = mem[0,1,2,3,4,5],xmm1[6,7]
9668; AVX-NEXT:    vbroadcastss {{.*#+}} xmm15 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0]
9669; AVX-NEXT:    vmovdqa 80(%rdi), %xmm4
9670; AVX-NEXT:    vpshufb %xmm15, %xmm4, %xmm7
9671; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9672; AVX-NEXT:    vbroadcastss {{.*#+}} xmm6 = [0,0,6,13,0,0,6,13,0,0,6,13,0,0,6,13]
9673; AVX-NEXT:    vmovdqa 64(%rdi), %xmm0
9674; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9675; AVX-NEXT:    vpshufb %xmm6, %xmm0, %xmm10
9676; AVX-NEXT:    vpunpckldq {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1]
9677; AVX-NEXT:    vmovdqa 96(%rdi), %xmm0
9678; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9679; AVX-NEXT:    vpalignr {{.*#+}} xmm7 = xmm7[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9]
9680; AVX-NEXT:    vmovddup {{.*#+}} xmm11 = [0,0,0,1,2,3,8,15,0,0,0,1,2,3,8,15]
9681; AVX-NEXT:    # xmm11 = mem[0,0]
9682; AVX-NEXT:    vpshufb %xmm11, %xmm7, %xmm10
9683; AVX-NEXT:    vmovd {{.*#+}} xmm0 = [0,7,14,0,0,0,0,0,0,0,0,0,0,0,0,0]
9684; AVX-NEXT:    vmovdqa 112(%rdi), %xmm2
9685; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9686; AVX-NEXT:    vpshufb %xmm0, %xmm2, %xmm12
9687; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm10, %ymm10
9688; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
9689; AVX-NEXT:    vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload
9690; AVX-NEXT:    vandnps %ymm10, %ymm2, %ymm10
9691; AVX-NEXT:    vorps %ymm10, %ymm12, %ymm10
9692; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
9693; AVX-NEXT:    vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
9694; AVX-NEXT:    vandnps %ymm1, %ymm12, %ymm1
9695; AVX-NEXT:    vandps %ymm12, %ymm10, %ymm10
9696; AVX-NEXT:    vorps %ymm1, %ymm10, %ymm1
9697; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9698; AVX-NEXT:    vpshufb %xmm14, %xmm8, %xmm1
9699; AVX-NEXT:    vpshufb %xmm5, %xmm3, %xmm10
9700; AVX-NEXT:    vmovdqa %xmm3, %xmm5
9701; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9702; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3]
9703; AVX-NEXT:    vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
9704; AVX-NEXT:    # xmm1 = mem[0,1,2,3,4,5],xmm1[6,7]
9705; AVX-NEXT:    vmovdqa 304(%rdi), %xmm3
9706; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9707; AVX-NEXT:    vpshufb %xmm15, %xmm3, %xmm14
9708; AVX-NEXT:    vmovdqa 288(%rdi), %xmm3
9709; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9710; AVX-NEXT:    vpshufb %xmm6, %xmm3, %xmm15
9711; AVX-NEXT:    vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
9712; AVX-NEXT:    vmovdqa 320(%rdi), %xmm9
9713; AVX-NEXT:    vpalignr {{.*#+}} xmm14 = xmm14[10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7,8,9]
9714; AVX-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9715; AVX-NEXT:    vpshufb %xmm11, %xmm14, %xmm11
9716; AVX-NEXT:    vmovdqa 336(%rdi), %xmm8
9717; AVX-NEXT:    vpshufb %xmm0, %xmm8, %xmm0
9718; AVX-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9719; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm11, %ymm0
9720; AVX-NEXT:    vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm11 # 32-byte Folded Reload
9721; AVX-NEXT:    vandnps %ymm0, %ymm2, %ymm0
9722; AVX-NEXT:    vorps %ymm0, %ymm11, %ymm0
9723; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
9724; AVX-NEXT:    vandnps %ymm1, %ymm12, %ymm1
9725; AVX-NEXT:    vandps %ymm0, %ymm12, %ymm0
9726; AVX-NEXT:    vorps %ymm1, %ymm0, %ymm0
9727; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9728; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm13[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
9729; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
9730; AVX-NEXT:    vpshufb {{.*#+}} xmm11 = xmm6[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
9731; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3]
9732; AVX-NEXT:    vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
9733; AVX-NEXT:    # xmm1 = mem[0,1,2,3,4,5],xmm1[6,7]
9734; AVX-NEXT:    vpshufb {{.*#+}} xmm11 = xmm4[u,u,u,u,5,12,u,u,u,u,u,u,u,u,u,u]
9735; AVX-NEXT:    vbroadcastss {{.*#+}} xmm7 = [0,0,7,14,0,0,7,14,0,0,7,14,0,0,7,14]
9736; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
9737; AVX-NEXT:    vpshufb %xmm7, %xmm15, %xmm10
9738; AVX-NEXT:    vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1]
9739; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
9740; AVX-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,6],xmm0[7]
9741; AVX-NEXT:    vmovddup {{.*#+}} xmm2 = [0,128,128,128,128,128,3,10,0,128,128,128,128,128,3,10]
9742; AVX-NEXT:    # xmm2 = mem[0,0]
9743; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9744; AVX-NEXT:    vpshufb %xmm2, %xmm4, %xmm11
9745; AVX-NEXT:    vpor %xmm11, %xmm10, %xmm10
9746; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9747; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = xmm3[1,8,15,u,u,u,u,u,u,u,u,u,u,u,u,u]
9748; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm10, %ymm10
9749; AVX-NEXT:    vmovaps {{.*#+}} ymm14 = [0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
9750; AVX-NEXT:    vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload
9751; AVX-NEXT:    vandps %ymm14, %ymm10, %ymm10
9752; AVX-NEXT:    vorps %ymm0, %ymm10, %ymm0
9753; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
9754; AVX-NEXT:    vandnps %ymm1, %ymm12, %ymm1
9755; AVX-NEXT:    vandps %ymm0, %ymm12, %ymm0
9756; AVX-NEXT:    vorps %ymm1, %ymm0, %ymm0
9757; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9758; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
9759; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm13[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
9760; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
9761; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
9762; AVX-NEXT:    vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
9763; AVX-NEXT:    # xmm0 = mem[0,1,2,3,4,5],xmm0[6,7]
9764; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
9765; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,5,12,u,u,u,u,u,u,u,u,u,u]
9766; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
9767; AVX-NEXT:    vpshufb %xmm7, %xmm11, %xmm7
9768; AVX-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1]
9769; AVX-NEXT:    vpxor %xmm7, %xmm7, %xmm7
9770; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm7[7]
9771; AVX-NEXT:    vpshufb %xmm2, %xmm9, %xmm2
9772; AVX-NEXT:    vpor %xmm2, %xmm1, %xmm1
9773; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm8[1,8,15,u,u,u,u,u,u,u,u,u,u,u,u,u]
9774; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
9775; AVX-NEXT:    vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload
9776; AVX-NEXT:    vandps %ymm1, %ymm14, %ymm1
9777; AVX-NEXT:    vorps %ymm2, %ymm1, %ymm1
9778; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
9779; AVX-NEXT:    vandnps %ymm0, %ymm12, %ymm0
9780; AVX-NEXT:    vandps %ymm1, %ymm12, %ymm1
9781; AVX-NEXT:    vorps %ymm0, %ymm1, %ymm0
9782; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9783; AVX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11]
9784; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9785; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
9786; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [0,0,6,13,0,0,6,13,0,0,6,13,0,0,6,13]
9787; AVX-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
9788; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
9789; AVX-NEXT:    vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
9790; AVX-NEXT:    # xmm0 = mem[0,1,2,3,4,5],xmm0[6,7]
9791; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
9792; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm10[u,u,u,u,u,u,u,u,u,u,u,u,6,13,u,u]
9793; AVX-NEXT:    vbroadcastss {{.*#+}} xmm14 = [0,1,8,15,0,1,8,15,0,1,8,15,0,1,8,15]
9794; AVX-NEXT:    vpshufb %xmm14, %xmm15, %xmm6
9795; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm6[2],xmm1[2],xmm6[3],xmm1[3]
9796; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm7[7]
9797; AVX-NEXT:    vmovddup {{.*#+}} xmm6 = [0,128,128,128,128,128,4,11,0,128,128,128,128,128,4,11]
9798; AVX-NEXT:    # xmm6 = mem[0,0]
9799; AVX-NEXT:    vmovdqa %xmm4, %xmm12
9800; AVX-NEXT:    vpshufb %xmm6, %xmm4, %xmm7
9801; AVX-NEXT:    vpor %xmm7, %xmm1, %xmm1
9802; AVX-NEXT:    vmovd {{.*#+}} xmm9 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
9803; AVX-NEXT:    vpshufb %xmm9, %xmm3, %xmm7
9804; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm1, %ymm7
9805; AVX-NEXT:    vmovaps {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
9806; AVX-NEXT:    vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload
9807; AVX-NEXT:    vandnps %ymm7, %ymm1, %ymm7
9808; AVX-NEXT:    vorps %ymm7, %ymm8, %ymm7
9809; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm8
9810; AVX-NEXT:    vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
9811; AVX-NEXT:    vandnps %ymm8, %ymm15, %ymm8
9812; AVX-NEXT:    vandps %ymm7, %ymm15, %ymm7
9813; AVX-NEXT:    vorps %ymm7, %ymm8, %ymm0
9814; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9815; AVX-NEXT:    vpshufb %xmm2, %xmm13, %xmm4
9816; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9817; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
9818; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
9819; AVX-NEXT:    vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
9820; AVX-NEXT:    # xmm3 = mem[0,1,2,3,4,5],xmm3[6,7]
9821; AVX-NEXT:    vmovdqa %xmm5, %xmm0
9822; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,6,13,u,u]
9823; AVX-NEXT:    vmovdqa %xmm11, %xmm1
9824; AVX-NEXT:    vpshufb %xmm14, %xmm11, %xmm4
9825; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm4[2],xmm2[2],xmm4[3],xmm2[3]
9826; AVX-NEXT:    vxorps %xmm7, %xmm7, %xmm7
9827; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm7[7]
9828; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
9829; AVX-NEXT:    vpshufb %xmm6, %xmm13, %xmm4
9830; AVX-NEXT:    vpor %xmm4, %xmm2, %xmm2
9831; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
9832; AVX-NEXT:    vpshufb %xmm9, %xmm14, %xmm4
9833; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
9834; AVX-NEXT:    vmovaps {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
9835; AVX-NEXT:    vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm4 # 32-byte Folded Reload
9836; AVX-NEXT:    vandnps %ymm2, %ymm5, %ymm2
9837; AVX-NEXT:    vorps %ymm2, %ymm4, %ymm2
9838; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
9839; AVX-NEXT:    vandnps %ymm3, %ymm15, %ymm3
9840; AVX-NEXT:    vandps %ymm2, %ymm15, %ymm2
9841; AVX-NEXT:    vorps %ymm3, %ymm2, %ymm2
9842; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9843; AVX-NEXT:    vmovddup {{.*#+}} xmm3 = [0,2,9,128,128,128,0,0,0,2,9,128,128,128,0,0]
9844; AVX-NEXT:    # xmm3 = mem[0,0]
9845; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
9846; AVX-NEXT:    vpshufb %xmm3, %xmm9, %xmm2
9847; AVX-NEXT:    vmovddup {{.*#+}} xmm4 = [0,128,128,0,7,14,0,0,0,128,128,0,7,14,0,0]
9848; AVX-NEXT:    # xmm4 = mem[0,0]
9849; AVX-NEXT:    vpshufb %xmm4, %xmm10, %xmm5
9850; AVX-NEXT:    vpor %xmm2, %xmm5, %xmm2
9851; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm7[7]
9852; AVX-NEXT:    vmovddup {{.*#+}} xmm5 = [0,128,128,128,128,128,5,12,0,128,128,128,128,128,5,12]
9853; AVX-NEXT:    # xmm5 = mem[0,0]
9854; AVX-NEXT:    vpshufb %xmm5, %xmm12, %xmm6
9855; AVX-NEXT:    vmovdqa %xmm12, %xmm11
9856; AVX-NEXT:    vpor %xmm6, %xmm2, %xmm6
9857; AVX-NEXT:    vmovd {{.*#+}} xmm8 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
9858; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9859; AVX-NEXT:    vpshufb %xmm8, %xmm2, %xmm7
9860; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm6, %ymm6
9861; AVX-NEXT:    vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
9862; AVX-NEXT:    vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 32-byte Folded Reload
9863; AVX-NEXT:    vandnps %ymm6, %ymm12, %ymm6
9864; AVX-NEXT:    vorps %ymm6, %ymm7, %ymm6
9865; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 16-byte Folded Reload
9866; AVX-NEXT:    vandnps %ymm7, %ymm15, %ymm7
9867; AVX-NEXT:    vandps %ymm6, %ymm15, %ymm6
9868; AVX-NEXT:    vorps %ymm7, %ymm6, %ymm6
9869; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9870; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm3
9871; AVX-NEXT:    vpshufb %xmm4, %xmm0, %xmm4
9872; AVX-NEXT:    vpor %xmm3, %xmm4, %xmm3
9873; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],mem[7]
9874; AVX-NEXT:    vpshufb %xmm5, %xmm13, %xmm4
9875; AVX-NEXT:    vpor %xmm4, %xmm3, %xmm3
9876; AVX-NEXT:    vpshufb %xmm8, %xmm14, %xmm4
9877; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
9878; AVX-NEXT:    vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm4 # 32-byte Folded Reload
9879; AVX-NEXT:    vandnps %ymm3, %ymm12, %ymm3
9880; AVX-NEXT:    vorps %ymm3, %ymm4, %ymm3
9881; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload
9882; AVX-NEXT:    vandnps %ymm4, %ymm15, %ymm4
9883; AVX-NEXT:    vandps %ymm3, %ymm15, %ymm3
9884; AVX-NEXT:    vorps %ymm4, %ymm3, %ymm0
9885; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9886; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
9887; AVX-NEXT:    vmovd {{.*#+}} xmm14 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
9888; AVX-NEXT:    vpshufb %xmm14, %xmm8, %xmm3
9889; AVX-NEXT:    vmovd {{.*#+}} xmm0 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
9890; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
9891; AVX-NEXT:    vpshufb %xmm0, %xmm13, %xmm4
9892; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
9893; AVX-NEXT:    vmovddup {{.*#+}} xmm4 = [12,0,0,0,128,128,128,5,12,0,0,0,128,128,128,5]
9894; AVX-NEXT:    # xmm4 = mem[0,0]
9895; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
9896; AVX-NEXT:    vpshufb %xmm4, %xmm12, %xmm5
9897; AVX-NEXT:    vmovddup {{.*#+}} xmm6 = [128,0,0,0,0,7,14,128,128,0,0,0,0,7,14,128]
9898; AVX-NEXT:    # xmm6 = mem[0,0]
9899; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9900; AVX-NEXT:    vpshufb %xmm6, %xmm1, %xmm7
9901; AVX-NEXT:    vpor %xmm5, %xmm7, %xmm5
9902; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3,4,5,6,7]
9903; AVX-NEXT:    vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm9[u,u]
9904; AVX-NEXT:    vmovddup {{.*#+}} xmm5 = [0,128,128,1,8,15,0,0,0,128,128,1,8,15,0,0]
9905; AVX-NEXT:    # xmm5 = mem[0,0]
9906; AVX-NEXT:    vpshufb %xmm5, %xmm10, %xmm9
9907; AVX-NEXT:    vpor %xmm7, %xmm9, %xmm7
9908; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6],mem[7]
9909; AVX-NEXT:    vmovddup {{.*#+}} xmm9 = [0,128,128,128,128,128,6,13,0,128,128,128,128,128,6,13]
9910; AVX-NEXT:    # xmm9 = mem[0,0]
9911; AVX-NEXT:    vpshufb %xmm9, %xmm11, %xmm10
9912; AVX-NEXT:    vpor %xmm7, %xmm10, %xmm7
9913; AVX-NEXT:    vmovdqa %xmm0, %xmm11
9914; AVX-NEXT:    vpshufb %xmm0, %xmm2, %xmm10
9915; AVX-NEXT:    vinsertf128 $1, %xmm10, %ymm7, %ymm7
9916; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
9917; AVX-NEXT:    vandps %ymm0, %ymm3, %ymm3
9918; AVX-NEXT:    vandnps %ymm7, %ymm0, %ymm7
9919; AVX-NEXT:    vorps %ymm7, %ymm3, %ymm3
9920; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 16-byte Folded Reload
9921; AVX-NEXT:    vandnps %ymm7, %ymm15, %ymm7
9922; AVX-NEXT:    vandps %ymm3, %ymm15, %ymm3
9923; AVX-NEXT:    vorps %ymm7, %ymm3, %ymm0
9924; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9925; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
9926; AVX-NEXT:    vpshufb %xmm14, %xmm10, %xmm3
9927; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
9928; AVX-NEXT:    vpshufb %xmm11, %xmm14, %xmm7
9929; AVX-NEXT:    vmovdqa %xmm11, %xmm2
9930; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
9931; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
9932; AVX-NEXT:    vpshufb %xmm4, %xmm11, %xmm4
9933; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
9934; AVX-NEXT:    vpshufb %xmm6, %xmm7, %xmm6
9935; AVX-NEXT:    vpor %xmm4, %xmm6, %xmm4
9936; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3,4,5,6,7]
9937; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9938; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm0[u,u]
9939; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9940; AVX-NEXT:    vpshufb %xmm5, %xmm0, %xmm5
9941; AVX-NEXT:    vpor %xmm4, %xmm5, %xmm4
9942; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],mem[7]
9943; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
9944; AVX-NEXT:    vpshufb %xmm9, %xmm5, %xmm5
9945; AVX-NEXT:    vpor %xmm5, %xmm4, %xmm4
9946; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
9947; AVX-NEXT:    vpshufb %xmm2, %xmm5, %xmm5
9948; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm4, %ymm4
9949; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
9950; AVX-NEXT:    vandps %ymm2, %ymm3, %ymm3
9951; AVX-NEXT:    vandnps %ymm4, %ymm2, %ymm4
9952; AVX-NEXT:    vorps %ymm4, %ymm3, %ymm3
9953; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload
9954; AVX-NEXT:    vandnps %ymm4, %ymm15, %ymm4
9955; AVX-NEXT:    vandps %ymm3, %ymm15, %ymm3
9956; AVX-NEXT:    vorps %ymm4, %ymm3, %ymm3
9957; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9958; AVX-NEXT:    vmovd {{.*#+}} xmm2 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
9959; AVX-NEXT:    vpshufb %xmm2, %xmm8, %xmm3
9960; AVX-NEXT:    vmovd {{.*#+}} xmm8 = [5,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
9961; AVX-NEXT:    vpshufb %xmm8, %xmm13, %xmm4
9962; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
9963; AVX-NEXT:    vmovddup {{.*#+}} xmm4 = [13,0,0,0,128,128,128,6,13,0,0,0,128,128,128,6]
9964; AVX-NEXT:    # xmm4 = mem[0,0]
9965; AVX-NEXT:    vpshufb %xmm4, %xmm12, %xmm5
9966; AVX-NEXT:    vmovddup {{.*#+}} xmm6 = [128,0,0,0,1,8,15,128,128,0,0,0,1,8,15,128]
9967; AVX-NEXT:    # xmm6 = mem[0,0]
9968; AVX-NEXT:    vpshufb %xmm6, %xmm1, %xmm7
9969; AVX-NEXT:    vpor %xmm5, %xmm7, %xmm5
9970; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm3[0,1],xmm5[2,3,4,5,6,7]
9971; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9972; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,u,u],zero,zero,xmm1[2,9,u,u,u]
9973; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
9974; AVX-NEXT:    vpshufb {{.*#+}} xmm9 = xmm12[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm12[u,u,u]
9975; AVX-NEXT:    vpor %xmm3, %xmm9, %xmm9
9976; AVX-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,9,10,11,12],zero,zero,zero
9977; AVX-NEXT:    vmovddup {{.*#+}} xmm7 = [0,128,128,128,128,0,7,14,0,128,128,128,128,0,7,14]
9978; AVX-NEXT:    # xmm7 = mem[0,0]
9979; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9980; AVX-NEXT:    vpshufb %xmm7, %xmm1, %xmm13
9981; AVX-NEXT:    vpor %xmm13, %xmm9, %xmm9
9982; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9983; AVX-NEXT:    vpshufb %xmm8, %xmm3, %xmm13
9984; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm9, %ymm9
9985; AVX-NEXT:    vmovaps {{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
9986; AVX-NEXT:    vandps %ymm5, %ymm13, %ymm5
9987; AVX-NEXT:    vandnps %ymm9, %ymm13, %ymm9
9988; AVX-NEXT:    vorps %ymm5, %ymm9, %ymm5
9989; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 16-byte Folded Reload
9990; AVX-NEXT:    vandnps %ymm9, %ymm15, %ymm9
9991; AVX-NEXT:    vandps %ymm5, %ymm15, %ymm5
9992; AVX-NEXT:    vorps %ymm5, %ymm9, %ymm1
9993; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9994; AVX-NEXT:    vpshufb %xmm2, %xmm10, %xmm2
9995; AVX-NEXT:    vpshufb %xmm8, %xmm14, %xmm9
9996; AVX-NEXT:    vmovdqa %xmm8, %xmm14
9997; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3]
9998; AVX-NEXT:    vpshufb %xmm4, %xmm11, %xmm4
9999; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
10000; AVX-NEXT:    vpshufb %xmm6, %xmm8, %xmm6
10001; AVX-NEXT:    vpor %xmm4, %xmm6, %xmm4
10002; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3,4,5,6,7]
10003; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9,u,u,u]
10004; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
10005; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm10[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm10[u,u,u]
10006; AVX-NEXT:    vpor %xmm4, %xmm6, %xmm4
10007; AVX-NEXT:    vmovddup {{.*#+}} xmm11 = [0,9,10,11,12,128,128,128,0,9,10,11,12,128,128,128]
10008; AVX-NEXT:    # xmm11 = mem[0,0]
10009; AVX-NEXT:    vpshufb %xmm11, %xmm4, %xmm4
10010; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10011; AVX-NEXT:    vpshufb %xmm7, %xmm1, %xmm6
10012; AVX-NEXT:    vpor %xmm6, %xmm4, %xmm4
10013; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10014; AVX-NEXT:    vpshufb %xmm14, %xmm5, %xmm6
10015; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm4, %ymm4
10016; AVX-NEXT:    vandps %ymm2, %ymm13, %ymm2
10017; AVX-NEXT:    vandnps %ymm4, %ymm13, %ymm4
10018; AVX-NEXT:    vorps %ymm4, %ymm2, %ymm2
10019; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload
10020; AVX-NEXT:    vandnps %ymm4, %ymm15, %ymm4
10021; AVX-NEXT:    vandps %ymm2, %ymm15, %ymm2
10022; AVX-NEXT:    vorps %ymm4, %ymm2, %ymm0
10023; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10024; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10025; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
10026; AVX-NEXT:    vmovd {{.*#+}} xmm2 = [6,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
10027; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10028; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm6
10029; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
10030; AVX-NEXT:    vmovddup {{.*#+}} xmm6 = [128,0,0,0,2,9,128,128,128,0,0,0,2,9,128,128]
10031; AVX-NEXT:    # xmm6 = mem[0,0]
10032; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10033; AVX-NEXT:    vpshufb %xmm6, %xmm0, %xmm7
10034; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = [14,0,0,0,128,128,0,7,14,0,0,0,128,128,0,7]
10035; AVX-NEXT:    # xmm0 = mem[0,0]
10036; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
10037; AVX-NEXT:    vpshufb %xmm0, %xmm9, %xmm9
10038; AVX-NEXT:    vpor %xmm7, %xmm9, %xmm7
10039; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,3,4,5,6,7]
10040; AVX-NEXT:    vbroadcastss {{.*#+}} xmm7 = [10,128,128,3,10,128,128,3,10,128,128,3,10,128,128,3]
10041; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
10042; AVX-NEXT:    vpshufb %xmm7, %xmm9, %xmm9
10043; AVX-NEXT:    vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm12[u,u,u]
10044; AVX-NEXT:    vpor %xmm9, %xmm13, %xmm9
10045; AVX-NEXT:    vpshufb %xmm11, %xmm9, %xmm9
10046; AVX-NEXT:    vmovddup {{.*#+}} xmm13 = [0,128,128,128,128,1,8,15,0,128,128,128,128,1,8,15]
10047; AVX-NEXT:    # xmm13 = mem[0,0]
10048; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
10049; AVX-NEXT:    vpshufb %xmm13, %xmm11, %xmm14
10050; AVX-NEXT:    vpor %xmm14, %xmm9, %xmm9
10051; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm14
10052; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm9, %ymm9
10053; AVX-NEXT:    vmovaps {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
10054; AVX-NEXT:    vandps %ymm4, %ymm11, %ymm4
10055; AVX-NEXT:    vandnps %ymm9, %ymm11, %ymm9
10056; AVX-NEXT:    vorps %ymm4, %ymm9, %ymm4
10057; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 16-byte Folded Reload
10058; AVX-NEXT:    vandnps %ymm9, %ymm15, %ymm9
10059; AVX-NEXT:    vandps %ymm4, %ymm15, %ymm4
10060; AVX-NEXT:    vorps %ymm4, %ymm9, %ymm4
10061; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
10062; AVX-NEXT:    vpshufb {{.*#+}} xmm9 = xmm3[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
10063; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
10064; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm12
10065; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
10066; AVX-NEXT:    vpshufb %xmm6, %xmm8, %xmm6
10067; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
10068; AVX-NEXT:    vpshufb %xmm0, %xmm3, %xmm8
10069; AVX-NEXT:    vpor %xmm6, %xmm8, %xmm6
10070; AVX-NEXT:    vpblendw {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3,4,5,6,7]
10071; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10072; AVX-NEXT:    vpshufb %xmm7, %xmm0, %xmm7
10073; AVX-NEXT:    vpshufb {{.*#+}} xmm8 = xmm10[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm10[u,u,u]
10074; AVX-NEXT:    vpor %xmm7, %xmm8, %xmm7
10075; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,u,u,u,9,10,11,12],zero,zero,zero
10076; AVX-NEXT:    vpshufb %xmm13, %xmm1, %xmm7
10077; AVX-NEXT:    vpor %xmm7, %xmm3, %xmm3
10078; AVX-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
10079; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
10080; AVX-NEXT:    vandps %ymm6, %ymm11, %ymm3
10081; AVX-NEXT:    vandnps %ymm2, %ymm11, %ymm1
10082; AVX-NEXT:    vorps %ymm1, %ymm3, %ymm1
10083; AVX-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm2 # 16-byte Folded Reload
10084; AVX-NEXT:    vandnps %ymm2, %ymm15, %ymm2
10085; AVX-NEXT:    vandps %ymm1, %ymm15, %ymm0
10086; AVX-NEXT:    vorps %ymm2, %ymm0, %ymm0
10087; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10088; AVX-NEXT:    vmovaps %ymm1, 32(%rsi)
10089; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10090; AVX-NEXT:    vmovaps %ymm1, (%rsi)
10091; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10092; AVX-NEXT:    vmovaps %ymm1, 32(%rdx)
10093; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10094; AVX-NEXT:    vmovaps %ymm1, (%rdx)
10095; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10096; AVX-NEXT:    vmovaps %ymm1, 32(%rcx)
10097; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10098; AVX-NEXT:    vmovaps %ymm1, (%rcx)
10099; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10100; AVX-NEXT:    vmovaps %ymm1, 32(%r8)
10101; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10102; AVX-NEXT:    vmovaps %ymm1, (%r8)
10103; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10104; AVX-NEXT:    vmovaps %ymm1, 32(%r9)
10105; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10106; AVX-NEXT:    vmovaps %ymm1, (%r9)
10107; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
10108; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10109; AVX-NEXT:    vmovaps %ymm1, 32(%rax)
10110; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10111; AVX-NEXT:    vmovaps %ymm1, (%rax)
10112; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
10113; AVX-NEXT:    vmovaps %ymm0, 32(%rax)
10114; AVX-NEXT:    vmovaps %ymm4, (%rax)
10115; AVX-NEXT:    addq $744, %rsp # imm = 0x2E8
10116; AVX-NEXT:    vzeroupper
10117; AVX-NEXT:    retq
10118;
10119; AVX2-LABEL: load_i8_stride7_vf64:
10120; AVX2:       # %bb.0:
10121; AVX2-NEXT:    subq $760, %rsp # imm = 0x2F8
10122; AVX2-NEXT:    vmovdqa 320(%rdi), %ymm6
10123; AVX2-NEXT:    vmovdqa 224(%rdi), %ymm7
10124; AVX2-NEXT:    vmovdqa 256(%rdi), %ymm8
10125; AVX2-NEXT:    vmovdqa (%rdi), %ymm12
10126; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm10
10127; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm11
10128; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm5
10129; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm13 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
10130; AVX2-NEXT:    vpblendvb %ymm13, %ymm12, %ymm10, %ymm0
10131; AVX2-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10132; AVX2-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10133; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
10134; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,5,12,128,128,1,8,15,u,u,u,u,u,u]
10135; AVX2-NEXT:    vpshufb %xmm1, %xmm2, %xmm3
10136; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,7,14,128,128,3,10,128,128,128,u,u,u,u,u,u]
10137; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
10138; AVX2-NEXT:    vpor %xmm3, %xmm0, %xmm0
10139; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
10140; AVX2-NEXT:    vpblendvb %ymm14, %ymm5, %ymm11, %ymm3
10141; AVX2-NEXT:    vmovdqa %ymm5, %ymm9
10142; AVX2-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10143; AVX2-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10144; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4
10145; AVX2-NEXT:    vpblendw {{.*#+}} ymm4 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
10146; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9,0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9]
10147; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
10148; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm5
10149; AVX2-NEXT:    vpmovsxbw {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,0,0]
10150; AVX2-NEXT:    vpblendvb %ymm4, %ymm0, %ymm5, %ymm0
10151; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10152; AVX2-NEXT:    vpblendvb %ymm13, %ymm7, %ymm8, %ymm5
10153; AVX2-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10154; AVX2-NEXT:    vmovdqa %ymm7, %ymm0
10155; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10156; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm7
10157; AVX2-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
10158; AVX2-NEXT:    vmovdqa 288(%rdi), %ymm15
10159; AVX2-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
10160; AVX2-NEXT:    vpor %xmm1, %xmm2, %xmm1
10161; AVX2-NEXT:    vpblendvb %ymm14, %ymm6, %ymm15, %ymm2
10162; AVX2-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10163; AVX2-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10164; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm5
10165; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15]
10166; AVX2-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
10167; AVX2-NEXT:    vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
10168; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10169; AVX2-NEXT:    vpblendvb %ymm13, %ymm11, %ymm9, %ymm2
10170; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
10171; AVX2-NEXT:    vpblendw {{.*#+}} ymm4 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7,8,9],ymm3[10],ymm2[11,12,13],ymm3[14],ymm2[15]
10172; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
10173; AVX2-NEXT:    vpblendvb %ymm1, %ymm12, %ymm10, %ymm5
10174; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm3
10175; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [128,128,128,6,13,128,128,2,9,u,u,u,u,u,u,u]
10176; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm7
10177; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,8,15,128,128,4,11,128,128,u,u,u,u,u,u,u]
10178; AVX2-NEXT:    vpshufb %xmm3, %xmm5, %xmm5
10179; AVX2-NEXT:    vpor %xmm7, %xmm5, %xmm5
10180; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10,1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10]
10181; AVX2-NEXT:    # ymm7 = mem[0,1,0,1]
10182; AVX2-NEXT:    vpshufb %ymm7, %ymm4, %ymm4
10183; AVX2-NEXT:    vpmovsxdq {{.*#+}} ymm9 = [0,18446744073709551360,16777215,0]
10184; AVX2-NEXT:    vpblendvb %ymm9, %ymm4, %ymm5, %ymm4
10185; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10186; AVX2-NEXT:    vpblendvb %ymm13, %ymm15, %ymm6, %ymm4
10187; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm5
10188; AVX2-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7,8,9],ymm5[10],ymm4[11,12,13],ymm5[14],ymm4[15]
10189; AVX2-NEXT:    vpshufb %ymm7, %ymm4, %ymm4
10190; AVX2-NEXT:    vpblendvb %ymm1, %ymm0, %ymm8, %ymm5
10191; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm7
10192; AVX2-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
10193; AVX2-NEXT:    vpshufb %xmm3, %xmm5, %xmm3
10194; AVX2-NEXT:    vpor %xmm2, %xmm3, %xmm2
10195; AVX2-NEXT:    vpblendvb %ymm9, %ymm4, %ymm2, %ymm0
10196; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10197; AVX2-NEXT:    vmovdqa 160(%rdi), %ymm9
10198; AVX2-NEXT:    vmovdqa 128(%rdi), %ymm15
10199; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm4 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
10200; AVX2-NEXT:    vpblendvb %ymm4, %ymm9, %ymm15, %ymm2
10201; AVX2-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10202; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
10203; AVX2-NEXT:    vmovdqa {{.*#+}} xmm8 = [u,u,u,128,128,3,10,128,128,128,6,13,u,u,u,u]
10204; AVX2-NEXT:    vpshufb %xmm8, %xmm3, %xmm3
10205; AVX2-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,u,u,5,12,128,128,1,8,15,128,128,u,u,u,u]
10206; AVX2-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
10207; AVX2-NEXT:    vpor %xmm3, %xmm2, %xmm2
10208; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm12 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9]
10209; AVX2-NEXT:    vmovdqa 208(%rdi), %xmm5
10210; AVX2-NEXT:    vpshufb %xmm12, %xmm5, %xmm3
10211; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm11 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
10212; AVX2-NEXT:    vmovdqa 192(%rdi), %xmm1
10213; AVX2-NEXT:    vpshufb %xmm11, %xmm1, %xmm10
10214; AVX2-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10215; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3]
10216; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
10217; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
10218; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
10219; AVX2-NEXT:    vpmovsxdq {{.*#+}} ymm14 = [18446744073709551615,18446744073709551615,16777215,0]
10220; AVX2-NEXT:    vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload
10221; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10222; AVX2-NEXT:    vmovdqa 384(%rdi), %ymm2
10223; AVX2-NEXT:    vmovdqa 352(%rdi), %ymm3
10224; AVX2-NEXT:    vpblendvb %ymm4, %ymm2, %ymm3, %ymm0
10225; AVX2-NEXT:    vmovdqa %ymm2, %ymm4
10226; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm6
10227; AVX2-NEXT:    vpshufb %xmm8, %xmm6, %xmm6
10228; AVX2-NEXT:    vpshufb %xmm7, %xmm0, %xmm0
10229; AVX2-NEXT:    vpor %xmm6, %xmm0, %xmm0
10230; AVX2-NEXT:    vmovdqa 432(%rdi), %xmm13
10231; AVX2-NEXT:    vpshufb %xmm12, %xmm13, %xmm6
10232; AVX2-NEXT:    vmovdqa 416(%rdi), %xmm2
10233; AVX2-NEXT:    vpshufb %xmm11, %xmm2, %xmm8
10234; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
10235; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
10236; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
10237; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7]
10238; AVX2-NEXT:    vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
10239; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10240; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
10241; AVX2-NEXT:    vpblendvb %ymm7, %ymm9, %ymm15, %ymm0
10242; AVX2-NEXT:    vmovdqa {{.*#+}} xmm10 = [u,u,u,6,13,128,128,2,9,128,128,128,u,u,u,u]
10243; AVX2-NEXT:    vpshufb %xmm10, %xmm0, %xmm6
10244; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
10245; AVX2-NEXT:    vmovdqa {{.*#+}} xmm8 = [u,u,u,128,128,4,11,128,128,0,7,14,u,u,u,u]
10246; AVX2-NEXT:    vpshufb %xmm8, %xmm0, %xmm0
10247; AVX2-NEXT:    vpor %xmm6, %xmm0, %xmm0
10248; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10]
10249; AVX2-NEXT:    vpshufb %xmm6, %xmm5, %xmm9
10250; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm12 = [5,12,5,12,5,12,5,12,5,12,5,12,5,12,5,12]
10251; AVX2-NEXT:    vpshufb %xmm12, %xmm1, %xmm11
10252; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
10253; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
10254; AVX2-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
10255; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm9[7]
10256; AVX2-NEXT:    vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
10257; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10258; AVX2-NEXT:    vpblendvb %ymm7, %ymm4, %ymm3, %ymm0
10259; AVX2-NEXT:    vmovdqa %ymm3, %ymm11
10260; AVX2-NEXT:    vpshufb %xmm10, %xmm0, %xmm1
10261; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
10262; AVX2-NEXT:    vpshufb %xmm8, %xmm0, %xmm0
10263; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
10264; AVX2-NEXT:    vpshufb %xmm6, %xmm13, %xmm1
10265; AVX2-NEXT:    vpshufb %xmm12, %xmm2, %xmm6
10266; AVX2-NEXT:    vmovdqa %xmm2, %xmm12
10267; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3]
10268; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
10269; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
10270; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
10271; AVX2-NEXT:    vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
10272; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10273; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
10274; AVX2-NEXT:    vpblendvb %ymm2, %ymm3, %ymm4, %ymm0
10275; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10276; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
10277; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = [u,u,128,128,128,6,13,128,128,2,9,u,u,u,u,u]
10278; AVX2-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
10279; AVX2-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,u,1,8,15,128,128,4,11,128,128,u,u,u,u,u]
10280; AVX2-NEXT:    vpshufb %xmm7, %xmm0, %xmm0
10281; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
10282; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12]
10283; AVX2-NEXT:    vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10284; AVX2-NEXT:    vpshufb %xmm1, %xmm13, %xmm8
10285; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm9 = [0,0,0,0,7,14,128,128,0,0,0,0,7,14,128,128]
10286; AVX2-NEXT:    vpshufb %xmm9, %xmm12, %xmm10
10287; AVX2-NEXT:    vmovdqa %xmm12, %xmm3
10288; AVX2-NEXT:    vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10289; AVX2-NEXT:    vpor %xmm8, %xmm10, %xmm8
10290; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm10
10291; AVX2-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
10292; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
10293; AVX2-NEXT:    # ymm0 = mem[0,1,0,1]
10294; AVX2-NEXT:    vpblendvb %ymm0, %ymm10, %ymm8, %ymm8
10295; AVX2-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10296; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
10297; AVX2-NEXT:    vpblendvb %ymm2, %ymm15, %ymm12, %ymm8
10298; AVX2-NEXT:    vextracti128 $1, %ymm8, %xmm10
10299; AVX2-NEXT:    vpshufb %xmm6, %xmm10, %xmm6
10300; AVX2-NEXT:    vpshufb %xmm7, %xmm8, %xmm7
10301; AVX2-NEXT:    vpor %xmm6, %xmm7, %xmm6
10302; AVX2-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
10303; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
10304; AVX2-NEXT:    vpshufb %xmm9, %xmm14, %xmm7
10305; AVX2-NEXT:    vpor %xmm1, %xmm7, %xmm1
10306; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
10307; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
10308; AVX2-NEXT:    vpblendvb %ymm0, %ymm6, %ymm1, %ymm1
10309; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10310; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
10311; AVX2-NEXT:    vpblendvb %ymm2, %ymm11, %ymm4, %ymm1
10312; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = [u,u,2,9,128,128,128,5,12,128,128,u,u,u,u,u]
10313; AVX2-NEXT:    vpshufb %xmm6, %xmm1, %xmm7
10314; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
10315; AVX2-NEXT:    vmovdqa {{.*#+}} xmm8 = [u,u,128,128,0,7,14,128,128,3,10,u,u,u,u,u]
10316; AVX2-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
10317; AVX2-NEXT:    vpor %xmm7, %xmm1, %xmm1
10318; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm7 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13]
10319; AVX2-NEXT:    vpshufb %xmm7, %xmm13, %xmm9
10320; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm10 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128]
10321; AVX2-NEXT:    vpshufb %xmm10, %xmm3, %xmm11
10322; AVX2-NEXT:    vpor %xmm9, %xmm11, %xmm9
10323; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
10324; AVX2-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
10325; AVX2-NEXT:    vpblendvb %ymm0, %ymm1, %ymm9, %ymm1
10326; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10327; AVX2-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10328; AVX2-NEXT:    vpblendvb %ymm2, %ymm15, %ymm12, %ymm1
10329; AVX2-NEXT:    vmovdqa %ymm12, %ymm2
10330; AVX2-NEXT:    vpshufb %xmm6, %xmm1, %xmm6
10331; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
10332; AVX2-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
10333; AVX2-NEXT:    vpor %xmm6, %xmm1, %xmm1
10334; AVX2-NEXT:    vpshufb %xmm7, %xmm5, %xmm6
10335; AVX2-NEXT:    vmovdqa %xmm5, %xmm13
10336; AVX2-NEXT:    vpshufb %xmm10, %xmm14, %xmm7
10337; AVX2-NEXT:    vpor %xmm6, %xmm7, %xmm6
10338; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
10339; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
10340; AVX2-NEXT:    vpblendvb %ymm0, %ymm1, %ymm6, %ymm1
10341; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10342; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535]
10343; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10344; AVX2-NEXT:    vpblendvb %ymm6, %ymm3, %ymm4, %ymm1
10345; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = [u,u,3,10,128,128,128,6,13,128,128,u,u,u,u,u]
10346; AVX2-NEXT:    vpshufb %xmm5, %xmm1, %xmm7
10347; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
10348; AVX2-NEXT:    vmovdqa {{.*#+}} xmm8 = [u,u,128,128,1,8,15,128,128,4,11,u,u,u,u,u]
10349; AVX2-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
10350; AVX2-NEXT:    vpor %xmm7, %xmm1, %xmm1
10351; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm7 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128]
10352; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
10353; AVX2-NEXT:    vpshufb %xmm7, %xmm12, %xmm9
10354; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm10 = [0,0,0,128,128,0,7,14,0,0,0,128,128,0,7,14]
10355; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
10356; AVX2-NEXT:    vpshufb %xmm10, %xmm14, %xmm11
10357; AVX2-NEXT:    vpor %xmm9, %xmm11, %xmm9
10358; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
10359; AVX2-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
10360; AVX2-NEXT:    vpblendvb %ymm0, %ymm1, %ymm9, %ymm1
10361; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10362; AVX2-NEXT:    vpblendvb %ymm6, %ymm15, %ymm2, %ymm1
10363; AVX2-NEXT:    vpshufb %xmm5, %xmm1, %xmm6
10364; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
10365; AVX2-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
10366; AVX2-NEXT:    vpor %xmm6, %xmm1, %xmm1
10367; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
10368; AVX2-NEXT:    vpshufb %xmm7, %xmm11, %xmm6
10369; AVX2-NEXT:    vpshufb %xmm10, %xmm13, %xmm7
10370; AVX2-NEXT:    vmovdqa %xmm13, (%rsp) # 16-byte Spill
10371; AVX2-NEXT:    vpor %xmm6, %xmm7, %xmm6
10372; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
10373; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
10374; AVX2-NEXT:    vpblendvb %ymm0, %ymm1, %ymm6, %ymm1
10375; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10376; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm10 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
10377; AVX2-NEXT:    vpblendvb %ymm10, %ymm3, %ymm4, %ymm2
10378; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10379; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm15 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
10380; AVX2-NEXT:    vpblendvb %ymm15, %ymm4, %ymm3, %ymm1
10381; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
10382; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [u,u,128,128,2,9,128,128,128,5,12,u,u,u,u,u]
10383; AVX2-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
10384; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = [u,u,4,11,128,128,0,7,14,128,128,u,u,u,u,u]
10385; AVX2-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
10386; AVX2-NEXT:    vpor %xmm2, %xmm1, %xmm1
10387; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [0,0,0,3,10,128,128,128,0,0,0,3,10,128,128,128]
10388; AVX2-NEXT:    vpshufb %xmm2, %xmm12, %xmm7
10389; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm8 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15]
10390; AVX2-NEXT:    vpshufb %xmm8, %xmm14, %xmm9
10391; AVX2-NEXT:    vpor %xmm7, %xmm9, %xmm7
10392; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
10393; AVX2-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
10394; AVX2-NEXT:    vpblendvb %ymm0, %ymm1, %ymm7, %ymm1
10395; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10396; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10397; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10398; AVX2-NEXT:    vpblendvb %ymm10, %ymm4, %ymm1, %ymm5
10399; AVX2-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10400; AVX2-NEXT:    vpblendvb %ymm15, %ymm1, %ymm4, %ymm1
10401; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm4
10402; AVX2-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
10403; AVX2-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
10404; AVX2-NEXT:    vpor %xmm3, %xmm1, %xmm1
10405; AVX2-NEXT:    vpshufb %xmm2, %xmm11, %xmm2
10406; AVX2-NEXT:    vpshufb %xmm8, %xmm13, %xmm3
10407; AVX2-NEXT:    vpor %xmm2, %xmm3, %xmm2
10408; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
10409; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
10410; AVX2-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
10411; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10412; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10413; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
10414; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
10415; AVX2-NEXT:    vpblendvb %ymm11, %ymm0, %ymm2, %ymm1
10416; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm5 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535]
10417; AVX2-NEXT:    vpblendvb %ymm5, %ymm0, %ymm2, %ymm12
10418; AVX2-NEXT:    vpblendvb %ymm15, %ymm2, %ymm0, %ymm13
10419; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
10420; AVX2-NEXT:    vpblendvb %ymm7, %ymm2, %ymm0, %ymm3
10421; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10422; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
10423; AVX2-NEXT:    vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
10424; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10425; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10426; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
10427; AVX2-NEXT:    vpblendvb %ymm3, %ymm2, %ymm0, %ymm3
10428; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10429; AVX2-NEXT:    vpblendvb %ymm11, %ymm0, %ymm2, %ymm3
10430; AVX2-NEXT:    vpblendvb %ymm5, %ymm0, %ymm2, %ymm14
10431; AVX2-NEXT:    vpblendvb %ymm15, %ymm2, %ymm0, %ymm4
10432; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10433; AVX2-NEXT:    vpblendvb %ymm7, %ymm2, %ymm0, %ymm0
10434; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10435; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
10436; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
10437; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm10 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
10438; AVX2-NEXT:    vpblendvb %ymm10, %ymm6, %ymm2, %ymm0
10439; AVX2-NEXT:    vpblendvb %ymm11, %ymm6, %ymm2, %ymm4
10440; AVX2-NEXT:    vpblendvb %ymm5, %ymm6, %ymm2, %ymm9
10441; AVX2-NEXT:    vpblendvb %ymm15, %ymm2, %ymm6, %ymm8
10442; AVX2-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10443; AVX2-NEXT:    vpblendvb %ymm7, %ymm2, %ymm6, %ymm2
10444; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10445; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10446; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
10447; AVX2-NEXT:    vpblendvb %ymm10, %ymm7, %ymm8, %ymm2
10448; AVX2-NEXT:    vpblendvb %ymm11, %ymm7, %ymm8, %ymm6
10449; AVX2-NEXT:    vpblendvb %ymm5, %ymm7, %ymm8, %ymm10
10450; AVX2-NEXT:    vpblendvb %ymm15, %ymm8, %ymm7, %ymm5
10451; AVX2-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10452; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
10453; AVX2-NEXT:    vpblendvb %ymm5, %ymm8, %ymm7, %ymm11
10454; AVX2-NEXT:    vmovdqa {{.*#+}} xmm8 = [2,9,128,128,128,5,12,128,128,u,u,u,u,u,u,u]
10455; AVX2-NEXT:    vpshufb %xmm8, %xmm1, %xmm15
10456; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
10457; AVX2-NEXT:    vmovdqa {{.*#+}} xmm7 = [128,128,0,7,14,128,128,3,10,u,u,u,u,u,u,u]
10458; AVX2-NEXT:    vpshufb %xmm7, %xmm1, %xmm1
10459; AVX2-NEXT:    vpor %xmm1, %xmm15, %xmm1
10460; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm15
10461; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3],ymm0[4,5],ymm15[6],ymm0[7,8,9,10],ymm15[11],ymm0[12,13],ymm15[14],ymm0[15]
10462; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11,2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11]
10463; AVX2-NEXT:    # ymm5 = mem[0,1,0,1]
10464; AVX2-NEXT:    vpshufb %ymm5, %ymm0, %ymm0
10465; AVX2-NEXT:    vpmovsxwq {{.*#+}} xmm15 = [18446744073709551615,255]
10466; AVX2-NEXT:    vpblendvb %ymm15, %ymm1, %ymm0, %ymm0
10467; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10468; AVX2-NEXT:    vpshufb %xmm8, %xmm3, %xmm0
10469; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm1
10470; AVX2-NEXT:    vpshufb %xmm7, %xmm1, %xmm1
10471; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
10472; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm1
10473; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
10474; AVX2-NEXT:    vpshufb %ymm5, %ymm1, %ymm1
10475; AVX2-NEXT:    vpblendvb %ymm15, %ymm0, %ymm1, %ymm0
10476; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10477; AVX2-NEXT:    vmovdqa {{.*#+}} xmm0 = [3,10,128,128,128,6,13,128,128,u,u,u,u,u,u,u]
10478; AVX2-NEXT:    vpshufb %xmm0, %xmm12, %xmm1
10479; AVX2-NEXT:    vextracti128 $1, %ymm12, %xmm2
10480; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [128,128,1,8,15,128,128,4,11,u,u,u,u,u,u,u]
10481; AVX2-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
10482; AVX2-NEXT:    vpor %xmm1, %xmm2, %xmm1
10483; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm2
10484; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5,6],ymm2[7,8],ymm4[9,10],ymm2[11],ymm4[12,13,14],ymm2[15]
10485; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12,3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12]
10486; AVX2-NEXT:    # ymm4 = mem[0,1,0,1]
10487; AVX2-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
10488; AVX2-NEXT:    vpblendvb %ymm15, %ymm1, %ymm2, %ymm12
10489; AVX2-NEXT:    vpshufb %xmm0, %xmm14, %xmm0
10490; AVX2-NEXT:    vextracti128 $1, %ymm14, %xmm1
10491; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
10492; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
10493; AVX2-NEXT:    vextracti128 $1, %ymm6, %xmm1
10494; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3],ymm6[4,5,6],ymm1[7,8],ymm6[9,10],ymm1[11],ymm6[12,13,14],ymm1[15]
10495; AVX2-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
10496; AVX2-NEXT:    vpblendvb %ymm15, %ymm0, %ymm1, %ymm14
10497; AVX2-NEXT:    vextracti128 $1, %ymm13, %xmm0
10498; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,2,9,128,128,128,5,12,u,u,u,u,u,u,u]
10499; AVX2-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
10500; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,11,128,128,0,7,14,128,128,u,u,u,u,u,u,u]
10501; AVX2-NEXT:    vpshufb %xmm2, %xmm13, %xmm3
10502; AVX2-NEXT:    vpor %xmm0, %xmm3, %xmm0
10503; AVX2-NEXT:    vextracti128 $1, %ymm9, %xmm3
10504; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm9[1,2,3],ymm3[4],ymm9[5,6],ymm3[7,8],ymm9[9,10,11],ymm3[12],ymm9[13,14],ymm3[15]
10505; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13,4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13]
10506; AVX2-NEXT:    # ymm4 = mem[0,1,0,1]
10507; AVX2-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
10508; AVX2-NEXT:    vpblendvb %ymm15, %ymm0, %ymm3, %ymm6
10509; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10510; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm0
10511; AVX2-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
10512; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm1
10513; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
10514; AVX2-NEXT:    vextracti128 $1, %ymm10, %xmm1
10515; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm10[1,2,3],ymm1[4],ymm10[5,6],ymm1[7,8],ymm10[9,10,11],ymm1[12],ymm10[13,14],ymm1[15]
10516; AVX2-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
10517; AVX2-NEXT:    vpblendvb %ymm15, %ymm0, %ymm1, %ymm0
10518; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
10519; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm1
10520; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [128,128,3,10,128,128,128,6,13,u,u,u,u,u,u,u]
10521; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
10522; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [5,12,128,128,1,8,15,128,128,u,u,u,u,u,u,u]
10523; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
10524; AVX2-NEXT:    vpor %xmm1, %xmm2, %xmm1
10525; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
10526; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm2
10527; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3],ymm2[4],ymm5[5,6,7,8],ymm2[9],ymm5[10,11],ymm2[12],ymm5[13,14,15]
10528; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14,5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14]
10529; AVX2-NEXT:    # ymm5 = mem[0,1,0,1]
10530; AVX2-NEXT:    vpshufb %ymm5, %ymm2, %ymm2
10531; AVX2-NEXT:    vpblendvb %ymm15, %ymm1, %ymm2, %ymm2
10532; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10533; AVX2-NEXT:    vextracti128 $1, %ymm7, %xmm1
10534; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
10535; AVX2-NEXT:    vpshufb %xmm4, %xmm7, %xmm3
10536; AVX2-NEXT:    vpor %xmm1, %xmm3, %xmm1
10537; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10538; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm3
10539; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15]
10540; AVX2-NEXT:    vpshufb %ymm5, %ymm3, %ymm3
10541; AVX2-NEXT:    vpblendvb %ymm15, %ymm1, %ymm3, %ymm1
10542; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [6,13,128,128,2,9,128,128,128,u,u,u,u,u,u,u]
10543; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
10544; AVX2-NEXT:    vpshufb %xmm4, %xmm5, %xmm3
10545; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm5
10546; AVX2-NEXT:    vmovdqa {{.*#+}} xmm7 = [128,128,4,11,128,128,0,7,14,u,u,u,u,u,u,u]
10547; AVX2-NEXT:    vpshufb %xmm7, %xmm5, %xmm5
10548; AVX2-NEXT:    vpor %xmm3, %xmm5, %xmm3
10549; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
10550; AVX2-NEXT:    vextracti128 $1, %ymm8, %xmm5
10551; AVX2-NEXT:    vpblendw {{.*#+}} ymm5 = ymm8[0],ymm5[1],ymm8[2,3,4],ymm5[5],ymm8[6,7,8],ymm5[9],ymm8[10,11,12],ymm5[13],ymm8[14,15]
10552; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15,6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15]
10553; AVX2-NEXT:    # ymm8 = mem[0,1,0,1]
10554; AVX2-NEXT:    vpshufb %ymm8, %ymm5, %ymm5
10555; AVX2-NEXT:    vpblendvb %ymm15, %ymm3, %ymm5, %ymm3
10556; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
10557; AVX2-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
10558; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm5
10559; AVX2-NEXT:    vpshufb %xmm7, %xmm5, %xmm5
10560; AVX2-NEXT:    vpor %xmm4, %xmm5, %xmm4
10561; AVX2-NEXT:    vextracti128 $1, %ymm11, %xmm5
10562; AVX2-NEXT:    vpblendw {{.*#+}} ymm5 = ymm11[0],ymm5[1],ymm11[2,3,4],ymm5[5],ymm11[6,7,8],ymm5[9],ymm11[10,11,12],ymm5[13],ymm11[14,15]
10563; AVX2-NEXT:    vpshufb %ymm8, %ymm5, %ymm5
10564; AVX2-NEXT:    vpblendvb %ymm15, %ymm4, %ymm5, %ymm4
10565; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
10566; AVX2-NEXT:    vextracti128 $1, %ymm9, %xmm5
10567; AVX2-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,u,128,128,128,5,12,128,128,1,8,15,u,u,u,u]
10568; AVX2-NEXT:    vpshufb %xmm7, %xmm5, %xmm5
10569; AVX2-NEXT:    vmovdqa {{.*#+}} xmm8 = [u,u,0,7,14,128,128,3,10,128,128,128,u,u,u,u]
10570; AVX2-NEXT:    vpshufb %xmm8, %xmm9, %xmm9
10571; AVX2-NEXT:    vpor %xmm5, %xmm9, %xmm5
10572; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm13 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
10573; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
10574; AVX2-NEXT:    vpshufb %xmm13, %xmm9, %xmm9
10575; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm10 = [6,13,6,13,6,13,6,13,6,13,6,13,6,13,6,13]
10576; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
10577; AVX2-NEXT:    vpshufb %xmm10, %xmm11, %xmm11
10578; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
10579; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
10580; AVX2-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
10581; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm9[7]
10582; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
10583; AVX2-NEXT:    vpblendw {{.*#+}} ymm5 = ymm9[0],ymm5[1,2,3,4,5,6,7],ymm9[8],ymm5[9,10,11,12,13,14,15]
10584; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7]
10585; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
10586; AVX2-NEXT:    vextracti128 $1, %ymm11, %xmm9
10587; AVX2-NEXT:    vpshufb %xmm7, %xmm9, %xmm7
10588; AVX2-NEXT:    vpshufb %xmm8, %xmm11, %xmm8
10589; AVX2-NEXT:    vpor %xmm7, %xmm8, %xmm7
10590; AVX2-NEXT:    vmovdqa (%rsp), %xmm8 # 16-byte Reload
10591; AVX2-NEXT:    vpshufb %xmm13, %xmm8, %xmm8
10592; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
10593; AVX2-NEXT:    vpshufb %xmm10, %xmm9, %xmm9
10594; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
10595; AVX2-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
10596; AVX2-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
10597; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7]
10598; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
10599; AVX2-NEXT:    vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15]
10600; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
10601; AVX2-NEXT:    vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload
10602; AVX2-NEXT:    # ymm8 = ymm12[0],mem[1,2,3,4,5,6,7],ymm12[8],mem[9,10,11,12,13,14,15]
10603; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7]
10604; AVX2-NEXT:    vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload
10605; AVX2-NEXT:    # ymm9 = ymm14[0],mem[1,2,3,4,5,6,7],ymm14[8],mem[9,10,11,12,13,14,15]
10606; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7]
10607; AVX2-NEXT:    vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm10 # 32-byte Folded Reload
10608; AVX2-NEXT:    # ymm10 = ymm6[0],mem[1,2,3,4,5,6,7],ymm6[8],mem[9,10,11,12,13,14,15]
10609; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5,6,7]
10610; AVX2-NEXT:    vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
10611; AVX2-NEXT:    # ymm10 = ymm0[0],mem[1,2,3,4,5,6,7],ymm0[8],mem[9,10,11,12,13,14,15]
10612; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
10613; AVX2-NEXT:    vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload
10614; AVX2-NEXT:    # ymm10 = ymm2[0],mem[1,2,3,4,5,6,7],ymm2[8],mem[9,10,11,12,13,14,15]
10615; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
10616; AVX2-NEXT:    vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload
10617; AVX2-NEXT:    # ymm10 = ymm1[0],mem[1,2,3,4,5,6,7],ymm1[8],mem[9,10,11,12,13,14,15]
10618; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7]
10619; AVX2-NEXT:    vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload
10620; AVX2-NEXT:    # ymm10 = ymm3[0],mem[1,2,3,4,5,6,7],ymm3[8],mem[9,10,11,12,13,14,15]
10621; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7]
10622; AVX2-NEXT:    vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm10 # 32-byte Folded Reload
10623; AVX2-NEXT:    # ymm10 = ymm4[0],mem[1,2,3,4,5,6,7],ymm4[8],mem[9,10,11,12,13,14,15]
10624; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
10625; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
10626; AVX2-NEXT:    vmovaps %ymm10, 32(%rsi)
10627; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
10628; AVX2-NEXT:    vmovaps %ymm10, (%rsi)
10629; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
10630; AVX2-NEXT:    vmovaps %ymm10, 32(%rdx)
10631; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
10632; AVX2-NEXT:    vmovaps %ymm10, (%rdx)
10633; AVX2-NEXT:    vmovdqa %ymm5, 32(%rcx)
10634; AVX2-NEXT:    vmovdqa %ymm7, (%rcx)
10635; AVX2-NEXT:    vmovdqa %ymm8, 32(%r8)
10636; AVX2-NEXT:    vmovdqa %ymm9, (%r8)
10637; AVX2-NEXT:    vmovdqa %ymm6, 32(%r9)
10638; AVX2-NEXT:    vmovdqa %ymm0, (%r9)
10639; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
10640; AVX2-NEXT:    vmovdqa %ymm2, 32(%rax)
10641; AVX2-NEXT:    vmovdqa %ymm1, (%rax)
10642; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
10643; AVX2-NEXT:    vmovdqa %ymm3, 32(%rax)
10644; AVX2-NEXT:    vmovdqa %ymm4, (%rax)
10645; AVX2-NEXT:    addq $760, %rsp # imm = 0x2F8
10646; AVX2-NEXT:    vzeroupper
10647; AVX2-NEXT:    retq
10648;
10649; AVX2-FP-LABEL: load_i8_stride7_vf64:
10650; AVX2-FP:       # %bb.0:
10651; AVX2-FP-NEXT:    subq $760, %rsp # imm = 0x2F8
10652; AVX2-FP-NEXT:    vmovdqa 320(%rdi), %ymm6
10653; AVX2-FP-NEXT:    vmovdqa 224(%rdi), %ymm7
10654; AVX2-FP-NEXT:    vmovdqa 256(%rdi), %ymm8
10655; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm12
10656; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm10
10657; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %ymm11
10658; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm5
10659; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm13 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
10660; AVX2-FP-NEXT:    vpblendvb %ymm13, %ymm12, %ymm10, %ymm0
10661; AVX2-FP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10662; AVX2-FP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10663; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm2
10664; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,5,12,128,128,1,8,15,u,u,u,u,u,u]
10665; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm2, %xmm3
10666; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,7,14,128,128,3,10,128,128,128,u,u,u,u,u,u]
10667; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
10668; AVX2-FP-NEXT:    vpor %xmm3, %xmm0, %xmm0
10669; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
10670; AVX2-FP-NEXT:    vpblendvb %ymm14, %ymm5, %ymm11, %ymm3
10671; AVX2-FP-NEXT:    vmovdqa %ymm5, %ymm9
10672; AVX2-FP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10673; AVX2-FP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10674; AVX2-FP-NEXT:    vextracti128 $1, %ymm3, %xmm4
10675; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
10676; AVX2-FP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9,0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9]
10677; AVX2-FP-NEXT:    # ymm3 = mem[0,1,0,1]
10678; AVX2-FP-NEXT:    vpshufb %ymm3, %ymm4, %ymm5
10679; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,0,0]
10680; AVX2-FP-NEXT:    vpblendvb %ymm4, %ymm0, %ymm5, %ymm0
10681; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10682; AVX2-FP-NEXT:    vpblendvb %ymm13, %ymm7, %ymm8, %ymm5
10683; AVX2-FP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10684; AVX2-FP-NEXT:    vmovdqa %ymm7, %ymm0
10685; AVX2-FP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10686; AVX2-FP-NEXT:    vextracti128 $1, %ymm5, %xmm7
10687; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
10688; AVX2-FP-NEXT:    vmovdqa 288(%rdi), %ymm15
10689; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
10690; AVX2-FP-NEXT:    vpor %xmm1, %xmm2, %xmm1
10691; AVX2-FP-NEXT:    vpblendvb %ymm14, %ymm6, %ymm15, %ymm2
10692; AVX2-FP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10693; AVX2-FP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10694; AVX2-FP-NEXT:    vextracti128 $1, %ymm2, %xmm5
10695; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15]
10696; AVX2-FP-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
10697; AVX2-FP-NEXT:    vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
10698; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10699; AVX2-FP-NEXT:    vpblendvb %ymm13, %ymm11, %ymm9, %ymm2
10700; AVX2-FP-NEXT:    vextracti128 $1, %ymm2, %xmm3
10701; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7,8,9],ymm3[10],ymm2[11,12,13],ymm3[14],ymm2[15]
10702; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
10703; AVX2-FP-NEXT:    vpblendvb %ymm1, %ymm12, %ymm10, %ymm5
10704; AVX2-FP-NEXT:    vextracti128 $1, %ymm5, %xmm3
10705; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm2 = [128,128,128,6,13,128,128,2,9,u,u,u,u,u,u,u]
10706; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm3, %xmm7
10707; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,8,15,128,128,4,11,128,128,u,u,u,u,u,u,u]
10708; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm5, %xmm5
10709; AVX2-FP-NEXT:    vpor %xmm7, %xmm5, %xmm5
10710; AVX2-FP-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10,1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10]
10711; AVX2-FP-NEXT:    # ymm7 = mem[0,1,0,1]
10712; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm4, %ymm4
10713; AVX2-FP-NEXT:    vpmovsxdq {{.*#+}} ymm9 = [0,18446744073709551360,16777215,0]
10714; AVX2-FP-NEXT:    vpblendvb %ymm9, %ymm4, %ymm5, %ymm4
10715; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10716; AVX2-FP-NEXT:    vpblendvb %ymm13, %ymm15, %ymm6, %ymm4
10717; AVX2-FP-NEXT:    vextracti128 $1, %ymm4, %xmm5
10718; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7,8,9],ymm5[10],ymm4[11,12,13],ymm5[14],ymm4[15]
10719; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm4, %ymm4
10720; AVX2-FP-NEXT:    vpblendvb %ymm1, %ymm0, %ymm8, %ymm5
10721; AVX2-FP-NEXT:    vextracti128 $1, %ymm5, %xmm7
10722; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
10723; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm5, %xmm3
10724; AVX2-FP-NEXT:    vpor %xmm2, %xmm3, %xmm2
10725; AVX2-FP-NEXT:    vpblendvb %ymm9, %ymm4, %ymm2, %ymm0
10726; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10727; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %ymm9
10728; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %ymm15
10729; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm4 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
10730; AVX2-FP-NEXT:    vpblendvb %ymm4, %ymm9, %ymm15, %ymm2
10731; AVX2-FP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10732; AVX2-FP-NEXT:    vextracti128 $1, %ymm2, %xmm3
10733; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm8 = [u,u,u,128,128,3,10,128,128,128,6,13,u,u,u,u]
10734; AVX2-FP-NEXT:    vpshufb %xmm8, %xmm3, %xmm3
10735; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,u,u,5,12,128,128,1,8,15,128,128,u,u,u,u]
10736; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
10737; AVX2-FP-NEXT:    vpor %xmm3, %xmm2, %xmm2
10738; AVX2-FP-NEXT:    vpbroadcastw {{.*#+}} xmm12 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9]
10739; AVX2-FP-NEXT:    vmovdqa 208(%rdi), %xmm5
10740; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm5, %xmm3
10741; AVX2-FP-NEXT:    vpbroadcastw {{.*#+}} xmm11 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
10742; AVX2-FP-NEXT:    vmovdqa 192(%rdi), %xmm1
10743; AVX2-FP-NEXT:    vpshufb %xmm11, %xmm1, %xmm10
10744; AVX2-FP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10745; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3]
10746; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
10747; AVX2-FP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
10748; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
10749; AVX2-FP-NEXT:    vpmovsxdq {{.*#+}} ymm14 = [18446744073709551615,18446744073709551615,16777215,0]
10750; AVX2-FP-NEXT:    vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload
10751; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10752; AVX2-FP-NEXT:    vmovdqa 384(%rdi), %ymm2
10753; AVX2-FP-NEXT:    vmovdqa 352(%rdi), %ymm3
10754; AVX2-FP-NEXT:    vpblendvb %ymm4, %ymm2, %ymm3, %ymm0
10755; AVX2-FP-NEXT:    vmovdqa %ymm2, %ymm4
10756; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm6
10757; AVX2-FP-NEXT:    vpshufb %xmm8, %xmm6, %xmm6
10758; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm0, %xmm0
10759; AVX2-FP-NEXT:    vpor %xmm6, %xmm0, %xmm0
10760; AVX2-FP-NEXT:    vmovdqa 432(%rdi), %xmm13
10761; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm13, %xmm6
10762; AVX2-FP-NEXT:    vmovdqa 416(%rdi), %xmm2
10763; AVX2-FP-NEXT:    vpshufb %xmm11, %xmm2, %xmm8
10764; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
10765; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
10766; AVX2-FP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
10767; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7]
10768; AVX2-FP-NEXT:    vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
10769; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10770; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
10771; AVX2-FP-NEXT:    vpblendvb %ymm7, %ymm9, %ymm15, %ymm0
10772; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm10 = [u,u,u,6,13,128,128,2,9,128,128,128,u,u,u,u]
10773; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm0, %xmm6
10774; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm0
10775; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm8 = [u,u,u,128,128,4,11,128,128,0,7,14,u,u,u,u]
10776; AVX2-FP-NEXT:    vpshufb %xmm8, %xmm0, %xmm0
10777; AVX2-FP-NEXT:    vpor %xmm6, %xmm0, %xmm0
10778; AVX2-FP-NEXT:    vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10]
10779; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm5, %xmm9
10780; AVX2-FP-NEXT:    vpbroadcastw {{.*#+}} xmm12 = [5,12,5,12,5,12,5,12,5,12,5,12,5,12,5,12]
10781; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm1, %xmm11
10782; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
10783; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
10784; AVX2-FP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
10785; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm9[7]
10786; AVX2-FP-NEXT:    vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
10787; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10788; AVX2-FP-NEXT:    vpblendvb %ymm7, %ymm4, %ymm3, %ymm0
10789; AVX2-FP-NEXT:    vmovdqa %ymm3, %ymm11
10790; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm0, %xmm1
10791; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm0
10792; AVX2-FP-NEXT:    vpshufb %xmm8, %xmm0, %xmm0
10793; AVX2-FP-NEXT:    vpor %xmm1, %xmm0, %xmm0
10794; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm13, %xmm1
10795; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm2, %xmm6
10796; AVX2-FP-NEXT:    vmovdqa %xmm2, %xmm12
10797; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3]
10798; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
10799; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
10800; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
10801; AVX2-FP-NEXT:    vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
10802; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10803; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
10804; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm3, %ymm4, %ymm0
10805; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10806; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm1
10807; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm6 = [u,u,128,128,128,6,13,128,128,2,9,u,u,u,u,u]
10808; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
10809; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,u,1,8,15,128,128,4,11,128,128,u,u,u,u,u]
10810; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm0, %xmm0
10811; AVX2-FP-NEXT:    vpor %xmm1, %xmm0, %xmm0
10812; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12]
10813; AVX2-FP-NEXT:    vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10814; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm13, %xmm8
10815; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} xmm9 = [0,0,0,0,7,14,128,128,0,0,0,0,7,14,128,128]
10816; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm12, %xmm10
10817; AVX2-FP-NEXT:    vmovdqa %xmm12, %xmm3
10818; AVX2-FP-NEXT:    vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10819; AVX2-FP-NEXT:    vpor %xmm8, %xmm10, %xmm8
10820; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm10
10821; AVX2-FP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
10822; AVX2-FP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
10823; AVX2-FP-NEXT:    # ymm0 = mem[0,1,0,1]
10824; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm10, %ymm8, %ymm8
10825; AVX2-FP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10826; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
10827; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm15, %ymm12, %ymm8
10828; AVX2-FP-NEXT:    vextracti128 $1, %ymm8, %xmm10
10829; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm10, %xmm6
10830; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm8, %xmm7
10831; AVX2-FP-NEXT:    vpor %xmm6, %xmm7, %xmm6
10832; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
10833; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
10834; AVX2-FP-NEXT:    vpshufb %xmm9, %xmm14, %xmm7
10835; AVX2-FP-NEXT:    vpor %xmm1, %xmm7, %xmm1
10836; AVX2-FP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
10837; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
10838; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm6, %ymm1, %ymm1
10839; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10840; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
10841; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm11, %ymm4, %ymm1
10842; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm6 = [u,u,2,9,128,128,128,5,12,128,128,u,u,u,u,u]
10843; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm1, %xmm7
10844; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm1
10845; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm8 = [u,u,128,128,0,7,14,128,128,3,10,u,u,u,u,u]
10846; AVX2-FP-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
10847; AVX2-FP-NEXT:    vpor %xmm7, %xmm1, %xmm1
10848; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} xmm7 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13]
10849; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm13, %xmm9
10850; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} xmm10 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128]
10851; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm3, %xmm11
10852; AVX2-FP-NEXT:    vpor %xmm9, %xmm11, %xmm9
10853; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
10854; AVX2-FP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
10855; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm1, %ymm9, %ymm1
10856; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10857; AVX2-FP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10858; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm15, %ymm12, %ymm1
10859; AVX2-FP-NEXT:    vmovdqa %ymm12, %ymm2
10860; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm1, %xmm6
10861; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm1
10862; AVX2-FP-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
10863; AVX2-FP-NEXT:    vpor %xmm6, %xmm1, %xmm1
10864; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm5, %xmm6
10865; AVX2-FP-NEXT:    vmovdqa %xmm5, %xmm13
10866; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm14, %xmm7
10867; AVX2-FP-NEXT:    vpor %xmm6, %xmm7, %xmm6
10868; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
10869; AVX2-FP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
10870; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm1, %ymm6, %ymm1
10871; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10872; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535]
10873; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10874; AVX2-FP-NEXT:    vpblendvb %ymm6, %ymm3, %ymm4, %ymm1
10875; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm5 = [u,u,3,10,128,128,128,6,13,128,128,u,u,u,u,u]
10876; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm1, %xmm7
10877; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm1
10878; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm8 = [u,u,128,128,1,8,15,128,128,4,11,u,u,u,u,u]
10879; AVX2-FP-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
10880; AVX2-FP-NEXT:    vpor %xmm7, %xmm1, %xmm1
10881; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} xmm7 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128]
10882; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
10883; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm12, %xmm9
10884; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} xmm10 = [0,0,0,128,128,0,7,14,0,0,0,128,128,0,7,14]
10885; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
10886; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm14, %xmm11
10887; AVX2-FP-NEXT:    vpor %xmm9, %xmm11, %xmm9
10888; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
10889; AVX2-FP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
10890; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm1, %ymm9, %ymm1
10891; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10892; AVX2-FP-NEXT:    vpblendvb %ymm6, %ymm15, %ymm2, %ymm1
10893; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm1, %xmm6
10894; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm1
10895; AVX2-FP-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
10896; AVX2-FP-NEXT:    vpor %xmm6, %xmm1, %xmm1
10897; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
10898; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm11, %xmm6
10899; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm13, %xmm7
10900; AVX2-FP-NEXT:    vmovdqa %xmm13, (%rsp) # 16-byte Spill
10901; AVX2-FP-NEXT:    vpor %xmm6, %xmm7, %xmm6
10902; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
10903; AVX2-FP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
10904; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm1, %ymm6, %ymm1
10905; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10906; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm10 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
10907; AVX2-FP-NEXT:    vpblendvb %ymm10, %ymm3, %ymm4, %ymm2
10908; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10909; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm15 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
10910; AVX2-FP-NEXT:    vpblendvb %ymm15, %ymm4, %ymm3, %ymm1
10911; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm2
10912; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm3 = [u,u,128,128,2,9,128,128,128,5,12,u,u,u,u,u]
10913; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
10914; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm6 = [u,u,4,11,128,128,0,7,14,128,128,u,u,u,u,u]
10915; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
10916; AVX2-FP-NEXT:    vpor %xmm2, %xmm1, %xmm1
10917; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [0,0,0,3,10,128,128,128,0,0,0,3,10,128,128,128]
10918; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm12, %xmm7
10919; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} xmm8 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15]
10920; AVX2-FP-NEXT:    vpshufb %xmm8, %xmm14, %xmm9
10921; AVX2-FP-NEXT:    vpor %xmm7, %xmm9, %xmm7
10922; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
10923; AVX2-FP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
10924; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm1, %ymm7, %ymm1
10925; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10926; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10927; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10928; AVX2-FP-NEXT:    vpblendvb %ymm10, %ymm4, %ymm1, %ymm5
10929; AVX2-FP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10930; AVX2-FP-NEXT:    vpblendvb %ymm15, %ymm1, %ymm4, %ymm1
10931; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm4
10932; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
10933; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
10934; AVX2-FP-NEXT:    vpor %xmm3, %xmm1, %xmm1
10935; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm11, %xmm2
10936; AVX2-FP-NEXT:    vpshufb %xmm8, %xmm13, %xmm3
10937; AVX2-FP-NEXT:    vpor %xmm2, %xmm3, %xmm2
10938; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
10939; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
10940; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
10941; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10942; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10943; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
10944; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
10945; AVX2-FP-NEXT:    vpblendvb %ymm11, %ymm0, %ymm2, %ymm1
10946; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm5 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535]
10947; AVX2-FP-NEXT:    vpblendvb %ymm5, %ymm0, %ymm2, %ymm12
10948; AVX2-FP-NEXT:    vpblendvb %ymm15, %ymm2, %ymm0, %ymm13
10949; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
10950; AVX2-FP-NEXT:    vpblendvb %ymm7, %ymm2, %ymm0, %ymm3
10951; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10952; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
10953; AVX2-FP-NEXT:    vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
10954; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10955; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10956; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
10957; AVX2-FP-NEXT:    vpblendvb %ymm3, %ymm2, %ymm0, %ymm3
10958; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10959; AVX2-FP-NEXT:    vpblendvb %ymm11, %ymm0, %ymm2, %ymm3
10960; AVX2-FP-NEXT:    vpblendvb %ymm5, %ymm0, %ymm2, %ymm14
10961; AVX2-FP-NEXT:    vpblendvb %ymm15, %ymm2, %ymm0, %ymm4
10962; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10963; AVX2-FP-NEXT:    vpblendvb %ymm7, %ymm2, %ymm0, %ymm0
10964; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10965; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
10966; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
10967; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm10 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
10968; AVX2-FP-NEXT:    vpblendvb %ymm10, %ymm6, %ymm2, %ymm0
10969; AVX2-FP-NEXT:    vpblendvb %ymm11, %ymm6, %ymm2, %ymm4
10970; AVX2-FP-NEXT:    vpblendvb %ymm5, %ymm6, %ymm2, %ymm9
10971; AVX2-FP-NEXT:    vpblendvb %ymm15, %ymm2, %ymm6, %ymm8
10972; AVX2-FP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10973; AVX2-FP-NEXT:    vpblendvb %ymm7, %ymm2, %ymm6, %ymm2
10974; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10975; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10976; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
10977; AVX2-FP-NEXT:    vpblendvb %ymm10, %ymm7, %ymm8, %ymm2
10978; AVX2-FP-NEXT:    vpblendvb %ymm11, %ymm7, %ymm8, %ymm6
10979; AVX2-FP-NEXT:    vpblendvb %ymm5, %ymm7, %ymm8, %ymm10
10980; AVX2-FP-NEXT:    vpblendvb %ymm15, %ymm8, %ymm7, %ymm5
10981; AVX2-FP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10982; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
10983; AVX2-FP-NEXT:    vpblendvb %ymm5, %ymm8, %ymm7, %ymm11
10984; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm8 = [2,9,128,128,128,5,12,128,128,u,u,u,u,u,u,u]
10985; AVX2-FP-NEXT:    vpshufb %xmm8, %xmm1, %xmm15
10986; AVX2-FP-NEXT:    vextracti128 $1, %ymm1, %xmm1
10987; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm7 = [128,128,0,7,14,128,128,3,10,u,u,u,u,u,u,u]
10988; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm1, %xmm1
10989; AVX2-FP-NEXT:    vpor %xmm1, %xmm15, %xmm1
10990; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm15
10991; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3],ymm0[4,5],ymm15[6],ymm0[7,8,9,10],ymm15[11],ymm0[12,13],ymm15[14],ymm0[15]
10992; AVX2-FP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11,2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11]
10993; AVX2-FP-NEXT:    # ymm5 = mem[0,1,0,1]
10994; AVX2-FP-NEXT:    vpshufb %ymm5, %ymm0, %ymm0
10995; AVX2-FP-NEXT:    vpmovsxwq {{.*#+}} xmm15 = [18446744073709551615,255]
10996; AVX2-FP-NEXT:    vpblendvb %ymm15, %ymm1, %ymm0, %ymm0
10997; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10998; AVX2-FP-NEXT:    vpshufb %xmm8, %xmm3, %xmm0
10999; AVX2-FP-NEXT:    vextracti128 $1, %ymm3, %xmm1
11000; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm1, %xmm1
11001; AVX2-FP-NEXT:    vpor %xmm0, %xmm1, %xmm0
11002; AVX2-FP-NEXT:    vextracti128 $1, %ymm2, %xmm1
11003; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
11004; AVX2-FP-NEXT:    vpshufb %ymm5, %ymm1, %ymm1
11005; AVX2-FP-NEXT:    vpblendvb %ymm15, %ymm0, %ymm1, %ymm0
11006; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11007; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm0 = [3,10,128,128,128,6,13,128,128,u,u,u,u,u,u,u]
11008; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm12, %xmm1
11009; AVX2-FP-NEXT:    vextracti128 $1, %ymm12, %xmm2
11010; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm3 = [128,128,1,8,15,128,128,4,11,u,u,u,u,u,u,u]
11011; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
11012; AVX2-FP-NEXT:    vpor %xmm1, %xmm2, %xmm1
11013; AVX2-FP-NEXT:    vextracti128 $1, %ymm4, %xmm2
11014; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5,6],ymm2[7,8],ymm4[9,10],ymm2[11],ymm4[12,13,14],ymm2[15]
11015; AVX2-FP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12,3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12]
11016; AVX2-FP-NEXT:    # ymm4 = mem[0,1,0,1]
11017; AVX2-FP-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
11018; AVX2-FP-NEXT:    vpblendvb %ymm15, %ymm1, %ymm2, %ymm12
11019; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm14, %xmm0
11020; AVX2-FP-NEXT:    vextracti128 $1, %ymm14, %xmm1
11021; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
11022; AVX2-FP-NEXT:    vpor %xmm0, %xmm1, %xmm0
11023; AVX2-FP-NEXT:    vextracti128 $1, %ymm6, %xmm1
11024; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3],ymm6[4,5,6],ymm1[7,8],ymm6[9,10],ymm1[11],ymm6[12,13,14],ymm1[15]
11025; AVX2-FP-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
11026; AVX2-FP-NEXT:    vpblendvb %ymm15, %ymm0, %ymm1, %ymm14
11027; AVX2-FP-NEXT:    vextracti128 $1, %ymm13, %xmm0
11028; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,2,9,128,128,128,5,12,u,u,u,u,u,u,u]
11029; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
11030; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,11,128,128,0,7,14,128,128,u,u,u,u,u,u,u]
11031; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm13, %xmm3
11032; AVX2-FP-NEXT:    vpor %xmm0, %xmm3, %xmm0
11033; AVX2-FP-NEXT:    vextracti128 $1, %ymm9, %xmm3
11034; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm9[1,2,3],ymm3[4],ymm9[5,6],ymm3[7,8],ymm9[9,10,11],ymm3[12],ymm9[13,14],ymm3[15]
11035; AVX2-FP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13,4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13]
11036; AVX2-FP-NEXT:    # ymm4 = mem[0,1,0,1]
11037; AVX2-FP-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
11038; AVX2-FP-NEXT:    vpblendvb %ymm15, %ymm0, %ymm3, %ymm6
11039; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11040; AVX2-FP-NEXT:    vextracti128 $1, %ymm3, %xmm0
11041; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
11042; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm3, %xmm1
11043; AVX2-FP-NEXT:    vpor %xmm0, %xmm1, %xmm0
11044; AVX2-FP-NEXT:    vextracti128 $1, %ymm10, %xmm1
11045; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm10[1,2,3],ymm1[4],ymm10[5,6],ymm1[7,8],ymm10[9,10,11],ymm1[12],ymm10[13,14],ymm1[15]
11046; AVX2-FP-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
11047; AVX2-FP-NEXT:    vpblendvb %ymm15, %ymm0, %ymm1, %ymm0
11048; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11049; AVX2-FP-NEXT:    vextracti128 $1, %ymm2, %xmm1
11050; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm3 = [128,128,3,10,128,128,128,6,13,u,u,u,u,u,u,u]
11051; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
11052; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm4 = [5,12,128,128,1,8,15,128,128,u,u,u,u,u,u,u]
11053; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
11054; AVX2-FP-NEXT:    vpor %xmm1, %xmm2, %xmm1
11055; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
11056; AVX2-FP-NEXT:    vextracti128 $1, %ymm5, %xmm2
11057; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3],ymm2[4],ymm5[5,6,7,8],ymm2[9],ymm5[10,11],ymm2[12],ymm5[13,14,15]
11058; AVX2-FP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14,5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14]
11059; AVX2-FP-NEXT:    # ymm5 = mem[0,1,0,1]
11060; AVX2-FP-NEXT:    vpshufb %ymm5, %ymm2, %ymm2
11061; AVX2-FP-NEXT:    vpblendvb %ymm15, %ymm1, %ymm2, %ymm2
11062; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
11063; AVX2-FP-NEXT:    vextracti128 $1, %ymm7, %xmm1
11064; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
11065; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm7, %xmm3
11066; AVX2-FP-NEXT:    vpor %xmm1, %xmm3, %xmm1
11067; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11068; AVX2-FP-NEXT:    vextracti128 $1, %ymm4, %xmm3
11069; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15]
11070; AVX2-FP-NEXT:    vpshufb %ymm5, %ymm3, %ymm3
11071; AVX2-FP-NEXT:    vpblendvb %ymm15, %ymm1, %ymm3, %ymm1
11072; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm4 = [6,13,128,128,2,9,128,128,128,u,u,u,u,u,u,u]
11073; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
11074; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm5, %xmm3
11075; AVX2-FP-NEXT:    vextracti128 $1, %ymm5, %xmm5
11076; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm7 = [128,128,4,11,128,128,0,7,14,u,u,u,u,u,u,u]
11077; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm5, %xmm5
11078; AVX2-FP-NEXT:    vpor %xmm3, %xmm5, %xmm3
11079; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
11080; AVX2-FP-NEXT:    vextracti128 $1, %ymm8, %xmm5
11081; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm8[0],ymm5[1],ymm8[2,3,4],ymm5[5],ymm8[6,7,8],ymm5[9],ymm8[10,11,12],ymm5[13],ymm8[14,15]
11082; AVX2-FP-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15,6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15]
11083; AVX2-FP-NEXT:    # ymm8 = mem[0,1,0,1]
11084; AVX2-FP-NEXT:    vpshufb %ymm8, %ymm5, %ymm5
11085; AVX2-FP-NEXT:    vpblendvb %ymm15, %ymm3, %ymm5, %ymm3
11086; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
11087; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
11088; AVX2-FP-NEXT:    vextracti128 $1, %ymm5, %xmm5
11089; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm5, %xmm5
11090; AVX2-FP-NEXT:    vpor %xmm4, %xmm5, %xmm4
11091; AVX2-FP-NEXT:    vextracti128 $1, %ymm11, %xmm5
11092; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm11[0],ymm5[1],ymm11[2,3,4],ymm5[5],ymm11[6,7,8],ymm5[9],ymm11[10,11,12],ymm5[13],ymm11[14,15]
11093; AVX2-FP-NEXT:    vpshufb %ymm8, %ymm5, %ymm5
11094; AVX2-FP-NEXT:    vpblendvb %ymm15, %ymm4, %ymm5, %ymm4
11095; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
11096; AVX2-FP-NEXT:    vextracti128 $1, %ymm9, %xmm5
11097; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,u,128,128,128,5,12,128,128,1,8,15,u,u,u,u]
11098; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm5, %xmm5
11099; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm8 = [u,u,0,7,14,128,128,3,10,128,128,128,u,u,u,u]
11100; AVX2-FP-NEXT:    vpshufb %xmm8, %xmm9, %xmm9
11101; AVX2-FP-NEXT:    vpor %xmm5, %xmm9, %xmm5
11102; AVX2-FP-NEXT:    vpbroadcastw {{.*#+}} xmm13 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
11103; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
11104; AVX2-FP-NEXT:    vpshufb %xmm13, %xmm9, %xmm9
11105; AVX2-FP-NEXT:    vpbroadcastw {{.*#+}} xmm10 = [6,13,6,13,6,13,6,13,6,13,6,13,6,13,6,13]
11106; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
11107; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm11, %xmm11
11108; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
11109; AVX2-FP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
11110; AVX2-FP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
11111; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm9[7]
11112; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
11113; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm9[0],ymm5[1,2,3,4,5,6,7],ymm9[8],ymm5[9,10,11,12,13,14,15]
11114; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7]
11115; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
11116; AVX2-FP-NEXT:    vextracti128 $1, %ymm11, %xmm9
11117; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm9, %xmm7
11118; AVX2-FP-NEXT:    vpshufb %xmm8, %xmm11, %xmm8
11119; AVX2-FP-NEXT:    vpor %xmm7, %xmm8, %xmm7
11120; AVX2-FP-NEXT:    vmovdqa (%rsp), %xmm8 # 16-byte Reload
11121; AVX2-FP-NEXT:    vpshufb %xmm13, %xmm8, %xmm8
11122; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
11123; AVX2-FP-NEXT:    vpshufb %xmm10, %xmm9, %xmm9
11124; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
11125; AVX2-FP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
11126; AVX2-FP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
11127; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7]
11128; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
11129; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15]
11130; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
11131; AVX2-FP-NEXT:    vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload
11132; AVX2-FP-NEXT:    # ymm8 = ymm12[0],mem[1,2,3,4,5,6,7],ymm12[8],mem[9,10,11,12,13,14,15]
11133; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7]
11134; AVX2-FP-NEXT:    vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload
11135; AVX2-FP-NEXT:    # ymm9 = ymm14[0],mem[1,2,3,4,5,6,7],ymm14[8],mem[9,10,11,12,13,14,15]
11136; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7]
11137; AVX2-FP-NEXT:    vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm10 # 32-byte Folded Reload
11138; AVX2-FP-NEXT:    # ymm10 = ymm6[0],mem[1,2,3,4,5,6,7],ymm6[8],mem[9,10,11,12,13,14,15]
11139; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5,6,7]
11140; AVX2-FP-NEXT:    vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
11141; AVX2-FP-NEXT:    # ymm10 = ymm0[0],mem[1,2,3,4,5,6,7],ymm0[8],mem[9,10,11,12,13,14,15]
11142; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
11143; AVX2-FP-NEXT:    vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload
11144; AVX2-FP-NEXT:    # ymm10 = ymm2[0],mem[1,2,3,4,5,6,7],ymm2[8],mem[9,10,11,12,13,14,15]
11145; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
11146; AVX2-FP-NEXT:    vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload
11147; AVX2-FP-NEXT:    # ymm10 = ymm1[0],mem[1,2,3,4,5,6,7],ymm1[8],mem[9,10,11,12,13,14,15]
11148; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7]
11149; AVX2-FP-NEXT:    vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload
11150; AVX2-FP-NEXT:    # ymm10 = ymm3[0],mem[1,2,3,4,5,6,7],ymm3[8],mem[9,10,11,12,13,14,15]
11151; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7]
11152; AVX2-FP-NEXT:    vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm10 # 32-byte Folded Reload
11153; AVX2-FP-NEXT:    # ymm10 = ymm4[0],mem[1,2,3,4,5,6,7],ymm4[8],mem[9,10,11,12,13,14,15]
11154; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
11155; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11156; AVX2-FP-NEXT:    vmovaps %ymm10, 32(%rsi)
11157; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11158; AVX2-FP-NEXT:    vmovaps %ymm10, (%rsi)
11159; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11160; AVX2-FP-NEXT:    vmovaps %ymm10, 32(%rdx)
11161; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11162; AVX2-FP-NEXT:    vmovaps %ymm10, (%rdx)
11163; AVX2-FP-NEXT:    vmovdqa %ymm5, 32(%rcx)
11164; AVX2-FP-NEXT:    vmovdqa %ymm7, (%rcx)
11165; AVX2-FP-NEXT:    vmovdqa %ymm8, 32(%r8)
11166; AVX2-FP-NEXT:    vmovdqa %ymm9, (%r8)
11167; AVX2-FP-NEXT:    vmovdqa %ymm6, 32(%r9)
11168; AVX2-FP-NEXT:    vmovdqa %ymm0, (%r9)
11169; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
11170; AVX2-FP-NEXT:    vmovdqa %ymm2, 32(%rax)
11171; AVX2-FP-NEXT:    vmovdqa %ymm1, (%rax)
11172; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
11173; AVX2-FP-NEXT:    vmovdqa %ymm3, 32(%rax)
11174; AVX2-FP-NEXT:    vmovdqa %ymm4, (%rax)
11175; AVX2-FP-NEXT:    addq $760, %rsp # imm = 0x2F8
11176; AVX2-FP-NEXT:    vzeroupper
11177; AVX2-FP-NEXT:    retq
11178;
11179; AVX2-FCP-LABEL: load_i8_stride7_vf64:
11180; AVX2-FCP:       # %bb.0:
11181; AVX2-FCP-NEXT:    subq $776, %rsp # imm = 0x308
11182; AVX2-FCP-NEXT:    vmovdqa 320(%rdi), %ymm15
11183; AVX2-FCP-NEXT:    vmovdqa 224(%rdi), %ymm6
11184; AVX2-FCP-NEXT:    vmovdqa 256(%rdi), %ymm10
11185; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm13
11186; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm12
11187; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %ymm11
11188; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm9
11189; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
11190; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm13, %ymm12, %ymm0
11191; AVX2-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11192; AVX2-FCP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11193; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm2
11194; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,5,12,128,128,1,8,15,u,u,u,u,u,u]
11195; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm2, %xmm3
11196; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,7,14,128,128,3,10,128,128,128,u,u,u,u,u,u]
11197; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
11198; AVX2-FCP-NEXT:    vpor %xmm3, %xmm0, %xmm0
11199; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
11200; AVX2-FCP-NEXT:    vpblendvb %ymm14, %ymm9, %ymm11, %ymm3
11201; AVX2-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11202; AVX2-FCP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11203; AVX2-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm4
11204; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
11205; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9,0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9]
11206; AVX2-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
11207; AVX2-FCP-NEXT:    vpshufb %ymm3, %ymm4, %ymm5
11208; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,0,0]
11209; AVX2-FCP-NEXT:    vpblendvb %ymm4, %ymm0, %ymm5, %ymm0
11210; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11211; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm6, %ymm10, %ymm5
11212; AVX2-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11213; AVX2-FCP-NEXT:    vmovdqa %ymm6, %ymm7
11214; AVX2-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11215; AVX2-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm6
11216; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
11217; AVX2-FCP-NEXT:    vmovdqa 288(%rdi), %ymm8
11218; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
11219; AVX2-FCP-NEXT:    vpor %xmm1, %xmm2, %xmm1
11220; AVX2-FCP-NEXT:    vmovdqa %ymm15, %ymm0
11221; AVX2-FCP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11222; AVX2-FCP-NEXT:    vpblendvb %ymm14, %ymm15, %ymm8, %ymm2
11223; AVX2-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11224; AVX2-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm5
11225; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15]
11226; AVX2-FCP-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
11227; AVX2-FCP-NEXT:    vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
11228; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11229; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm15 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
11230; AVX2-FCP-NEXT:    vpblendvb %ymm15, %ymm11, %ymm9, %ymm2
11231; AVX2-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
11232; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7,8,9],ymm3[10],ymm2[11,12,13],ymm3[14],ymm2[15]
11233; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
11234; AVX2-FCP-NEXT:    vpblendvb %ymm9, %ymm13, %ymm12, %ymm4
11235; AVX2-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm5
11236; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [128,128,128,6,13,128,128,2,9,u,u,u,u,u,u,u]
11237; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm5, %xmm5
11238; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [1,8,15,128,128,4,11,128,128,u,u,u,u,u,u,u]
11239; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
11240; AVX2-FCP-NEXT:    vpor %xmm5, %xmm4, %xmm4
11241; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10,1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10]
11242; AVX2-FCP-NEXT:    # ymm5 = mem[0,1,0,1]
11243; AVX2-FCP-NEXT:    vpshufb %ymm5, %ymm3, %ymm3
11244; AVX2-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm1 = [0,18446744073709551360,16777215,0]
11245; AVX2-FCP-NEXT:    vpblendvb %ymm1, %ymm3, %ymm4, %ymm3
11246; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11247; AVX2-FCP-NEXT:    vpblendvb %ymm15, %ymm8, %ymm0, %ymm3
11248; AVX2-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm8
11249; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm8[2],ymm3[3,4,5],ymm8[6],ymm3[7,8,9],ymm8[10],ymm3[11,12,13],ymm8[14],ymm3[15]
11250; AVX2-FCP-NEXT:    vpshufb %ymm5, %ymm3, %ymm3
11251; AVX2-FCP-NEXT:    vpblendvb %ymm9, %ymm7, %ymm10, %ymm5
11252; AVX2-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm8
11253; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm8, %xmm2
11254; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm5, %xmm5
11255; AVX2-FCP-NEXT:    vpor %xmm2, %xmm5, %xmm2
11256; AVX2-FCP-NEXT:    vpblendvb %ymm1, %ymm3, %ymm2, %ymm5
11257; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %ymm10
11258; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %ymm12
11259; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
11260; AVX2-FCP-NEXT:    vpblendvb %ymm9, %ymm10, %ymm12, %ymm2
11261; AVX2-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11262; AVX2-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11263; AVX2-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
11264; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,u,u,128,128,3,10,128,128,128,6,13,u,u,u,u]
11265; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm3, %xmm3
11266; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [u,u,u,5,12,128,128,1,8,15,128,128,u,u,u,u]
11267; AVX2-FCP-NEXT:    vpshufb %xmm11, %xmm2, %xmm2
11268; AVX2-FCP-NEXT:    vpor %xmm3, %xmm2, %xmm2
11269; AVX2-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
11270; AVX2-FCP-NEXT:    vmovdqa 192(%rdi), %ymm8
11271; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [1,2,0,2,1,2,4,6]
11272; AVX2-FCP-NEXT:    vpermd %ymm8, %ymm4, %ymm3
11273; AVX2-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11274; AVX2-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13]
11275; AVX2-FCP-NEXT:    vpshufb %ymm1, %ymm3, %ymm3
11276; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
11277; AVX2-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm6 = [18446744073709551615,18446744073709551615,16777215,0]
11278; AVX2-FCP-NEXT:    vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload
11279; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11280; AVX2-FCP-NEXT:    vmovdqa 384(%rdi), %ymm2
11281; AVX2-FCP-NEXT:    vmovdqa 352(%rdi), %ymm14
11282; AVX2-FCP-NEXT:    vpblendvb %ymm9, %ymm2, %ymm14, %ymm0
11283; AVX2-FCP-NEXT:    vmovdqa %ymm2, %ymm3
11284; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm13
11285; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm13, %xmm7
11286; AVX2-FCP-NEXT:    vpshufb %xmm11, %xmm0, %xmm0
11287; AVX2-FCP-NEXT:    vpor %xmm7, %xmm0, %xmm0
11288; AVX2-FCP-NEXT:    vmovdqa 416(%rdi), %ymm9
11289; AVX2-FCP-NEXT:    vpermd %ymm9, %ymm4, %ymm4
11290; AVX2-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11291; AVX2-FCP-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
11292; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
11293; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
11294; AVX2-FCP-NEXT:    vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11295; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11296; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm13 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
11297; AVX2-FCP-NEXT:    vpblendvb %ymm13, %ymm10, %ymm12, %ymm0
11298; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,u,u,6,13,128,128,2,9,128,128,128,u,u,u,u]
11299; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm4
11300; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm0
11301; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,u,u,128,128,4,11,128,128,0,7,14,u,u,u,u]
11302; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm0, %xmm0
11303; AVX2-FCP-NEXT:    vpor %xmm4, %xmm0, %xmm0
11304; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
11305; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [1,3,0,2,1,3,4,6]
11306; AVX2-FCP-NEXT:    vpermd %ymm8, %ymm4, %ymm11
11307; AVX2-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm12 = [1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14]
11308; AVX2-FCP-NEXT:    vpshufb %ymm12, %ymm11, %ymm11
11309; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7]
11310; AVX2-FCP-NEXT:    vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11311; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11312; AVX2-FCP-NEXT:    vpblendvb %ymm13, %ymm2, %ymm14, %ymm0
11313; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
11314; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm0
11315; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm0, %xmm0
11316; AVX2-FCP-NEXT:    vpor %xmm1, %xmm0, %xmm0
11317; AVX2-FCP-NEXT:    vpermd %ymm9, %ymm4, %ymm1
11318; AVX2-FCP-NEXT:    vpshufb %ymm12, %ymm1, %ymm1
11319; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
11320; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
11321; AVX2-FCP-NEXT:    vpblendvb %ymm6, %ymm5, %ymm0, %ymm0
11322; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11323; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
11324; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm14, %ymm2, %ymm0
11325; AVX2-FCP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11326; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
11327; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [u,u,128,128,128,6,13,128,128,2,9,u,u,u,u,u]
11328; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
11329; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [u,u,1,8,15,128,128,4,11,128,128,u,u,u,u,u]
11330; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
11331; AVX2-FCP-NEXT:    vpor %xmm1, %xmm0, %xmm0
11332; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm10 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12]
11333; AVX2-FCP-NEXT:    vmovdqa 432(%rdi), %xmm13
11334; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm13, %xmm1
11335; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm11 = [0,0,0,0,7,14,128,128,0,0,0,0,7,14,128,128]
11336; AVX2-FCP-NEXT:    vmovdqa 416(%rdi), %xmm15
11337; AVX2-FCP-NEXT:    vpshufb %xmm11, %xmm15, %xmm12
11338; AVX2-FCP-NEXT:    vpor %xmm1, %xmm12, %xmm1
11339; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm12
11340; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
11341; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
11342; AVX2-FCP-NEXT:    # ymm9 = mem[0,1,0,1]
11343; AVX2-FCP-NEXT:    vpblendvb %ymm9, %ymm12, %ymm1, %ymm1
11344; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11345; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11346; AVX2-FCP-NEXT:    vpblendvb %ymm7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
11347; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm12
11348; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm12, %xmm4
11349; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
11350; AVX2-FCP-NEXT:    vpor %xmm4, %xmm1, %xmm6
11351; AVX2-FCP-NEXT:    vmovdqa 208(%rdi), %xmm2
11352; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm2, %xmm10
11353; AVX2-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11354; AVX2-FCP-NEXT:    vmovdqa 192(%rdi), %xmm8
11355; AVX2-FCP-NEXT:    vpshufb %xmm11, %xmm8, %xmm11
11356; AVX2-FCP-NEXT:    vpor %xmm10, %xmm11, %xmm10
11357; AVX2-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
11358; AVX2-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
11359; AVX2-FCP-NEXT:    vpblendvb %ymm9, %ymm6, %ymm10, %ymm1
11360; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11361; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
11362; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm14, %ymm3, %ymm6
11363; AVX2-FCP-NEXT:    vmovdqa %ymm3, %ymm14
11364; AVX2-FCP-NEXT:    vmovdqa %ymm0, %ymm3
11365; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [u,u,2,9,128,128,128,5,12,128,128,u,u,u,u,u]
11366; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm6, %xmm11
11367; AVX2-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm6
11368; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [u,u,128,128,0,7,14,128,128,3,10,u,u,u,u,u]
11369; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm6, %xmm6
11370; AVX2-FCP-NEXT:    vpor %xmm6, %xmm11, %xmm6
11371; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm11 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13]
11372; AVX2-FCP-NEXT:    vmovdqa %xmm13, %xmm5
11373; AVX2-FCP-NEXT:    vmovdqa %xmm13, (%rsp) # 16-byte Spill
11374; AVX2-FCP-NEXT:    vpshufb %xmm11, %xmm13, %xmm13
11375; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128]
11376; AVX2-FCP-NEXT:    vmovdqa %xmm15, %xmm7
11377; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm15, %xmm15
11378; AVX2-FCP-NEXT:    vpor %xmm13, %xmm15, %xmm13
11379; AVX2-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
11380; AVX2-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
11381; AVX2-FCP-NEXT:    vpblendvb %ymm9, %ymm6, %ymm13, %ymm0
11382; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11383; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11384; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11385; AVX2-FCP-NEXT:    vpblendvb %ymm3, %ymm4, %ymm0, %ymm6
11386; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm6, %xmm10
11387; AVX2-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm6
11388; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm6, %xmm6
11389; AVX2-FCP-NEXT:    vpor %xmm6, %xmm10, %xmm6
11390; AVX2-FCP-NEXT:    vpshufb %xmm11, %xmm2, %xmm10
11391; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm8, %xmm1
11392; AVX2-FCP-NEXT:    vpor %xmm1, %xmm10, %xmm1
11393; AVX2-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
11394; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
11395; AVX2-FCP-NEXT:    vpblendvb %ymm9, %ymm6, %ymm1, %ymm1
11396; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11397; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535]
11398; AVX2-FCP-NEXT:    vmovdqa %ymm14, %ymm3
11399; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11400; AVX2-FCP-NEXT:    vpblendvb %ymm6, %ymm2, %ymm14, %ymm1
11401; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm14 = [u,u,3,10,128,128,128,6,13,128,128,u,u,u,u,u]
11402; AVX2-FCP-NEXT:    vpshufb %xmm14, %xmm1, %xmm10
11403; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm1
11404; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [u,u,128,128,1,8,15,128,128,4,11,u,u,u,u,u]
11405; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm1, %xmm1
11406; AVX2-FCP-NEXT:    vpor %xmm1, %xmm10, %xmm1
11407; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm10 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128]
11408; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm7, %xmm13
11409; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm15 = [0,0,0,128,128,0,7,14,0,0,0,128,128,0,7,14]
11410; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm5, %xmm11
11411; AVX2-FCP-NEXT:    vpor %xmm13, %xmm11, %xmm11
11412; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
11413; AVX2-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
11414; AVX2-FCP-NEXT:    vpblendvb %ymm9, %ymm1, %ymm11, %ymm1
11415; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11416; AVX2-FCP-NEXT:    vpblendvb %ymm6, %ymm4, %ymm0, %ymm1
11417; AVX2-FCP-NEXT:    vmovdqa %ymm0, %ymm13
11418; AVX2-FCP-NEXT:    vpshufb %xmm14, %xmm1, %xmm6
11419; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm1
11420; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm1, %xmm1
11421; AVX2-FCP-NEXT:    vpor %xmm6, %xmm1, %xmm1
11422; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm8, %xmm6
11423; AVX2-FCP-NEXT:    vmovdqa %xmm8, %xmm14
11424; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11425; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm0, %xmm10
11426; AVX2-FCP-NEXT:    vpor %xmm6, %xmm10, %xmm6
11427; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
11428; AVX2-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
11429; AVX2-FCP-NEXT:    vpblendvb %ymm9, %ymm1, %ymm6, %ymm0
11430; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11431; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm12 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
11432; AVX2-FCP-NEXT:    vpblendvb %ymm12, %ymm2, %ymm3, %ymm1
11433; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11434; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
11435; AVX2-FCP-NEXT:    vpblendvb %ymm6, %ymm3, %ymm2, %ymm1
11436; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm2
11437; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [u,u,128,128,2,9,128,128,128,5,12,u,u,u,u,u]
11438; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
11439; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [u,u,4,11,128,128,0,7,14,128,128,u,u,u,u,u]
11440; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm1, %xmm1
11441; AVX2-FCP-NEXT:    vpor %xmm2, %xmm1, %xmm1
11442; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [0,0,0,3,10,128,128,128,0,0,0,3,10,128,128,128]
11443; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm7, %xmm5
11444; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm11 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15]
11445; AVX2-FCP-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
11446; AVX2-FCP-NEXT:    vpshufb %xmm11, %xmm0, %xmm8
11447; AVX2-FCP-NEXT:    vpor %xmm5, %xmm8, %xmm5
11448; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
11449; AVX2-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
11450; AVX2-FCP-NEXT:    vpblendvb %ymm9, %ymm1, %ymm5, %ymm0
11451; AVX2-FCP-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
11452; AVX2-FCP-NEXT:    vmovdqa %ymm4, %ymm0
11453; AVX2-FCP-NEXT:    vpblendvb %ymm12, %ymm4, %ymm13, %ymm4
11454; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11455; AVX2-FCP-NEXT:    vpblendvb %ymm6, %ymm13, %ymm0, %ymm1
11456; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm5
11457; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm5, %xmm3
11458; AVX2-FCP-NEXT:    vpshufb %xmm10, %xmm1, %xmm1
11459; AVX2-FCP-NEXT:    vpor %xmm3, %xmm1, %xmm1
11460; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm14, %xmm2
11461; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11462; AVX2-FCP-NEXT:    vpshufb %xmm11, %xmm0, %xmm3
11463; AVX2-FCP-NEXT:    vpor %xmm2, %xmm3, %xmm2
11464; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
11465; AVX2-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
11466; AVX2-FCP-NEXT:    vpblendvb %ymm9, %ymm1, %ymm2, %ymm0
11467; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11468; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11469; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11470; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
11471; AVX2-FCP-NEXT:    vpblendvb %ymm12, %ymm0, %ymm1, %ymm11
11472; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm15 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535]
11473; AVX2-FCP-NEXT:    vpblendvb %ymm15, %ymm0, %ymm1, %ymm9
11474; AVX2-FCP-NEXT:    vpblendvb %ymm6, %ymm1, %ymm0, %ymm10
11475; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
11476; AVX2-FCP-NEXT:    vpblendvb %ymm5, %ymm1, %ymm0, %ymm2
11477; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11478; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
11479; AVX2-FCP-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
11480; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11481; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11482; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11483; AVX2-FCP-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm2
11484; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11485; AVX2-FCP-NEXT:    vpblendvb %ymm12, %ymm0, %ymm1, %ymm3
11486; AVX2-FCP-NEXT:    vpblendvb %ymm15, %ymm0, %ymm1, %ymm8
11487; AVX2-FCP-NEXT:    vpblendvb %ymm6, %ymm1, %ymm0, %ymm2
11488; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11489; AVX2-FCP-NEXT:    vpblendvb %ymm5, %ymm1, %ymm0, %ymm0
11490; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11491; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11492; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11493; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
11494; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm2, %ymm0, %ymm1
11495; AVX2-FCP-NEXT:    vpblendvb %ymm12, %ymm2, %ymm0, %ymm4
11496; AVX2-FCP-NEXT:    vpblendvb %ymm15, %ymm2, %ymm0, %ymm14
11497; AVX2-FCP-NEXT:    vpblendvb %ymm6, %ymm0, %ymm2, %ymm13
11498; AVX2-FCP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11499; AVX2-FCP-NEXT:    vpblendvb %ymm5, %ymm0, %ymm2, %ymm0
11500; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11501; AVX2-FCP-NEXT:    vmovdqa %ymm5, %ymm13
11502; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11503; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
11504; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm0, %ymm5, %ymm2
11505; AVX2-FCP-NEXT:    vpblendvb %ymm12, %ymm0, %ymm5, %ymm12
11506; AVX2-FCP-NEXT:    vpblendvb %ymm15, %ymm0, %ymm5, %ymm15
11507; AVX2-FCP-NEXT:    vpblendvb %ymm6, %ymm5, %ymm0, %ymm6
11508; AVX2-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11509; AVX2-FCP-NEXT:    vpblendvb %ymm13, %ymm5, %ymm0, %ymm0
11510; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11511; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [2,9,128,128,128,5,12,128,128,u,u,u,u,u,u,u]
11512; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm11, %xmm0
11513; AVX2-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm11
11514; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [128,128,0,7,14,128,128,3,10,u,u,u,u,u,u,u]
11515; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm11, %xmm11
11516; AVX2-FCP-NEXT:    vpor %xmm0, %xmm11, %xmm0
11517; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm11
11518; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm11[3],ymm1[4,5],ymm11[6],ymm1[7,8,9,10],ymm11[11],ymm1[12,13],ymm11[14],ymm1[15]
11519; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11,2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11]
11520; AVX2-FCP-NEXT:    # ymm5 = mem[0,1,0,1]
11521; AVX2-FCP-NEXT:    vpshufb %ymm5, %ymm1, %ymm1
11522; AVX2-FCP-NEXT:    vpmovsxwq {{.*#+}} xmm11 = [18446744073709551615,255]
11523; AVX2-FCP-NEXT:    vpblendvb %ymm11, %ymm0, %ymm1, %ymm0
11524; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11525; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm3, %xmm0
11526; AVX2-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm1
11527; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
11528; AVX2-FCP-NEXT:    vpor %xmm0, %xmm1, %xmm0
11529; AVX2-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm1
11530; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
11531; AVX2-FCP-NEXT:    vpshufb %ymm5, %ymm1, %ymm1
11532; AVX2-FCP-NEXT:    vpblendvb %ymm11, %ymm0, %ymm1, %ymm0
11533; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11534; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [3,10,128,128,128,6,13,128,128,u,u,u,u,u,u,u]
11535; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm9, %xmm1
11536; AVX2-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm2
11537; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [128,128,1,8,15,128,128,4,11,u,u,u,u,u,u,u]
11538; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
11539; AVX2-FCP-NEXT:    vpor %xmm1, %xmm2, %xmm1
11540; AVX2-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm2
11541; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5,6],ymm2[7,8],ymm4[9,10],ymm2[11],ymm4[12,13,14],ymm2[15]
11542; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12,3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12]
11543; AVX2-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
11544; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
11545; AVX2-FCP-NEXT:    vpblendvb %ymm11, %ymm1, %ymm2, %ymm9
11546; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm8, %xmm0
11547; AVX2-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm1
11548; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
11549; AVX2-FCP-NEXT:    vpor %xmm0, %xmm1, %xmm0
11550; AVX2-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm1
11551; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm12[1,2],ymm1[3],ymm12[4,5,6],ymm1[7,8],ymm12[9,10],ymm1[11],ymm12[12,13,14],ymm1[15]
11552; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
11553; AVX2-FCP-NEXT:    vpblendvb %ymm11, %ymm0, %ymm1, %ymm12
11554; AVX2-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm0
11555; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,2,9,128,128,128,5,12,u,u,u,u,u,u,u]
11556; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
11557; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,11,128,128,0,7,14,128,128,u,u,u,u,u,u,u]
11558; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm10, %xmm3
11559; AVX2-FCP-NEXT:    vpor %xmm0, %xmm3, %xmm0
11560; AVX2-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm3
11561; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm14[1,2,3],ymm3[4],ymm14[5,6],ymm3[7,8],ymm14[9,10,11],ymm3[12],ymm14[13,14],ymm3[15]
11562; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13,4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13]
11563; AVX2-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
11564; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
11565; AVX2-FCP-NEXT:    vpblendvb %ymm11, %ymm0, %ymm3, %ymm14
11566; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11567; AVX2-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm0
11568; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
11569; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm3, %xmm1
11570; AVX2-FCP-NEXT:    vpor %xmm0, %xmm1, %xmm0
11571; AVX2-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm1
11572; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm15[1,2,3],ymm1[4],ymm15[5,6],ymm1[7,8],ymm15[9,10,11],ymm1[12],ymm15[13,14],ymm1[15]
11573; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
11574; AVX2-FCP-NEXT:    vpblendvb %ymm11, %ymm0, %ymm1, %ymm15
11575; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11576; AVX2-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm0
11577; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,3,10,128,128,128,6,13,u,u,u,u,u,u,u]
11578; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
11579; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [5,12,128,128,1,8,15,128,128,u,u,u,u,u,u,u]
11580; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
11581; AVX2-FCP-NEXT:    vpor %xmm0, %xmm3, %xmm0
11582; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11583; AVX2-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm3
11584; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15]
11585; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14,5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14]
11586; AVX2-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
11587; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
11588; AVX2-FCP-NEXT:    vpblendvb %ymm11, %ymm0, %ymm3, %ymm0
11589; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
11590; AVX2-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm3
11591; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
11592; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
11593; AVX2-FCP-NEXT:    vpor %xmm1, %xmm2, %xmm1
11594; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11595; AVX2-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm2
11596; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15]
11597; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
11598; AVX2-FCP-NEXT:    vpblendvb %ymm11, %ymm1, %ymm2, %ymm1
11599; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [6,13,128,128,2,9,128,128,128,u,u,u,u,u,u,u]
11600; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11601; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm4, %xmm2
11602; AVX2-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm4
11603; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [128,128,4,11,128,128,0,7,14,u,u,u,u,u,u,u]
11604; AVX2-FCP-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
11605; AVX2-FCP-NEXT:    vpor %xmm2, %xmm4, %xmm2
11606; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11607; AVX2-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm4
11608; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7,8],ymm4[9],ymm6[10,11,12],ymm4[13],ymm6[14,15]
11609; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15,6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15]
11610; AVX2-FCP-NEXT:    # ymm6 = mem[0,1,0,1]
11611; AVX2-FCP-NEXT:    vpshufb %ymm6, %ymm4, %ymm4
11612; AVX2-FCP-NEXT:    vpblendvb %ymm11, %ymm2, %ymm4, %ymm2
11613; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11614; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
11615; AVX2-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm4
11616; AVX2-FCP-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
11617; AVX2-FCP-NEXT:    vpor %xmm3, %xmm4, %xmm3
11618; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
11619; AVX2-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm4
11620; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7,8],ymm4[9],ymm5[10,11,12],ymm4[13],ymm5[14,15]
11621; AVX2-FCP-NEXT:    vpshufb %ymm6, %ymm4, %ymm4
11622; AVX2-FCP-NEXT:    vpblendvb %ymm11, %ymm3, %ymm4, %ymm3
11623; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
11624; AVX2-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm4
11625; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [u,u,128,128,128,5,12,128,128,1,8,15,u,u,u,u]
11626; AVX2-FCP-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
11627; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [u,u,0,7,14,128,128,3,10,128,128,128,u,u,u,u]
11628; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm7, %xmm7
11629; AVX2-FCP-NEXT:    vpor %xmm4, %xmm7, %xmm4
11630; AVX2-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
11631; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [1,3,1,2,1,3,5,6]
11632; AVX2-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload
11633; AVX2-FCP-NEXT:    vpbroadcastd {{.*#+}} ymm10 = [2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15]
11634; AVX2-FCP-NEXT:    vpshufb %ymm10, %ymm8, %ymm8
11635; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7]
11636; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
11637; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm8[0],ymm4[1,2,3,4,5,6,7],ymm8[8],ymm4[9,10,11,12,13,14,15]
11638; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
11639; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
11640; AVX2-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm8
11641; AVX2-FCP-NEXT:    vpshufb %xmm5, %xmm8, %xmm5
11642; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm11, %xmm6
11643; AVX2-FCP-NEXT:    vpor %xmm5, %xmm6, %xmm5
11644; AVX2-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload
11645; AVX2-FCP-NEXT:    vpshufb %ymm10, %ymm6, %ymm6
11646; AVX2-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
11647; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
11648; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11649; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3,4,5,6,7],ymm6[8],ymm5[9,10,11,12,13,14,15]
11650; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
11651; AVX2-FCP-NEXT:    vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload
11652; AVX2-FCP-NEXT:    # ymm6 = ymm9[0],mem[1,2,3,4,5,6,7],ymm9[8],mem[9,10,11,12,13,14,15]
11653; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
11654; AVX2-FCP-NEXT:    vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 32-byte Folded Reload
11655; AVX2-FCP-NEXT:    # ymm7 = ymm12[0],mem[1,2,3,4,5,6,7],ymm12[8],mem[9,10,11,12,13,14,15]
11656; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3],ymm7[4,5,6,7]
11657; AVX2-FCP-NEXT:    vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload
11658; AVX2-FCP-NEXT:    # ymm8 = ymm14[0],mem[1,2,3,4,5,6,7],ymm14[8],mem[9,10,11,12,13,14,15]
11659; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
11660; AVX2-FCP-NEXT:    vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload
11661; AVX2-FCP-NEXT:    # ymm9 = ymm15[0],mem[1,2,3,4,5,6,7],ymm15[8],mem[9,10,11,12,13,14,15]
11662; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7]
11663; AVX2-FCP-NEXT:    vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
11664; AVX2-FCP-NEXT:    # ymm10 = ymm0[0],mem[1,2,3,4,5,6,7],ymm0[8],mem[9,10,11,12,13,14,15]
11665; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
11666; AVX2-FCP-NEXT:    vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload
11667; AVX2-FCP-NEXT:    # ymm10 = ymm1[0],mem[1,2,3,4,5,6,7],ymm1[8],mem[9,10,11,12,13,14,15]
11668; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7]
11669; AVX2-FCP-NEXT:    vpblendw $254, (%rsp), %ymm2, %ymm10 # 32-byte Folded Reload
11670; AVX2-FCP-NEXT:    # ymm10 = ymm2[0],mem[1,2,3,4,5,6,7],ymm2[8],mem[9,10,11,12,13,14,15]
11671; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
11672; AVX2-FCP-NEXT:    vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload
11673; AVX2-FCP-NEXT:    # ymm10 = ymm3[0],mem[1,2,3,4,5,6,7],ymm3[8],mem[9,10,11,12,13,14,15]
11674; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7]
11675; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11676; AVX2-FCP-NEXT:    vmovaps %ymm10, 32(%rsi)
11677; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11678; AVX2-FCP-NEXT:    vmovaps %ymm10, (%rsi)
11679; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11680; AVX2-FCP-NEXT:    vmovaps %ymm10, 32(%rdx)
11681; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11682; AVX2-FCP-NEXT:    vmovaps %ymm10, (%rdx)
11683; AVX2-FCP-NEXT:    vmovdqa %ymm4, 32(%rcx)
11684; AVX2-FCP-NEXT:    vmovdqa %ymm5, (%rcx)
11685; AVX2-FCP-NEXT:    vmovdqa %ymm6, 32(%r8)
11686; AVX2-FCP-NEXT:    vmovdqa %ymm7, (%r8)
11687; AVX2-FCP-NEXT:    vmovdqa %ymm8, 32(%r9)
11688; AVX2-FCP-NEXT:    vmovdqa %ymm9, (%r9)
11689; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
11690; AVX2-FCP-NEXT:    vmovdqa %ymm0, 32(%rax)
11691; AVX2-FCP-NEXT:    vmovdqa %ymm1, (%rax)
11692; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
11693; AVX2-FCP-NEXT:    vmovdqa %ymm2, 32(%rax)
11694; AVX2-FCP-NEXT:    vmovdqa %ymm3, (%rax)
11695; AVX2-FCP-NEXT:    addq $776, %rsp # imm = 0x308
11696; AVX2-FCP-NEXT:    vzeroupper
11697; AVX2-FCP-NEXT:    retq
11698;
11699; AVX512-LABEL: load_i8_stride7_vf64:
11700; AVX512:       # %bb.0:
11701; AVX512-NEXT:    vmovdqa {{.*#+}} ymm0 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
11702; AVX512-NEXT:    vmovdqa (%rdi), %ymm12
11703; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm13
11704; AVX512-NEXT:    vmovdqa64 64(%rdi), %ymm31
11705; AVX512-NEXT:    vmovdqa %ymm0, %ymm1
11706; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm24
11707; AVX512-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm13 ^ (ymm1 & (ymm12 ^ ymm13))
11708; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
11709; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15,u,u,u,u,u,u]
11710; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u]
11711; AVX512-NEXT:    vpor %xmm2, %xmm1, %xmm1
11712; AVX512-NEXT:    vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
11713; AVX512-NEXT:    vmovdqa64 96(%rdi), %ymm19
11714; AVX512-NEXT:    vmovdqa %ymm9, %ymm2
11715; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm31 ^ (ymm2 & (ymm19 ^ ymm31))
11716; AVX512-NEXT:    vmovdqa 80(%rdi), %xmm11
11717; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm11[2],ymm2[3,4],ymm11[5],ymm2[6,7,8,9],ymm11[10],ymm2[11,12],ymm11[13],ymm2[14,15]
11718; AVX512-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
11719; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm1 & mem)
11720; AVX512-NEXT:    vmovdqa {{.*#+}} ymm14 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
11721; AVX512-NEXT:    vmovdqa64 128(%rdi), %ymm21
11722; AVX512-NEXT:    vmovdqa64 160(%rdi), %ymm29
11723; AVX512-NEXT:    vmovdqa %ymm14, %ymm1
11724; AVX512-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm21 ^ (ymm1 & (ymm29 ^ ymm21))
11725; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm3
11726; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u]
11727; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
11728; AVX512-NEXT:    vpor %xmm3, %xmm1, %xmm1
11729; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
11730; AVX512-NEXT:    vmovdqa 192(%rdi), %xmm0
11731; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11]
11732; AVX512-NEXT:    vpshufb %xmm4, %xmm0, %xmm3
11733; AVX512-NEXT:    vmovdqa64 %xmm4, %xmm27
11734; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm20
11735; AVX512-NEXT:    vmovdqa 208(%rdi), %xmm10
11736; AVX512-NEXT:    vpshufb {{.*#+}} xmm5 = xmm10[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
11737; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
11738; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
11739; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
11740; AVX512-NEXT:    vmovdqa 240(%rdi), %xmm3
11741; AVX512-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm3[5,12,u,u,u,u,u,u,u,u,u,u,u]
11742; AVX512-NEXT:    vmovdqa 224(%rdi), %xmm6
11743; AVX512-NEXT:    vpshufb {{.*#+}} xmm7 = xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u,u]
11744; AVX512-NEXT:    vpor %xmm5, %xmm7, %xmm5
11745; AVX512-NEXT:    vinserti32x4 $2, %xmm5, %zmm1, %zmm22
11746; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
11747; AVX512-NEXT:    vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm4 & (zmm22 ^ zmm2))
11748; AVX512-NEXT:    vmovdqa64 288(%rdi), %ymm18
11749; AVX512-NEXT:    vmovdqa64 256(%rdi), %ymm16
11750; AVX512-NEXT:    vmovdqa %ymm9, %ymm2
11751; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm18 ^ (ymm2 & (ymm16 ^ ymm18))
11752; AVX512-NEXT:    vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,u,3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u]
11753; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm2
11754; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u]
11755; AVX512-NEXT:    vpor %xmm5, %xmm2, %xmm2
11756; AVX512-NEXT:    vmovdqa64 352(%rdi), %ymm17
11757; AVX512-NEXT:    vmovdqa64 320(%rdi), %ymm28
11758; AVX512-NEXT:    vmovdqa %ymm14, %ymm7
11759; AVX512-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm17 ^ (ymm7 & (ymm28 ^ ymm17))
11760; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1]
11761; AVX512-NEXT:    vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4,5],ymm8[6],ymm7[7,8,9],ymm8[10],ymm7[11,12,13],ymm8[14],ymm7[15]
11762; AVX512-NEXT:    vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
11763; AVX512-NEXT:    vmovdqa64 {{.*#+}} ymm23 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535]
11764; AVX512-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm2 & ymm23)
11765; AVX512-NEXT:    vmovdqa {{.*#+}} ymm7 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
11766; AVX512-NEXT:    vmovdqa %ymm7, %ymm2
11767; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm13 ^ (ymm2 & (ymm12 ^ ymm13))
11768; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm15
11769; AVX512-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u,u,u]
11770; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u,u,u]
11771; AVX512-NEXT:    vpor %xmm2, %xmm15, %xmm2
11772; AVX512-NEXT:    vmovdqa64 %ymm24, %ymm15
11773; AVX512-NEXT:    vmovdqa64 %ymm24, %ymm5
11774; AVX512-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm19 ^ (ymm15 & (ymm31 ^ ymm19))
11775; AVX512-NEXT:    vpblendw {{.*#+}} ymm15 = ymm15[0,1],ymm11[2],ymm15[3,4,5],ymm11[6],ymm15[7,8,9],ymm11[10],ymm15[11,12,13],ymm11[14],ymm15[15]
11776; AVX512-NEXT:    vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
11777; AVX512-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm2 & ~mem)
11778; AVX512-NEXT:    vmovdqa %ymm9, %ymm2
11779; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm21 ^ (ymm2 & (ymm29 ^ ymm21))
11780; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm2[u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u]
11781; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm2
11782; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u]
11783; AVX512-NEXT:    vpor %xmm0, %xmm2, %xmm0
11784; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm3[6,13,u,u,u,u,u,u,u,u,u,u,u]
11785; AVX512-NEXT:    vmovdqa64 %xmm3, %xmm25
11786; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm6[1,8,15],zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u,u]
11787; AVX512-NEXT:    vmovdqa64 %xmm6, %xmm26
11788; AVX512-NEXT:    vpor %xmm2, %xmm3, %xmm2
11789; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm10[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
11790; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm1
11791; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
11792; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
11793; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
11794; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
11795; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
11796; AVX512-NEXT:    vinserti32x4 $2, %xmm2, %zmm0, %zmm24
11797; AVX512-NEXT:    vpternlogq {{.*#+}} zmm24 = zmm24 ^ (zmm4 & (zmm24 ^ zmm15))
11798; AVX512-NEXT:    vmovdqa %ymm14, %ymm0
11799; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm13 ^ (ymm0 & (ymm12 ^ ymm13))
11800; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u,u,u,u,u]
11801; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
11802; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[0,7,14],zero,zero,xmm0[3,10,u,u,u,u,u,u,u]
11803; AVX512-NEXT:    vpor %xmm2, %xmm0, %xmm0
11804; AVX512-NEXT:    vmovdqa %ymm7, %ymm2
11805; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm19 ^ (ymm2 & (ymm31 ^ ymm19))
11806; AVX512-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11807; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm11[3],ymm2[4,5],ymm11[6],ymm2[7,8,9,10],ymm11[11],ymm2[12,13],ymm11[14],ymm2[15]
11808; AVX512-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
11809; AVX512-NEXT:    vpmovsxdq {{.*#+}} ymm15 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
11810; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm0 & ymm15)
11811; AVX512-NEXT:    vmovdqa %ymm5, %ymm0
11812; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm29 ^ (ymm0 & (ymm21 ^ ymm29))
11813; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm3
11814; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u,u,u]
11815; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u]
11816; AVX512-NEXT:    vpor %xmm3, %xmm0, %xmm0
11817; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
11818; AVX512-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11819; AVX512-NEXT:    vmovdqa64 %xmm27, %xmm3
11820; AVX512-NEXT:    vpshufb %xmm3, %xmm10, %xmm3
11821; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
11822; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm5
11823; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
11824; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
11825; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
11826; AVX512-NEXT:    vmovdqa64 %xmm26, %xmm1
11827; AVX512-NEXT:    vmovdqa64 %xmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11828; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
11829; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm4
11830; AVX512-NEXT:    vmovdqa64 %xmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11831; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
11832; AVX512-NEXT:    vpor %xmm3, %xmm6, %xmm3
11833; AVX512-NEXT:    vinserti32x4 $2, %xmm3, %zmm0, %zmm25
11834; AVX512-NEXT:    vpmovsxdq {{.*#+}} zmm20 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
11835; AVX512-NEXT:    vpternlogq {{.*#+}} zmm25 = zmm2 ^ (zmm20 & (zmm25 ^ zmm2))
11836; AVX512-NEXT:    vmovdqa %ymm9, %ymm0
11837; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm13 ^ (ymm0 & (ymm12 ^ ymm13))
11838; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u]
11839; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
11840; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u,u,u]
11841; AVX512-NEXT:    vpor %xmm2, %xmm0, %xmm0
11842; AVX512-NEXT:    vmovdqa %ymm14, %ymm2
11843; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm19 ^ (ymm2 & (ymm31 ^ ymm19))
11844; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm11[0],ymm2[1,2],ymm11[3],ymm2[4,5,6],ymm11[7,8],ymm2[9,10],ymm11[11],ymm2[12,13,14],ymm11[15]
11845; AVX512-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
11846; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm0 & ymm15)
11847; AVX512-NEXT:    vmovdqa %ymm15, %ymm11
11848; AVX512-NEXT:    vmovdqa %ymm7, %ymm0
11849; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm29 ^ (ymm0 & (ymm21 ^ ymm29))
11850; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
11851; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[2,9,u,u,u,u,u]
11852; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u]
11853; AVX512-NEXT:    vpor %xmm2, %xmm0, %xmm0
11854; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
11855; AVX512-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
11856; AVX512-NEXT:    vpshufb %xmm2, %xmm10, %xmm6
11857; AVX512-NEXT:    vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
11858; AVX512-NEXT:    vmovdqa %xmm5, %xmm10
11859; AVX512-NEXT:    vpor %xmm6, %xmm15, %xmm6
11860; AVX512-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
11861; AVX512-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm0))
11862; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
11863; AVX512-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,xmm4[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
11864; AVX512-NEXT:    vpor %xmm0, %xmm15, %xmm0
11865; AVX512-NEXT:    vmovdqa64 416(%rdi), %ymm26
11866; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm6, %zmm30
11867; AVX512-NEXT:    vmovdqa64 384(%rdi), %ymm27
11868; AVX512-NEXT:    vpternlogq {{.*#+}} zmm30 = zmm3 ^ (zmm20 & (zmm30 ^ zmm3))
11869; AVX512-NEXT:    vmovdqa %ymm7, %ymm0
11870; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm26 ^ (ymm0 & (ymm27 ^ ymm26))
11871; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm3
11872; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9]
11873; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero
11874; AVX512-NEXT:    vpor %xmm3, %xmm0, %xmm0
11875; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
11876; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
11877; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm1 & (ymm0 ^ ymm8))
11878; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm20
11879; AVX512-NEXT:    vpmovsxwd {{.*#+}} zmm8 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0,0,0,0,0]
11880; AVX512-NEXT:    vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm8 & (zmm20 ^ zmm22))
11881; AVX512-NEXT:    vmovdqa %ymm7, %ymm0
11882; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm16 ^ (ymm0 & (ymm18 ^ ymm16))
11883; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm3
11884; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u]
11885; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u]
11886; AVX512-NEXT:    vpor %xmm3, %xmm0, %xmm0
11887; AVX512-NEXT:    vmovdqa %ymm9, %ymm3
11888; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm17 ^ (ymm3 & (ymm28 ^ ymm17))
11889; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1]
11890; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3],ymm3[4,5],ymm6[6],ymm3[7,8,9,10],ymm6[11],ymm3[12,13],ymm6[14],ymm3[15]
11891; AVX512-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
11892; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm0 & ymm23)
11893; AVX512-NEXT:    vmovdqa %ymm14, %ymm0
11894; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm26 ^ (ymm0 & (ymm27 ^ ymm26))
11895; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm0[5,12],zero,zero
11896; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
11897; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero,xmm0[3,10]
11898; AVX512-NEXT:    vpor %xmm6, %xmm0, %xmm0
11899; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
11900; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm1 & (ymm0 ^ ymm3))
11901; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm22
11902; AVX512-NEXT:    vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm8 & (zmm22 ^ zmm24))
11903; AVX512-NEXT:    vmovdqa %ymm14, %ymm0
11904; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm16 ^ (ymm0 & (ymm18 ^ ymm16))
11905; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm3
11906; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u]
11907; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u]
11908; AVX512-NEXT:    vpor %xmm3, %xmm0, %xmm0
11909; AVX512-NEXT:    vmovdqa %ymm7, %ymm3
11910; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm28 ^ (ymm3 & (ymm17 ^ ymm28))
11911; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1]
11912; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm6[0],ymm3[1,2],ymm6[3],ymm3[4,5,6],ymm6[7,8],ymm3[9,10],ymm6[11],ymm3[12,13,14],ymm6[15]
11913; AVX512-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
11914; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm0 & ymm23)
11915; AVX512-NEXT:    vmovdqa %ymm9, %ymm0
11916; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm26 ^ (ymm0 & (ymm27 ^ ymm26))
11917; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero
11918; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
11919; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11]
11920; AVX512-NEXT:    vpor %xmm6, %xmm0, %xmm0
11921; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
11922; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm1 & (ymm0 ^ ymm3))
11923; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm24
11924; AVX512-NEXT:    vpternlogq {{.*#+}} zmm24 = zmm24 ^ (zmm8 & (zmm24 ^ zmm25))
11925; AVX512-NEXT:    vmovdqa %ymm14, %ymm0
11926; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm28 ^ (ymm0 & (ymm17 ^ ymm28))
11927; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
11928; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6],ymm3[7,8],ymm0[9,10,11],ymm3[12],ymm0[13,14],ymm3[15]
11929; AVX512-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
11930; AVX512-NEXT:    vmovdqa %ymm9, %ymm2
11931; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm16 ^ (ymm2 & (ymm18 ^ ymm16))
11932; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u]
11933; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm2
11934; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u]
11935; AVX512-NEXT:    vpor %xmm3, %xmm2, %xmm2
11936; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm23) | ymm0
11937; AVX512-NEXT:    vmovdqa %ymm7, %ymm0
11938; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm27 ^ (ymm0 & (ymm26 ^ ymm27))
11939; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm3
11940; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12]
11941; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,4,11],zero,zero,xmm0[0,7,14],zero,zero
11942; AVX512-NEXT:    vpor %xmm3, %xmm0, %xmm0
11943; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
11944; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm1 & (ymm0 ^ ymm2))
11945; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm25
11946; AVX512-NEXT:    vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm8 & (zmm25 ^ zmm30))
11947; AVX512-NEXT:    vmovdqa %ymm9, %ymm0
11948; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm28 ^ (ymm0 & (ymm17 ^ ymm28))
11949; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
11950; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14,15]
11951; AVX512-NEXT:    vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
11952; AVX512-NEXT:    vmovdqa %ymm4, %ymm2
11953; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm18 ^ (ymm2 & (ymm16 ^ ymm18))
11954; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
11955; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u]
11956; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u]
11957; AVX512-NEXT:    vpor %xmm3, %xmm2, %xmm2
11958; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
11959; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm23) | ymm0
11960; AVX512-NEXT:    vmovdqa %ymm14, %ymm0
11961; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm27 ^ (ymm0 & (ymm26 ^ ymm27))
11962; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm3
11963; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13]
11964; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero
11965; AVX512-NEXT:    vpor %xmm3, %xmm0, %xmm0
11966; AVX512-NEXT:    vinserti32x4 $1, %xmm0, %ymm0, %ymm30
11967; AVX512-NEXT:    vpternlogq {{.*#+}} ymm30 = ymm30 ^ (ymm1 & (ymm30 ^ ymm2))
11968; AVX512-NEXT:    vmovdqa %ymm4, %ymm0
11969; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm17 ^ (ymm0 & (ymm28 ^ ymm17))
11970; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
11971; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15]
11972; AVX512-NEXT:    vmovdqa %ymm7, %ymm2
11973; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm18 ^ (ymm2 & (ymm16 ^ ymm18))
11974; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
11975; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u]
11976; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u]
11977; AVX512-NEXT:    vpor %xmm3, %xmm2, %xmm2
11978; AVX512-NEXT:    vmovdqa {{.*#+}} ymm5 = [u,u,u,u,128,128,128,128,128,128,128,128,128,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
11979; AVX512-NEXT:    vpshufb %ymm5, %ymm0, %ymm0
11980; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = (ymm2 & ~mem) | ymm0
11981; AVX512-NEXT:    vmovdqa %ymm9, %ymm0
11982; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm27 ^ (ymm0 & (ymm26 ^ ymm27))
11983; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero
11984; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
11985; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14]
11986; AVX512-NEXT:    vpor %xmm3, %xmm0, %xmm0
11987; AVX512-NEXT:    vinserti32x4 $1, %xmm0, %ymm0, %ymm23
11988; AVX512-NEXT:    vpternlogq {{.*#+}} ymm23 = ymm23 ^ (ymm1 & (ymm23 ^ ymm2))
11989; AVX512-NEXT:    vmovdqa %ymm7, %ymm0
11990; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm13 ^ ymm12))
11991; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
11992; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u]
11993; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u]
11994; AVX512-NEXT:    vpor %xmm2, %xmm0, %xmm0
11995; AVX512-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm26 ^ (ymm4 & (ymm27 ^ ymm26))
11996; AVX512-NEXT:    vmovdqa %ymm14, %ymm2
11997; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm12 ^ (ymm2 & (ymm13 ^ ymm12))
11998; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
11999; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u,u,u,u]
12000; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,zero,xmm2[u,u,u,u,u,u,u]
12001; AVX512-NEXT:    vpor %xmm3, %xmm2, %xmm2
12002; AVX512-NEXT:    vmovdqa %ymm9, %ymm3
12003; AVX512-NEXT:    vmovdqa %ymm9, %ymm15
12004; AVX512-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm12 ^ (ymm9 & (ymm13 ^ ymm12))
12005; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm19 ^ (ymm3 & (ymm31 ^ ymm19))
12006; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = xmm9[6,13],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u]
12007; AVX512-NEXT:    vextracti128 $1, %ymm9, %xmm9
12008; AVX512-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[4,11],zero,zero,xmm9[0,7,14,u,u,u,u,u,u,u]
12009; AVX512-NEXT:    vpor %xmm6, %xmm9, %xmm6
12010; AVX512-NEXT:    vmovdqa %ymm14, %ymm12
12011; AVX512-NEXT:    vpternlogq {{.*#+}} ymm16 = ymm18 ^ (ymm14 & (ymm16 ^ ymm18))
12012; AVX512-NEXT:    vmovdqa %ymm7, %ymm9
12013; AVX512-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm31 ^ (ymm9 & (ymm19 ^ ymm31))
12014; AVX512-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm31 ^ (ymm14 & (ymm19 ^ ymm31))
12015; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
12016; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm8[0],ymm3[1,2,3],ymm8[4],ymm3[5,6],ymm8[7,8],ymm3[9,10,11],ymm8[12],ymm3[13,14],ymm8[15]
12017; AVX512-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
12018; AVX512-NEXT:    vmovdqa %ymm11, %ymm1
12019; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm0 & ymm11)
12020; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13,14,15]
12021; AVX512-NEXT:    vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
12022; AVX512-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm2 & ymm1)
12023; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm14[0],ymm8[1],ymm14[2,3,4],ymm8[5],ymm14[6,7,8],ymm8[9],ymm14[10,11,12],ymm8[13],ymm14[14,15]
12024; AVX512-NEXT:    vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
12025; AVX512-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm6 & ymm1)
12026; AVX512-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm29 ^ (ymm12 & (ymm21 ^ ymm29))
12027; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm12[u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u]
12028; AVX512-NEXT:    vextracti128 $1, %ymm12, %xmm2
12029; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u]
12030; AVX512-NEXT:    vpor %xmm0, %xmm2, %xmm0
12031; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
12032; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
12033; AVX512-NEXT:    vpshufb %xmm1, %xmm12, %xmm2
12034; AVX512-NEXT:    vmovdqa %xmm10, %xmm13
12035; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = xmm10[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
12036; AVX512-NEXT:    vpor %xmm2, %xmm6, %xmm2
12037; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
12038; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
12039; AVX512-NEXT:    vpmovsxdq {{.*#+}} ymm1 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
12040; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm1 & (ymm2 ^ ymm0))
12041; AVX512-NEXT:    vpternlogq {{.*#+}} ymm28 = ymm17 ^ (ymm7 & (ymm28 ^ ymm17))
12042; AVX512-NEXT:    vmovd {{.*#+}} xmm10 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
12043; AVX512-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm29 ^ (ymm15 & (ymm21 ^ ymm29))
12044; AVX512-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm21 ^ (ymm7 & (ymm29 ^ ymm21))
12045; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
12046; AVX512-NEXT:    vpshufb %xmm10, %xmm8, %xmm0
12047; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
12048; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = xmm14[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
12049; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
12050; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm2
12051; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
12052; AVX512-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm0 & (zmm2 ^ zmm3))
12053; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm15[u,u,3,10],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[u,u,u,u,u]
12054; AVX512-NEXT:    vextracti128 $1, %ymm15, %xmm6
12055; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,11,u,u,u,u,u]
12056; AVX512-NEXT:    vpor %xmm3, %xmm6, %xmm3
12057; AVX512-NEXT:    vpshufb %xmm5, %xmm12, %xmm6
12058; AVX512-NEXT:    vmovdqa %xmm12, %xmm15
12059; AVX512-NEXT:    vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
12060; AVX512-NEXT:    vpor %xmm6, %xmm12, %xmm6
12061; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
12062; AVX512-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
12063; AVX512-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm6 ^ (ymm1 & (ymm6 ^ ymm3))
12064; AVX512-NEXT:    vextracti128 $1, %ymm7, %xmm3
12065; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u]
12066; AVX512-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,4,11],zero,zero,xmm7[0,7,14],zero,zero,xmm7[u,u,u,u,u]
12067; AVX512-NEXT:    vpor %xmm3, %xmm7, %xmm3
12068; AVX512-NEXT:    vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,8,15,22,29,20,27,18,25,128,128,128,128,128,128,128,128,128,128]
12069; AVX512-NEXT:    vpshufb %xmm7, %xmm15, %xmm12
12070; AVX512-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
12071; AVX512-NEXT:    vpor %xmm13, %xmm12, %xmm12
12072; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
12073; AVX512-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
12074; AVX512-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm1 & (ymm12 ^ ymm3))
12075; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm14[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
12076; AVX512-NEXT:    vpshufb {{.*#+}} xmm13 = xmm8[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
12077; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3]
12078; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm6, %zmm3
12079; AVX512-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm0 & (zmm3 ^ zmm11))
12080; AVX512-NEXT:    vpshufb %xmm10, %xmm14, %xmm6
12081; AVX512-NEXT:    vpshufb {{.*#+}} xmm10 = xmm8[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
12082; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3]
12083; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm12, %zmm6
12084; AVX512-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm0 & (zmm6 ^ zmm9))
12085; AVX512-NEXT:    vinserti64x4 $1, %ymm30, %zmm0, %zmm0
12086; AVX512-NEXT:    movw $-512, %ax # imm = 0xFE00
12087; AVX512-NEXT:    kmovw %eax, %k1
12088; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
12089; AVX512-NEXT:    vinserti64x4 $1, %ymm23, %zmm0, %zmm0
12090; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm3 {%k1}
12091; AVX512-NEXT:    vmovdqa64 %ymm28, %ymm1
12092; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm28[2,3,0,1]
12093; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
12094; AVX512-NEXT:    vpshufb %ymm7, %ymm0, %ymm0
12095; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm1
12096; AVX512-NEXT:    vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u]
12097; AVX512-NEXT:    vextracti32x4 $1, %ymm16, %xmm1
12098; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u]
12099; AVX512-NEXT:    vpor %xmm5, %xmm1, %xmm1
12100; AVX512-NEXT:    vpternlogq {{.*#+}} ymm1 = (ymm1 & mem) | ymm0
12101; AVX512-NEXT:    vextracti128 $1, %ymm4, %xmm0
12102; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15]
12103; AVX512-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,0,7,14],zero,zero,xmm4[3,10],zero,zero,zero
12104; AVX512-NEXT:    vpor %xmm0, %xmm4, %xmm0
12105; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
12106; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
12107; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12108; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
12109; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm6 {%k1}
12110; AVX512-NEXT:    vmovdqa64 %zmm20, (%rsi)
12111; AVX512-NEXT:    vmovdqa64 %zmm22, (%rdx)
12112; AVX512-NEXT:    vmovdqa64 %zmm24, (%rcx)
12113; AVX512-NEXT:    vmovdqa64 %zmm25, (%r8)
12114; AVX512-NEXT:    vmovdqa64 %zmm2, (%r9)
12115; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
12116; AVX512-NEXT:    vmovdqa64 %zmm3, (%rax)
12117; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
12118; AVX512-NEXT:    vmovdqa64 %zmm6, (%rax)
12119; AVX512-NEXT:    vzeroupper
12120; AVX512-NEXT:    retq
12121;
12122; AVX512-FCP-LABEL: load_i8_stride7_vf64:
12123; AVX512-FCP:       # %bb.0:
12124; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm19 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
12125; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %ymm20
12126; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm12
12127; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %ymm27
12128; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm0
12129; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm20 ^ ymm12))
12130; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
12131; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u]
12132; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u]
12133; AVX512-FCP-NEXT:    vpor %xmm1, %xmm0, %xmm0
12134; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
12135; AVX512-FCP-NEXT:    vmovdqa64 96(%rdi), %ymm31
12136; AVX512-FCP-NEXT:    vmovdqa %ymm10, %ymm1
12137; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm27 ^ (ymm1 & (ymm31 ^ ymm27))
12138; AVX512-FCP-NEXT:    vmovdqa 80(%rdi), %xmm6
12139; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm6[2],ymm1[3,4],ymm6[5],ymm1[6,7,8,9],ymm6[10],ymm1[11,12],ymm6[13],ymm1[14,15]
12140; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
12141; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm0 & mem)
12142; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
12143; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %ymm28
12144; AVX512-FCP-NEXT:    vmovdqa64 160(%rdi), %ymm30
12145; AVX512-FCP-NEXT:    vmovdqa %ymm9, %ymm0
12146; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm28 ^ (ymm0 & (ymm30 ^ ymm28))
12147; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm2
12148; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u]
12149; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u]
12150; AVX512-FCP-NEXT:    vpor %xmm2, %xmm0, %xmm0
12151; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
12152; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,2,4,6]
12153; AVX512-FCP-NEXT:    vmovdqa 192(%rdi), %ymm3
12154; AVX512-FCP-NEXT:    vpermd %ymm3, %ymm2, %ymm2
12155; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
12156; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
12157; AVX512-FCP-NEXT:    vmovdqa 240(%rdi), %xmm4
12158; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm4[5,12,u,u,u,u,u,u,u,u,u,u,u]
12159; AVX512-FCP-NEXT:    vmovdqa 224(%rdi), %xmm5
12160; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u]
12161; AVX512-FCP-NEXT:    vpor %xmm2, %xmm7, %xmm2
12162; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm2, %zmm0, %zmm8
12163; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm21 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
12164; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm21 & (zmm8 ^ zmm1))
12165; AVX512-FCP-NEXT:    vmovdqa64 288(%rdi), %ymm16
12166; AVX512-FCP-NEXT:    vmovdqa 256(%rdi), %ymm11
12167; AVX512-FCP-NEXT:    vmovdqa %ymm10, %ymm0
12168; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm16 ^ (ymm0 & (ymm11 ^ ymm16))
12169; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[u,u,u,u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u]
12170; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm0
12171; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u]
12172; AVX512-FCP-NEXT:    vpor %xmm2, %xmm0, %xmm0
12173; AVX512-FCP-NEXT:    vmovdqa 352(%rdi), %ymm14
12174; AVX512-FCP-NEXT:    vmovdqa 320(%rdi), %ymm2
12175; AVX512-FCP-NEXT:    vmovdqa %ymm9, %ymm7
12176; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm14 ^ (ymm7 & (ymm2 ^ ymm14))
12177; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm7[2,3,0,1]
12178; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm13[2],ymm7[3,4,5],ymm13[6],ymm7[7,8,9],ymm13[10],ymm7[11,12,13],ymm13[14],ymm7[15]
12179; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
12180; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm26 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535]
12181; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm0 & ymm26)
12182; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
12183; AVX512-FCP-NEXT:    vmovdqa64 416(%rdi), %ymm17
12184; AVX512-FCP-NEXT:    vmovdqa64 384(%rdi), %ymm18
12185; AVX512-FCP-NEXT:    vmovdqa %ymm1, %ymm7
12186; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm17 ^ (ymm7 & (ymm18 ^ ymm17))
12187; AVX512-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm15
12188; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9]
12189; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm7[4,11],zero,zero
12190; AVX512-FCP-NEXT:    vpor %xmm7, %xmm15, %xmm7
12191; AVX512-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm15
12192; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm23 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
12193; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm15 ^ (ymm23 & (ymm15 ^ ymm13))
12194; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm0, %zmm0
12195; AVX512-FCP-NEXT:    vpmovsxwd {{.*#+}} zmm29 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0,0,0,0,0]
12196; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm29 & (zmm0 ^ zmm8))
12197; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12198; AVX512-FCP-NEXT:    vmovdqa %ymm1, %ymm8
12199; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm12 ^ (ymm8 & (ymm20 ^ ymm12))
12200; AVX512-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm13
12201; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[6,13],zero,zero,xmm13[2,9,u,u,u,u,u,u,u]
12202; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[1,8,15],zero,zero,xmm8[4,11],zero,zero,xmm8[u,u,u,u,u,u,u]
12203; AVX512-FCP-NEXT:    vpor %xmm13, %xmm8, %xmm8
12204; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm13
12205; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm31 ^ (ymm13 & (ymm27 ^ ymm31))
12206; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm6[2],ymm13[3,4,5],ymm6[6],ymm13[7,8,9],ymm6[10],ymm13[11,12,13],ymm6[14],ymm13[15]
12207; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
12208; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm8 & ~mem)
12209; AVX512-FCP-NEXT:    vmovdqa %ymm10, %ymm8
12210; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm28 ^ (ymm8 & (ymm30 ^ ymm28))
12211; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,6,13],zero,zero,xmm8[2,9],zero,zero,zero,xmm8[u,u,u,u]
12212; AVX512-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm8
12213; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u],zero,zero,xmm8[4,11],zero,zero,xmm8[0,7,14,u,u,u,u]
12214; AVX512-FCP-NEXT:    vpor %xmm15, %xmm8, %xmm8
12215; AVX512-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
12216; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm15 = [0,0,0,0,1,3,4,6]
12217; AVX512-FCP-NEXT:    vpermd %ymm3, %ymm15, %ymm15
12218; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
12219; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm15[7]
12220; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm4[6,13,u,u,u,u,u,u,u,u,u,u,u]
12221; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm5[1,8,15],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u]
12222; AVX512-FCP-NEXT:    vpor %xmm7, %xmm15, %xmm7
12223; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm7, %zmm8, %zmm7
12224; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm21 & (zmm7 ^ zmm13))
12225; AVX512-FCP-NEXT:    vmovdqa %ymm1, %ymm8
12226; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm11 ^ (ymm8 & (ymm16 ^ ymm11))
12227; AVX512-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm13
12228; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[5,12,u,u]
12229; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u]
12230; AVX512-FCP-NEXT:    vpor %xmm13, %xmm8, %xmm8
12231; AVX512-FCP-NEXT:    vmovdqa %ymm10, %ymm13
12232; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm14 ^ (ymm13 & (ymm2 ^ ymm14))
12233; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1]
12234; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7,8,9,10],ymm15[11],ymm13[12,13],ymm15[14],ymm13[15]
12235; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
12236; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm8 & ymm26)
12237; AVX512-FCP-NEXT:    vmovdqa %ymm9, %ymm8
12238; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm17 ^ (ymm8 & (ymm18 ^ ymm17))
12239; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm8[5,12],zero,zero
12240; AVX512-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm8
12241; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10]
12242; AVX512-FCP-NEXT:    vpor %xmm15, %xmm8, %xmm8
12243; AVX512-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
12244; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm8 ^ (ymm23 & (ymm8 ^ ymm13))
12245; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm24
12246; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm24 = zmm24 ^ (zmm29 & (zmm24 ^ zmm7))
12247; AVX512-FCP-NEXT:    vmovdqa %ymm9, %ymm7
12248; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm12 ^ (ymm7 & (ymm20 ^ ymm12))
12249; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm7[2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u,u,u]
12250; AVX512-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm7
12251; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u,u,u]
12252; AVX512-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm7
12253; AVX512-FCP-NEXT:    vmovdqa %ymm1, %ymm8
12254; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm31 ^ (ymm8 & (ymm27 ^ ymm31))
12255; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7,8,9,10],ymm6[11],ymm8[12,13],ymm6[14],ymm8[15]
12256; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
12257; AVX512-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm25 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
12258; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm7 & ymm25)
12259; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm7
12260; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm30 ^ (ymm7 & (ymm28 ^ ymm30))
12261; AVX512-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm13
12262; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[1,8,15,u,u,u,u]
12263; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,0,7,14],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[u,u,u,u]
12264; AVX512-FCP-NEXT:    vpor %xmm7, %xmm13, %xmm7
12265; AVX512-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
12266; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [0,0,0,0,1,3,5,6]
12267; AVX512-FCP-NEXT:    vpermd %ymm3, %ymm13, %ymm3
12268; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
12269; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5,6],ymm3[7]
12270; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm5[2,9],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u]
12271; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
12272; AVX512-FCP-NEXT:    vpor %xmm7, %xmm13, %xmm7
12273; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm7, %zmm3, %zmm3
12274; AVX512-FCP-NEXT:    vpmovsxdq {{.*#+}} zmm22 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
12275; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm8 ^ (zmm22 & (zmm3 ^ zmm8))
12276; AVX512-FCP-NEXT:    vmovdqa %ymm9, %ymm7
12277; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm11 ^ (ymm7 & (ymm16 ^ ymm11))
12278; AVX512-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm8
12279; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u],zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u]
12280; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,5,12],zero,zero,xmm7[1,8,15],zero,zero,xmm7[u,u]
12281; AVX512-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm7
12282; AVX512-FCP-NEXT:    vmovdqa %ymm1, %ymm8
12283; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm2 ^ (ymm8 & (ymm14 ^ ymm2))
12284; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm8[2,3,0,1]
12285; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm13[0],ymm8[1,2],ymm13[3],ymm8[4,5,6],ymm13[7,8],ymm8[9,10],ymm13[11],ymm8[12,13,14],ymm13[15]
12286; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
12287; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm7 & ymm26)
12288; AVX512-FCP-NEXT:    vmovdqa %ymm10, %ymm7
12289; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm17 ^ (ymm7 & (ymm18 ^ ymm17))
12290; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm7[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero
12291; AVX512-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm7
12292; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11]
12293; AVX512-FCP-NEXT:    vpor %xmm7, %xmm13, %xmm7
12294; AVX512-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
12295; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm23 & (ymm7 ^ ymm8))
12296; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm21
12297; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm29 & (zmm21 ^ zmm3))
12298; AVX512-FCP-NEXT:    vmovdqa %ymm10, %ymm3
12299; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm12 ^ (ymm3 & (ymm20 ^ ymm12))
12300; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm3[3,10],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[u,u,u,u,u,u,u]
12301; AVX512-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm3
12302; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11,u,u,u,u,u,u,u]
12303; AVX512-FCP-NEXT:    vpor %xmm7, %xmm3, %xmm3
12304; AVX512-FCP-NEXT:    vmovdqa %ymm9, %ymm7
12305; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm31 ^ (ymm7 & (ymm27 ^ ymm31))
12306; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm6[0],ymm7[1,2],ymm6[3],ymm7[4,5,6],ymm6[7,8],ymm7[9,10],ymm6[11],ymm7[12,13,14],ymm6[15]
12307; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
12308; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm3 & ymm25)
12309; AVX512-FCP-NEXT:    vmovdqa %ymm1, %ymm3
12310; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm30 ^ (ymm3 & (ymm28 ^ ymm30))
12311; AVX512-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm8
12312; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[2,9,u,u,u,u,u]
12313; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u,u,u]
12314; AVX512-FCP-NEXT:    vpor %xmm3, %xmm8, %xmm8
12315; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
12316; AVX512-FCP-NEXT:    vmovdqa 208(%rdi), %xmm0
12317; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12318; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm0, %xmm0
12319; AVX512-FCP-NEXT:    vmovdqa 192(%rdi), %xmm15
12320; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm15[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
12321; AVX512-FCP-NEXT:    vpor %xmm0, %xmm3, %xmm0
12322; AVX512-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm3
12323; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
12324; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm3))
12325; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm5[3,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u]
12326; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
12327; AVX512-FCP-NEXT:    vpor %xmm3, %xmm4, %xmm3
12328; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm3, %zmm0, %zmm0
12329; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm7 ^ (zmm22 & (zmm0 ^ zmm7))
12330; AVX512-FCP-NEXT:    vmovdqa %ymm9, %ymm3
12331; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm2 ^ (ymm3 & (ymm14 ^ ymm2))
12332; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
12333; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3],ymm4[4],ymm3[5,6],ymm4[7,8],ymm3[9,10,11],ymm4[12],ymm3[13,14],ymm4[15]
12334; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm3, %ymm3
12335; AVX512-FCP-NEXT:    vmovdqa %ymm10, %ymm4
12336; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm11 ^ (ymm4 & (ymm16 ^ ymm11))
12337; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,6,13],zero,zero,xmm4[2,9],zero,zero,zero,xmm4[u,u]
12338; AVX512-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm4
12339; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,xmm4[4,11],zero,zero,xmm4[0,7,14,u,u]
12340; AVX512-FCP-NEXT:    vpor %xmm5, %xmm4, %xmm4
12341; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = (ymm4 & ymm26) | ymm3
12342; AVX512-FCP-NEXT:    vmovdqa %ymm1, %ymm3
12343; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm18 ^ (ymm3 & (ymm17 ^ ymm18))
12344; AVX512-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm5
12345; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[5,12]
12346; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero
12347; AVX512-FCP-NEXT:    vpor %xmm5, %xmm3, %xmm3
12348; AVX512-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
12349; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm23 & (ymm3 ^ ymm4))
12350; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm22
12351; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm29 & (zmm22 ^ zmm0))
12352; AVX512-FCP-NEXT:    vmovdqa %ymm10, %ymm0
12353; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm0 & (ymm14 ^ ymm2))
12354; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
12355; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6,7,8],ymm3[9],ymm0[10,11],ymm3[12],ymm0[13,14,15]
12356; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
12357; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm3
12358; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm16 ^ (ymm3 & (ymm11 ^ ymm16))
12359; AVX512-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm4
12360; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u]
12361; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u]
12362; AVX512-FCP-NEXT:    vpor %xmm4, %xmm3, %xmm3
12363; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm26) | ymm0
12364; AVX512-FCP-NEXT:    vmovdqa %ymm9, %ymm0
12365; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm18 ^ (ymm0 & (ymm17 ^ ymm18))
12366; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm4
12367; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13]
12368; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero
12369; AVX512-FCP-NEXT:    vpor %xmm4, %xmm0, %xmm0
12370; AVX512-FCP-NEXT:    vinserti32x4 $1, %xmm0, %ymm0, %ymm29
12371; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm29 = ymm29 ^ (ymm23 & (ymm29 ^ ymm3))
12372; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [u,u,u,u,128,128,128,128,128,128,128,128,128,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
12373; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm0
12374; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm2 ^ ymm14))
12375; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
12376; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7,8],ymm3[9],ymm0[10,11,12],ymm3[13],ymm0[14,15]
12377; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm0, %ymm0
12378; AVX512-FCP-NEXT:    vmovdqa %ymm1, %ymm3
12379; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm16 ^ (ymm3 & (ymm11 ^ ymm16))
12380; AVX512-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm7
12381; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u]
12382; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u]
12383; AVX512-FCP-NEXT:    vpor %xmm7, %xmm3, %xmm3
12384; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = (ymm3 & ~mem) | ymm0
12385; AVX512-FCP-NEXT:    vmovdqa %ymm10, %ymm0
12386; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm18 ^ (ymm0 & (ymm17 ^ ymm18))
12387; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero
12388; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm0
12389; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14]
12390; AVX512-FCP-NEXT:    vpor %xmm7, %xmm0, %xmm0
12391; AVX512-FCP-NEXT:    vinserti32x4 $1, %xmm0, %ymm0, %ymm26
12392; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm26 = ymm26 ^ (ymm23 & (ymm26 ^ ymm3))
12393; AVX512-FCP-NEXT:    vmovdqa %ymm1, %ymm13
12394; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm20 ^ (ymm13 & (ymm12 ^ ymm20))
12395; AVX512-FCP-NEXT:    vmovdqa %ymm10, %ymm7
12396; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm19 = ymm17 ^ (ymm19 & (ymm18 ^ ymm17))
12397; AVX512-FCP-NEXT:    vmovdqa %ymm9, %ymm0
12398; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm20 ^ (ymm0 & (ymm12 ^ ymm20))
12399; AVX512-FCP-NEXT:    vmovdqa %ymm10, %ymm8
12400; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm20 ^ (ymm10 & (ymm12 ^ ymm20))
12401; AVX512-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm3
12402; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u,u,u]
12403; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm13[4,11],zero,zero,xmm13[0,7,14],zero,zero,xmm13[u,u,u,u,u,u,u]
12404; AVX512-FCP-NEXT:    vpor %xmm3, %xmm12, %xmm3
12405; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm31 ^ (ymm7 & (ymm27 ^ ymm31))
12406; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm12
12407; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u]
12408; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u]
12409; AVX512-FCP-NEXT:    vpor %xmm0, %xmm12, %xmm0
12410; AVX512-FCP-NEXT:    vmovdqa %ymm9, %ymm12
12411; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm16 ^ (ymm9 & (ymm11 ^ ymm16))
12412; AVX512-FCP-NEXT:    vmovdqa %ymm1, %ymm13
12413; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm27 ^ (ymm13 & (ymm31 ^ ymm27))
12414; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm27 ^ (ymm9 & (ymm31 ^ ymm27))
12415; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm10[6,13],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u]
12416; AVX512-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm10
12417; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[4,11],zero,zero,xmm10[0,7,14,u,u,u,u,u,u,u]
12418; AVX512-FCP-NEXT:    vpor %xmm4, %xmm10, %xmm4
12419; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm6[0],ymm7[1,2,3],ymm6[4],ymm7[5,6],ymm6[7,8],ymm7[9,10,11],ymm6[12],ymm7[13,14],ymm6[15]
12420; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
12421; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm10 | (ymm3 & ymm25)
12422; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm13[0],ymm6[1],ymm13[2,3],ymm6[4],ymm13[5,6,7,8],ymm6[9],ymm13[10,11],ymm6[12],ymm13[13,14,15]
12423; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
12424; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm0 & ymm25)
12425; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm9[0],ymm6[1],ymm9[2,3,4],ymm6[5],ymm9[6,7,8],ymm6[9],ymm9[10,11,12],ymm6[13],ymm9[14,15]
12426; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
12427; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm4 & ymm25)
12428; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm14 ^ (ymm1 & (ymm2 ^ ymm14))
12429; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm30 ^ (ymm12 & (ymm28 ^ ymm30))
12430; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm30 ^ (ymm8 & (ymm28 ^ ymm30))
12431; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm28 ^ (ymm1 & (ymm30 ^ ymm28))
12432; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm12[u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u]
12433; AVX512-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm3
12434; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u,u,u]
12435; AVX512-FCP-NEXT:    vpor %xmm0, %xmm3, %xmm0
12436; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
12437; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
12438; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm12, %xmm3
12439; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm15[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
12440; AVX512-FCP-NEXT:    vpor %xmm3, %xmm4, %xmm3
12441; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
12442; AVX512-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
12443; AVX512-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm9 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
12444; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm9 & (ymm3 ^ ymm0))
12445; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm8[u,u,3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u]
12446; AVX512-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm4
12447; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11,u,u,u,u,u]
12448; AVX512-FCP-NEXT:    vpor %xmm0, %xmm4, %xmm0
12449; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm12, %xmm4
12450; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm15[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
12451; AVX512-FCP-NEXT:    vpor %xmm5, %xmm4, %xmm4
12452; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
12453; AVX512-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
12454; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm9 & (ymm4 ^ ymm0))
12455; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm0
12456; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[5,12,u,u,u,u,u]
12457; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,4,11],zero,zero,xmm1[0,7,14],zero,zero,xmm1[u,u,u,u,u]
12458; AVX512-FCP-NEXT:    vpor %xmm0, %xmm1, %xmm1
12459; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,8,15,22,29,20,27,18,25,128,128,128,128,128,128,128,128,128,128]
12460; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm12, %xmm5
12461; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm15[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
12462; AVX512-FCP-NEXT:    vpor %xmm5, %xmm8, %xmm5
12463; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
12464; AVX512-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
12465; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm5 ^ (ymm9 & (ymm5 ^ ymm1))
12466; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [1,2,4,6,0,0,0,0]
12467; AVX512-FCP-NEXT:    vmovdqa 224(%rdi), %ymm8
12468; AVX512-FCP-NEXT:    vpermd %ymm8, %ymm1, %ymm1
12469; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u]
12470; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm3, %zmm1
12471; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
12472; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm3 & (zmm1 ^ zmm10))
12473; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [1,3,4,6,0,0,0,0]
12474; AVX512-FCP-NEXT:    vpermd %ymm8, %ymm9, %ymm9
12475; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u]
12476; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm9, %zmm4, %zmm4
12477; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm3 & (zmm4 ^ zmm7))
12478; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [1,3,5,6,0,0,0,0]
12479; AVX512-FCP-NEXT:    vpermd %ymm8, %ymm7, %ymm7
12480; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u]
12481; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm7, %zmm5, %zmm5
12482; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm3 & (zmm5 ^ zmm6))
12483; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm29, %zmm0, %zmm3
12484; AVX512-FCP-NEXT:    movw $-512, %ax # imm = 0xFE00
12485; AVX512-FCP-NEXT:    kmovw %eax, %k1
12486; AVX512-FCP-NEXT:    vmovdqa32 %zmm3, %zmm1 {%k1}
12487; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm26, %zmm0, %zmm3
12488; AVX512-FCP-NEXT:    vmovdqa32 %zmm3, %zmm4 {%k1}
12489; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
12490; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15]
12491; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
12492; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm11[u,u,u,u,2,9],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[u,u,u]
12493; AVX512-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm3
12494; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u]
12495; AVX512-FCP-NEXT:    vpor %xmm2, %xmm3, %xmm2
12496; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = (ymm2 & mem) | ymm0
12497; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm3
12498; AVX512-FCP-NEXT:    vextracti32x4 $1, %ymm19, %xmm0
12499; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15]
12500; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero
12501; AVX512-FCP-NEXT:    vpor %xmm0, %xmm3, %xmm0
12502; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
12503; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
12504; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
12505; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
12506; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm5 {%k1}
12507; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
12508; AVX512-FCP-NEXT:    vmovaps %zmm0, (%rsi)
12509; AVX512-FCP-NEXT:    vmovdqa64 %zmm24, (%rdx)
12510; AVX512-FCP-NEXT:    vmovdqa64 %zmm21, (%rcx)
12511; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, (%r8)
12512; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, (%r9)
12513; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
12514; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, (%rax)
12515; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
12516; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, (%rax)
12517; AVX512-FCP-NEXT:    vzeroupper
12518; AVX512-FCP-NEXT:    retq
12519;
12520; AVX512DQ-LABEL: load_i8_stride7_vf64:
12521; AVX512DQ:       # %bb.0:
12522; AVX512DQ-NEXT:    subq $24, %rsp
12523; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm0 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
12524; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm12
12525; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm13
12526; AVX512DQ-NEXT:    vmovdqa64 64(%rdi), %ymm31
12527; AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm1
12528; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm23
12529; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm13 ^ (ymm1 & (ymm12 ^ ymm13))
12530; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
12531; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15,u,u,u,u,u,u]
12532; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u]
12533; AVX512DQ-NEXT:    vpor %xmm2, %xmm1, %xmm1
12534; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
12535; AVX512DQ-NEXT:    vmovdqa64 96(%rdi), %ymm28
12536; AVX512DQ-NEXT:    vmovdqa %ymm9, %ymm2
12537; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm31 ^ (ymm2 & (ymm28 ^ ymm31))
12538; AVX512DQ-NEXT:    vmovdqa 80(%rdi), %xmm11
12539; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm11[2],ymm2[3,4],ymm11[5],ymm2[6,7,8,9],ymm11[10],ymm2[11,12],ymm11[13],ymm2[14,15]
12540; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
12541; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm1 & mem)
12542; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm14 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
12543; AVX512DQ-NEXT:    vmovdqa64 128(%rdi), %ymm25
12544; AVX512DQ-NEXT:    vmovdqa 160(%rdi), %ymm4
12545; AVX512DQ-NEXT:    vmovdqa %ymm14, %ymm1
12546; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm25 ^ (ymm1 & (ymm4 ^ ymm25))
12547; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm3
12548; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u]
12549; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
12550; AVX512DQ-NEXT:    vpor %xmm3, %xmm1, %xmm1
12551; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
12552; AVX512DQ-NEXT:    vmovdqa 192(%rdi), %xmm0
12553; AVX512DQ-NEXT:    vpbroadcastd {{.*#+}} xmm5 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11]
12554; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm0, %xmm3
12555; AVX512DQ-NEXT:    vmovdqa64 %xmm5, %xmm30
12556; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm20
12557; AVX512DQ-NEXT:    vmovdqa 208(%rdi), %xmm10
12558; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm6 = xmm10[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
12559; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3]
12560; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
12561; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
12562; AVX512DQ-NEXT:    vmovdqa 240(%rdi), %xmm3
12563; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm3[5,12,u,u,u,u,u,u,u,u,u,u,u]
12564; AVX512DQ-NEXT:    vmovdqa 224(%rdi), %xmm5
12565; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u]
12566; AVX512DQ-NEXT:    vpor %xmm6, %xmm7, %xmm6
12567; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm6, %zmm1, %zmm22
12568; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm19 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
12569; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm19 & (zmm22 ^ zmm2))
12570; AVX512DQ-NEXT:    vmovdqa64 288(%rdi), %ymm18
12571; AVX512DQ-NEXT:    vmovdqa64 256(%rdi), %ymm17
12572; AVX512DQ-NEXT:    vmovdqa %ymm9, %ymm2
12573; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm18 ^ (ymm2 & (ymm17 ^ ymm18))
12574; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm6 = xmm2[u,u,u,u,u,3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u]
12575; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm2
12576; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u]
12577; AVX512DQ-NEXT:    vpor %xmm6, %xmm2, %xmm2
12578; AVX512DQ-NEXT:    vmovdqa64 352(%rdi), %ymm21
12579; AVX512DQ-NEXT:    vmovdqa64 320(%rdi), %ymm16
12580; AVX512DQ-NEXT:    vmovdqa %ymm14, %ymm7
12581; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm21 ^ (ymm7 & (ymm16 ^ ymm21))
12582; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1]
12583; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4,5],ymm8[6],ymm7[7,8,9],ymm8[10],ymm7[11,12,13],ymm8[14],ymm7[15]
12584; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
12585; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} ymm24 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535]
12586; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm2 & ymm24)
12587; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm7 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
12588; AVX512DQ-NEXT:    vmovdqa %ymm7, %ymm2
12589; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm13 ^ (ymm2 & (ymm12 ^ ymm13))
12590; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm15
12591; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u,u,u]
12592; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u,u,u]
12593; AVX512DQ-NEXT:    vpor %xmm2, %xmm15, %xmm2
12594; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm15
12595; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm29
12596; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm28 ^ (ymm15 & (ymm31 ^ ymm28))
12597; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm15 = ymm15[0,1],ymm11[2],ymm15[3,4,5],ymm11[6],ymm15[7,8,9],ymm11[10],ymm15[11,12,13],ymm11[14],ymm15[15]
12598; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
12599; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm2 & ~mem)
12600; AVX512DQ-NEXT:    vmovdqa %ymm9, %ymm2
12601; AVX512DQ-NEXT:    vmovdqa %ymm4, %ymm6
12602; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm4 ^ ymm25))
12603; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm2[u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u]
12604; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm2
12605; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u]
12606; AVX512DQ-NEXT:    vpor %xmm0, %xmm2, %xmm0
12607; AVX512DQ-NEXT:    vmovdqa %xmm3, %xmm4
12608; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm3[6,13,u,u,u,u,u,u,u,u,u,u,u]
12609; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm5[1,8,15],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u]
12610; AVX512DQ-NEXT:    vmovdqa64 %xmm5, %xmm26
12611; AVX512DQ-NEXT:    vpor %xmm2, %xmm3, %xmm2
12612; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm10[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
12613; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm1
12614; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
12615; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
12616; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
12617; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
12618; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
12619; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm2, %zmm0, %zmm23
12620; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm19 & (zmm23 ^ zmm15))
12621; AVX512DQ-NEXT:    vmovdqa %ymm14, %ymm0
12622; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm13 ^ (ymm0 & (ymm12 ^ ymm13))
12623; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u,u,u,u,u]
12624; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
12625; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[0,7,14],zero,zero,xmm0[3,10,u,u,u,u,u,u,u]
12626; AVX512DQ-NEXT:    vpor %xmm2, %xmm0, %xmm0
12627; AVX512DQ-NEXT:    vmovdqa %ymm7, %ymm2
12628; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm28 ^ (ymm2 & (ymm31 ^ ymm28))
12629; AVX512DQ-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12630; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm11[3],ymm2[4,5],ymm11[6],ymm2[7,8,9,10],ymm11[11],ymm2[12,13],ymm11[14],ymm2[15]
12631; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
12632; AVX512DQ-NEXT:    vpmovsxdq {{.*#+}} ymm27 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
12633; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm0 & ymm27)
12634; AVX512DQ-NEXT:    vmovdqa64 %ymm29, %ymm0
12635; AVX512DQ-NEXT:    vmovdqa64 %ymm25, %ymm15
12636; AVX512DQ-NEXT:    vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12637; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm6 ^ (ymm0 & (ymm25 ^ ymm6))
12638; AVX512DQ-NEXT:    vmovdqa64 %ymm6, %ymm19
12639; AVX512DQ-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12640; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm3
12641; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u,u,u]
12642; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u]
12643; AVX512DQ-NEXT:    vpor %xmm3, %xmm0, %xmm0
12644; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
12645; AVX512DQ-NEXT:    vmovdqa64 %xmm30, %xmm3
12646; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm10, %xmm3
12647; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
12648; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm6
12649; AVX512DQ-NEXT:    vmovdqa64 %xmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12650; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
12651; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
12652; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
12653; AVX512DQ-NEXT:    vmovdqa64 %xmm26, %xmm1
12654; AVX512DQ-NEXT:    vmovdqa64 %xmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12655; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
12656; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
12657; AVX512DQ-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12658; AVX512DQ-NEXT:    vpor %xmm3, %xmm5, %xmm3
12659; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm3, %zmm0, %zmm25
12660; AVX512DQ-NEXT:    vpmovsxdq {{.*#+}} zmm20 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
12661; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm25 = zmm2 ^ (zmm20 & (zmm25 ^ zmm2))
12662; AVX512DQ-NEXT:    vmovdqa %ymm9, %ymm0
12663; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm13 ^ (ymm0 & (ymm12 ^ ymm13))
12664; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u]
12665; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
12666; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u,u,u]
12667; AVX512DQ-NEXT:    vpor %xmm2, %xmm0, %xmm0
12668; AVX512DQ-NEXT:    vmovdqa %ymm14, %ymm2
12669; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm28 ^ (ymm2 & (ymm31 ^ ymm28))
12670; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm11[0],ymm2[1,2],ymm11[3],ymm2[4,5,6],ymm11[7,8],ymm2[9,10],ymm11[11],ymm2[12,13,14],ymm11[15]
12671; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
12672; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm0 & ymm27)
12673; AVX512DQ-NEXT:    vmovdqa64 %ymm27, %ymm11
12674; AVX512DQ-NEXT:    vmovdqa %ymm7, %ymm0
12675; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm19 ^ (ymm0 & (ymm15 ^ ymm19))
12676; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm2
12677; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[2,9,u,u,u,u,u]
12678; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u]
12679; AVX512DQ-NEXT:    vpor %xmm2, %xmm0, %xmm0
12680; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
12681; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
12682; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm10, %xmm5
12683; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm15 = xmm6[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
12684; AVX512DQ-NEXT:    vpor %xmm5, %xmm15, %xmm5
12685; AVX512DQ-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
12686; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm0))
12687; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
12688; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,xmm4[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
12689; AVX512DQ-NEXT:    vpor %xmm0, %xmm15, %xmm0
12690; AVX512DQ-NEXT:    vmovdqa64 416(%rdi), %ymm26
12691; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm5, %zmm30
12692; AVX512DQ-NEXT:    vmovdqa64 384(%rdi), %ymm27
12693; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm30 = zmm3 ^ (zmm20 & (zmm30 ^ zmm3))
12694; AVX512DQ-NEXT:    vmovdqa %ymm7, %ymm0
12695; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm26 ^ (ymm0 & (ymm27 ^ ymm26))
12696; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm3
12697; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9]
12698; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero
12699; AVX512DQ-NEXT:    vpor %xmm3, %xmm0, %xmm0
12700; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
12701; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} ymm29 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
12702; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm29 & (ymm0 ^ ymm8))
12703; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm20
12704; AVX512DQ-NEXT:    vpmovsxwd {{.*#+}} zmm8 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0,0,0,0,0]
12705; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm8 & (zmm20 ^ zmm22))
12706; AVX512DQ-NEXT:    vmovdqa %ymm7, %ymm0
12707; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm17 ^ (ymm0 & (ymm18 ^ ymm17))
12708; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm3
12709; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u]
12710; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u]
12711; AVX512DQ-NEXT:    vpor %xmm3, %xmm0, %xmm0
12712; AVX512DQ-NEXT:    vmovdqa %ymm9, %ymm3
12713; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm21 ^ (ymm3 & (ymm16 ^ ymm21))
12714; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm5 = ymm3[2,3,0,1]
12715; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7,8,9,10],ymm5[11],ymm3[12,13],ymm5[14],ymm3[15]
12716; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
12717; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm0 & ymm24)
12718; AVX512DQ-NEXT:    vmovdqa %ymm14, %ymm0
12719; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm26 ^ (ymm0 & (ymm27 ^ ymm26))
12720; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm0[5,12],zero,zero
12721; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
12722; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero,xmm0[3,10]
12723; AVX512DQ-NEXT:    vpor %xmm5, %xmm0, %xmm0
12724; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
12725; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm29 & (ymm0 ^ ymm3))
12726; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm22
12727; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm8 & (zmm22 ^ zmm23))
12728; AVX512DQ-NEXT:    vmovdqa %ymm14, %ymm0
12729; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm17 ^ (ymm0 & (ymm18 ^ ymm17))
12730; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm3
12731; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u]
12732; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u]
12733; AVX512DQ-NEXT:    vpor %xmm3, %xmm0, %xmm0
12734; AVX512DQ-NEXT:    vmovdqa %ymm7, %ymm3
12735; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm16 ^ (ymm3 & (ymm21 ^ ymm16))
12736; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm5 = ymm3[2,3,0,1]
12737; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1,2],ymm5[3],ymm3[4,5,6],ymm5[7,8],ymm3[9,10],ymm5[11],ymm3[12,13,14],ymm5[15]
12738; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
12739; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm0 & ymm24)
12740; AVX512DQ-NEXT:    vmovdqa %ymm9, %ymm0
12741; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm26 ^ (ymm0 & (ymm27 ^ ymm26))
12742; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero
12743; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
12744; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11]
12745; AVX512DQ-NEXT:    vpor %xmm5, %xmm0, %xmm0
12746; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
12747; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm29 & (ymm0 ^ ymm3))
12748; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm23
12749; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm8 & (zmm23 ^ zmm25))
12750; AVX512DQ-NEXT:    vmovdqa %ymm14, %ymm0
12751; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm16 ^ (ymm0 & (ymm21 ^ ymm16))
12752; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
12753; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6],ymm3[7,8],ymm0[9,10,11],ymm3[12],ymm0[13,14],ymm3[15]
12754; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
12755; AVX512DQ-NEXT:    vmovdqa %ymm9, %ymm2
12756; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm17 ^ (ymm2 & (ymm18 ^ ymm17))
12757; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u]
12758; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm2
12759; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u]
12760; AVX512DQ-NEXT:    vpor %xmm3, %xmm2, %xmm2
12761; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm24) | ymm0
12762; AVX512DQ-NEXT:    vmovdqa %ymm7, %ymm0
12763; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm27 ^ (ymm0 & (ymm26 ^ ymm27))
12764; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm3
12765; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12]
12766; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,4,11],zero,zero,xmm0[0,7,14],zero,zero
12767; AVX512DQ-NEXT:    vpor %xmm3, %xmm0, %xmm0
12768; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
12769; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm29 & (ymm0 ^ ymm2))
12770; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm25
12771; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm8 & (zmm25 ^ zmm30))
12772; AVX512DQ-NEXT:    vmovdqa %ymm9, %ymm0
12773; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm16 ^ (ymm0 & (ymm21 ^ ymm16))
12774; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
12775; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14,15]
12776; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} ymm19 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
12777; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm2
12778; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm18 ^ (ymm2 & (ymm17 ^ ymm18))
12779; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
12780; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u]
12781; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u]
12782; AVX512DQ-NEXT:    vpor %xmm3, %xmm2, %xmm2
12783; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
12784; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm24) | ymm0
12785; AVX512DQ-NEXT:    vmovdqa %ymm14, %ymm0
12786; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm27 ^ (ymm0 & (ymm26 ^ ymm27))
12787; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm3
12788; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13]
12789; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero
12790; AVX512DQ-NEXT:    vpor %xmm3, %xmm0, %xmm0
12791; AVX512DQ-NEXT:    vinserti32x4 $1, %xmm0, %ymm0, %ymm24
12792; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm24 = ymm24 ^ (ymm29 & (ymm24 ^ ymm2))
12793; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm0
12794; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm16 ^ ymm21))
12795; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
12796; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15]
12797; AVX512DQ-NEXT:    vmovdqa %ymm7, %ymm2
12798; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm18 ^ (ymm2 & (ymm17 ^ ymm18))
12799; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
12800; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u]
12801; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u]
12802; AVX512DQ-NEXT:    vpor %xmm3, %xmm2, %xmm2
12803; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm8 = [u,u,u,u,128,128,128,128,128,128,128,128,128,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
12804; AVX512DQ-NEXT:    vpshufb %ymm8, %ymm0, %ymm0
12805; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = (ymm2 & ~mem) | ymm0
12806; AVX512DQ-NEXT:    vmovdqa %ymm9, %ymm0
12807; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm27 ^ (ymm0 & (ymm26 ^ ymm27))
12808; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero
12809; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
12810; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14]
12811; AVX512DQ-NEXT:    vpor %xmm3, %xmm0, %xmm0
12812; AVX512DQ-NEXT:    vinserti32x4 $1, %xmm0, %ymm0, %ymm30
12813; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm30 = ymm30 ^ (ymm29 & (ymm30 ^ ymm2))
12814; AVX512DQ-NEXT:    vmovdqa %ymm7, %ymm0
12815; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm13 ^ ymm12))
12816; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm2
12817; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u]
12818; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u]
12819; AVX512DQ-NEXT:    vporq %xmm2, %xmm0, %xmm29
12820; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm19 = ymm26 ^ (ymm19 & (ymm27 ^ ymm26))
12821; AVX512DQ-NEXT:    vmovdqa %ymm14, %ymm0
12822; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm13 ^ ymm12))
12823; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm2
12824; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u,u,u,u]
12825; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u]
12826; AVX512DQ-NEXT:    vpor %xmm2, %xmm0, %xmm0
12827; AVX512DQ-NEXT:    vmovdqa %ymm9, %ymm2
12828; AVX512DQ-NEXT:    vmovdqa %ymm9, %ymm15
12829; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm12 ^ (ymm9 & (ymm13 ^ ymm12))
12830; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm28 ^ (ymm2 & (ymm31 ^ ymm28))
12831; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm9[6,13],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u]
12832; AVX512DQ-NEXT:    vextracti128 $1, %ymm9, %xmm5
12833; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[4,11],zero,zero,xmm5[0,7,14,u,u,u,u,u,u,u]
12834; AVX512DQ-NEXT:    vpor %xmm3, %xmm5, %xmm3
12835; AVX512DQ-NEXT:    vmovdqa %ymm14, %ymm5
12836; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm17 = ymm18 ^ (ymm14 & (ymm17 ^ ymm18))
12837; AVX512DQ-NEXT:    vmovdqa %ymm7, %ymm9
12838; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm31 ^ (ymm9 & (ymm28 ^ ymm31))
12839; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm14 = ymm31 ^ (ymm14 & (ymm28 ^ ymm31))
12840; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12841; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2,3],ymm4[4],ymm2[5,6],ymm4[7,8],ymm2[9,10,11],ymm4[12],ymm2[13,14],ymm4[15]
12842; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
12843; AVX512DQ-NEXT:    vmovdqa %ymm11, %ymm1
12844; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm12 | (ymm29 & ymm11)
12845; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm2 = ymm9[0],ymm4[1],ymm9[2,3],ymm4[4],ymm9[5,6,7,8],ymm4[9],ymm9[10,11],ymm4[12],ymm9[13,14,15]
12846; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
12847; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm0 & ymm1)
12848; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm14[0],ymm4[1],ymm14[2,3,4],ymm4[5],ymm14[6,7,8],ymm4[9],ymm14[10,11,12],ymm4[13],ymm14[14,15]
12849; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
12850; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm3 & ymm1)
12851; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
12852; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
12853; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm6 ^ (ymm5 & (ymm13 ^ ymm6))
12854; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm5[u,u,2,9],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[u,u,u,u,u]
12855; AVX512DQ-NEXT:    vextracti128 $1, %ymm5, %xmm2
12856; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u]
12857; AVX512DQ-NEXT:    vpor %xmm0, %xmm2, %xmm0
12858; AVX512DQ-NEXT:    vmovdqa %xmm10, %xmm1
12859; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
12860; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm10, %xmm2
12861; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
12862; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
12863; AVX512DQ-NEXT:    vpor %xmm2, %xmm3, %xmm2
12864; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
12865; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
12866; AVX512DQ-NEXT:    vpmovsxdq {{.*#+}} ymm18 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
12867; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm18 & (ymm2 ^ ymm0))
12868; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm16 = ymm21 ^ (ymm7 & (ymm16 ^ ymm21))
12869; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm10 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
12870; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm6 ^ (ymm15 & (ymm13 ^ ymm6))
12871; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm13 ^ (ymm7 & (ymm6 ^ ymm13))
12872; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
12873; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm14, %xmm0
12874; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
12875; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm13[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
12876; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
12877; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm2
12878; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
12879; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm0 & (zmm2 ^ zmm12))
12880; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm15[u,u,3,10],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[u,u,u,u,u]
12881; AVX512DQ-NEXT:    vextracti128 $1, %ymm15, %xmm5
12882; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[1,8,15],zero,zero,xmm5[4,11,u,u,u,u,u]
12883; AVX512DQ-NEXT:    vpor %xmm3, %xmm5, %xmm3
12884; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm1, %xmm5
12885; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
12886; AVX512DQ-NEXT:    vpor %xmm5, %xmm8, %xmm5
12887; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
12888; AVX512DQ-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
12889; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm5 ^ (ymm18 & (ymm5 ^ ymm3))
12890; AVX512DQ-NEXT:    vextracti128 $1, %ymm7, %xmm3
12891; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u]
12892; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,4,11],zero,zero,xmm7[0,7,14],zero,zero,xmm7[u,u,u,u,u]
12893; AVX512DQ-NEXT:    vpor %xmm3, %xmm7, %xmm3
12894; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,8,15,22,29,20,27,18,25,128,128,128,128,128,128,128,128,128,128]
12895; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm1, %xmm8
12896; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
12897; AVX512DQ-NEXT:    vpor %xmm12, %xmm8, %xmm8
12898; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
12899; AVX512DQ-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
12900; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm8 ^ (ymm18 & (ymm8 ^ ymm3))
12901; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm13[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
12902; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm12 = xmm14[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
12903; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3]
12904; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm5, %zmm3
12905; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm0 & (zmm3 ^ zmm11))
12906; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm13, %xmm5
12907; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm10 = xmm14[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
12908; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
12909; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm5, %zmm8, %zmm5
12910; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm0 & (zmm5 ^ zmm9))
12911; AVX512DQ-NEXT:    movw $-512, %ax # imm = 0xFE00
12912; AVX512DQ-NEXT:    kmovw %eax, %k1
12913; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm24, %zmm0, %zmm2 {%k1}
12914; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm30, %zmm0, %zmm3 {%k1}
12915; AVX512DQ-NEXT:    vmovdqa64 %ymm16, %ymm1
12916; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm16[2,3,0,1]
12917; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
12918; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm0, %ymm0
12919; AVX512DQ-NEXT:    vmovdqa64 %ymm17, %ymm1
12920; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u]
12921; AVX512DQ-NEXT:    vextracti32x4 $1, %ymm17, %xmm1
12922; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u]
12923; AVX512DQ-NEXT:    vpor %xmm6, %xmm1, %xmm1
12924; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = (ymm1 & mem) | ymm0
12925; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm4
12926; AVX512DQ-NEXT:    vextracti32x4 $1, %ymm19, %xmm0
12927; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15]
12928; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,0,7,14],zero,zero,xmm4[3,10],zero,zero,zero
12929; AVX512DQ-NEXT:    vpor %xmm0, %xmm4, %xmm0
12930; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
12931; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
12932; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12933; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm5 {%k1}
12934; AVX512DQ-NEXT:    vmovdqa64 %zmm20, (%rsi)
12935; AVX512DQ-NEXT:    vmovdqa64 %zmm22, (%rdx)
12936; AVX512DQ-NEXT:    vmovdqa64 %zmm23, (%rcx)
12937; AVX512DQ-NEXT:    vmovdqa64 %zmm25, (%r8)
12938; AVX512DQ-NEXT:    vmovdqa64 %zmm2, (%r9)
12939; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
12940; AVX512DQ-NEXT:    vmovdqa64 %zmm3, (%rax)
12941; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
12942; AVX512DQ-NEXT:    vmovdqa64 %zmm5, (%rax)
12943; AVX512DQ-NEXT:    addq $24, %rsp
12944; AVX512DQ-NEXT:    vzeroupper
12945; AVX512DQ-NEXT:    retq
12946;
12947; AVX512DQ-FCP-LABEL: load_i8_stride7_vf64:
12948; AVX512DQ-FCP:       # %bb.0:
12949; AVX512DQ-FCP-NEXT:    pushq %rax
12950; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm19 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
12951; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm11
12952; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm12
12953; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %ymm27
12954; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm0
12955; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm11 ^ ymm12))
12956; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
12957; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u]
12958; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u]
12959; AVX512DQ-FCP-NEXT:    vpor %xmm1, %xmm0, %xmm1
12960; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
12961; AVX512DQ-FCP-NEXT:    vmovdqa64 96(%rdi), %ymm31
12962; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, %ymm2
12963; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm27 ^ (ymm2 & (ymm31 ^ ymm27))
12964; AVX512DQ-FCP-NEXT:    vmovdqa 80(%rdi), %xmm3
12965; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15]
12966; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm20
12967; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
12968; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm1 & mem)
12969; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
12970; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %ymm28
12971; AVX512DQ-FCP-NEXT:    vmovdqa64 160(%rdi), %ymm30
12972; AVX512DQ-FCP-NEXT:    vmovdqa %ymm9, %ymm1
12973; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm28 ^ (ymm1 & (ymm30 ^ ymm28))
12974; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm3
12975; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u]
12976; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
12977; AVX512DQ-FCP-NEXT:    vpor %xmm3, %xmm1, %xmm1
12978; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
12979; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,2,4,6]
12980; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm22
12981; AVX512DQ-FCP-NEXT:    vpermd %ymm22, %ymm4, %ymm4
12982; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
12983; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7]
12984; AVX512DQ-FCP-NEXT:    vmovdqa 240(%rdi), %xmm4
12985; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm4[5,12,u,u,u,u,u,u,u,u,u,u,u]
12986; AVX512DQ-FCP-NEXT:    vmovdqa 224(%rdi), %xmm5
12987; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u]
12988; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm7
12989; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm7, %zmm1, %zmm7
12990; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
12991; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm0 & (zmm7 ^ zmm2))
12992; AVX512DQ-FCP-NEXT:    vmovdqa64 288(%rdi), %ymm16
12993; AVX512DQ-FCP-NEXT:    vmovdqa 256(%rdi), %ymm1
12994; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, %ymm2
12995; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm16 ^ (ymm2 & (ymm1 ^ ymm16))
12996; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u,3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u]
12997; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm2
12998; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u]
12999; AVX512DQ-FCP-NEXT:    vpor %xmm2, %xmm10, %xmm10
13000; AVX512DQ-FCP-NEXT:    vmovdqa 352(%rdi), %ymm14
13001; AVX512DQ-FCP-NEXT:    vmovdqa 320(%rdi), %ymm2
13002; AVX512DQ-FCP-NEXT:    vmovdqa %ymm9, %ymm13
13003; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm14 ^ (ymm13 & (ymm2 ^ ymm14))
13004; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm13[2,3,0,1]
13005; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm13[0,1],ymm8[2],ymm13[3,4,5],ymm8[6],ymm13[7,8,9],ymm8[10],ymm13[11,12,13],ymm8[14],ymm13[15]
13006; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
13007; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm26 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535]
13008; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm10 & ymm26)
13009; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
13010; AVX512DQ-FCP-NEXT:    vmovdqa64 416(%rdi), %ymm17
13011; AVX512DQ-FCP-NEXT:    vmovdqa64 384(%rdi), %ymm18
13012; AVX512DQ-FCP-NEXT:    vmovdqa %ymm10, %ymm13
13013; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm17 ^ (ymm13 & (ymm18 ^ ymm17))
13014; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm15
13015; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9]
13016; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero
13017; AVX512DQ-FCP-NEXT:    vpor %xmm15, %xmm13, %xmm13
13018; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
13019; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm21 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
13020; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm21 & (ymm13 ^ ymm8))
13021; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm0, %zmm3
13022; AVX512DQ-FCP-NEXT:    vpmovsxwd {{.*#+}} zmm29 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0,0,0,0,0]
13023; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm29 & (zmm3 ^ zmm7))
13024; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13025; AVX512DQ-FCP-NEXT:    vmovdqa %ymm10, %ymm7
13026; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm12 ^ (ymm7 & (ymm11 ^ ymm12))
13027; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm8
13028; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[6,13],zero,zero,xmm8[2,9,u,u,u,u,u,u,u]
13029; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[1,8,15],zero,zero,xmm7[4,11],zero,zero,xmm7[u,u,u,u,u,u,u]
13030; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm7
13031; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm8
13032; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm31 ^ (ymm8 & (ymm27 ^ ymm31))
13033; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, %ymm3
13034; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm3[2],ymm8[3,4,5],ymm3[6],ymm8[7,8,9],ymm3[10],ymm8[11,12,13],ymm3[14],ymm8[15]
13035; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
13036; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm7 & ~mem)
13037; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, %ymm7
13038; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm28 ^ (ymm7 & (ymm30 ^ ymm28))
13039; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm7[u,u,u,6,13],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[u,u,u,u]
13040; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm7
13041; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u],zero,zero,xmm7[4,11],zero,zero,xmm7[0,7,14,u,u,u,u]
13042; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm13, %xmm7
13043; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
13044; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [0,0,0,0,1,3,4,6]
13045; AVX512DQ-FCP-NEXT:    vpermd %ymm22, %ymm13, %ymm13
13046; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
13047; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm13[7]
13048; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm4[6,13,u,u,u,u,u,u,u,u,u,u,u]
13049; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm5[1,8,15],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u]
13050; AVX512DQ-FCP-NEXT:    vpor %xmm13, %xmm15, %xmm13
13051; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm13, %zmm7, %zmm7
13052; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm0 & (zmm7 ^ zmm8))
13053; AVX512DQ-FCP-NEXT:    vmovdqa %ymm10, %ymm8
13054; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm1 ^ (ymm8 & (ymm16 ^ ymm1))
13055; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm13
13056; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[5,12,u,u]
13057; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u]
13058; AVX512DQ-FCP-NEXT:    vpor %xmm13, %xmm8, %xmm8
13059; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, %ymm13
13060; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm14 ^ (ymm13 & (ymm2 ^ ymm14))
13061; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1]
13062; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7,8,9,10],ymm15[11],ymm13[12,13],ymm15[14],ymm13[15]
13063; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
13064; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm8 & ymm26)
13065; AVX512DQ-FCP-NEXT:    vmovdqa %ymm9, %ymm8
13066; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm17 ^ (ymm8 & (ymm18 ^ ymm17))
13067; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm8[5,12],zero,zero
13068; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm8
13069; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10]
13070; AVX512DQ-FCP-NEXT:    vpor %xmm15, %xmm8, %xmm8
13071; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
13072; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm8 ^ (ymm21 & (ymm8 ^ ymm13))
13073; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm24
13074; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm24 = zmm24 ^ (zmm29 & (zmm24 ^ zmm7))
13075; AVX512DQ-FCP-NEXT:    vmovdqa %ymm9, %ymm7
13076; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm12 ^ (ymm7 & (ymm11 ^ ymm12))
13077; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm7[2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u,u,u]
13078; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm7
13079; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u,u,u]
13080; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm7
13081; AVX512DQ-FCP-NEXT:    vmovdqa %ymm10, %ymm8
13082; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm31 ^ (ymm8 & (ymm27 ^ ymm31))
13083; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm3[3],ymm8[4,5],ymm3[6],ymm8[7,8,9,10],ymm3[11],ymm8[12,13],ymm3[14],ymm8[15]
13084; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, %ymm0
13085; AVX512DQ-FCP-NEXT:    vmovdqu64 %ymm20, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13086; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
13087; AVX512DQ-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm23 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
13088; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm7 & ymm23)
13089; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm7
13090; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm30 ^ (ymm7 & (ymm28 ^ ymm30))
13091; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm8
13092; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[1,8,15,u,u,u,u]
13093; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,0,7,14],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[u,u,u,u]
13094; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm7
13095; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
13096; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,1,3,5,6]
13097; AVX512DQ-FCP-NEXT:    vpermd %ymm22, %ymm8, %ymm3
13098; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
13099; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5,6],ymm3[7]
13100; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm5[2,9],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u]
13101; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
13102; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm7
13103; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm7, %zmm3, %zmm3
13104; AVX512DQ-FCP-NEXT:    vpmovsxdq {{.*#+}} zmm22 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
13105; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm13 ^ (zmm22 & (zmm3 ^ zmm13))
13106; AVX512DQ-FCP-NEXT:    vmovdqa %ymm9, %ymm7
13107; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm1 ^ (ymm7 & (ymm16 ^ ymm1))
13108; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm13
13109; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u],zero,zero,xmm13[3,10],zero,zero,zero,xmm13[6,13,u,u]
13110; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,5,12],zero,zero,xmm7[1,8,15],zero,zero,xmm7[u,u]
13111; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm13, %xmm7
13112; AVX512DQ-FCP-NEXT:    vmovdqa %ymm10, %ymm13
13113; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm2 ^ (ymm13 & (ymm14 ^ ymm2))
13114; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1]
13115; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm13 = ymm15[0],ymm13[1,2],ymm15[3],ymm13[4,5,6],ymm15[7,8],ymm13[9,10],ymm15[11],ymm13[12,13,14],ymm15[15]
13116; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
13117; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm7 & ymm26)
13118; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, %ymm7
13119; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm17 ^ (ymm7 & (ymm18 ^ ymm17))
13120; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm7[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero
13121; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm7
13122; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11]
13123; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm15, %xmm7
13124; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
13125; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm21 & (ymm7 ^ ymm13))
13126; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm0, %zmm20
13127; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm29 & (zmm20 ^ zmm3))
13128; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, %ymm3
13129; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm12 ^ (ymm3 & (ymm11 ^ ymm12))
13130; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm3[3,10],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[u,u,u,u,u,u,u]
13131; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm3
13132; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11,u,u,u,u,u,u,u]
13133; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm3, %xmm3
13134; AVX512DQ-FCP-NEXT:    vmovdqa %ymm9, %ymm7
13135; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm31 ^ (ymm7 & (ymm27 ^ ymm31))
13136; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm0[0],ymm7[1,2],ymm0[3],ymm7[4,5,6],ymm0[7,8],ymm7[9,10],ymm0[11],ymm7[12,13,14],ymm0[15]
13137; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
13138; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm3 & ymm23)
13139; AVX512DQ-FCP-NEXT:    vmovdqa %ymm10, %ymm3
13140; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm30 ^ (ymm3 & (ymm28 ^ ymm30))
13141; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm13
13142; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[2,9,u,u,u,u,u]
13143; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u,u,u]
13144; AVX512DQ-FCP-NEXT:    vpor %xmm3, %xmm13, %xmm13
13145; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
13146; AVX512DQ-FCP-NEXT:    vmovdqa 208(%rdi), %xmm0
13147; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13148; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm0, %xmm8
13149; AVX512DQ-FCP-NEXT:    vmovdqa 192(%rdi), %xmm0
13150; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13151; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
13152; AVX512DQ-FCP-NEXT:    vpor %xmm3, %xmm8, %xmm3
13153; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm8
13154; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
13155; AVX512DQ-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm25 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
13156; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm25 & (ymm3 ^ ymm8))
13157; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u]
13158; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
13159; AVX512DQ-FCP-NEXT:    vpor %xmm5, %xmm4, %xmm4
13160; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm4, %zmm3, %zmm3
13161; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm7 ^ (zmm22 & (zmm3 ^ zmm7))
13162; AVX512DQ-FCP-NEXT:    vmovdqa %ymm9, %ymm4
13163; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm2 ^ (ymm4 & (ymm14 ^ ymm2))
13164; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
13165; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3],ymm5[4],ymm4[5,6],ymm5[7,8],ymm4[9,10,11],ymm5[12],ymm4[13,14],ymm5[15]
13166; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm4, %ymm4
13167; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, %ymm5
13168; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm1 ^ (ymm5 & (ymm16 ^ ymm1))
13169; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm5[u,u,u,u,u,6,13],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[u,u]
13170; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm5
13171; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u],zero,zero,xmm5[4,11],zero,zero,xmm5[0,7,14,u,u]
13172; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm5, %xmm5
13173; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = (ymm5 & ymm26) | ymm4
13174; AVX512DQ-FCP-NEXT:    vmovdqa %ymm10, %ymm4
13175; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm18 ^ (ymm4 & (ymm17 ^ ymm18))
13176; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm7
13177; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12]
13178; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,4,11],zero,zero,xmm4[0,7,14],zero,zero
13179; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm4, %xmm4
13180; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
13181; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm21 & (ymm4 ^ ymm5))
13182; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm22
13183; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm29 & (zmm22 ^ zmm3))
13184; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, %ymm3
13185; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm2 ^ (ymm3 & (ymm14 ^ ymm2))
13186; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
13187; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6,7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14,15]
13188; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
13189; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm4
13190; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm16 ^ (ymm4 & (ymm1 ^ ymm16))
13191; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm5
13192; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u]
13193; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,0,7,14],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[u,u]
13194; AVX512DQ-FCP-NEXT:    vpor %xmm5, %xmm4, %xmm4
13195; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = (ymm4 & ymm26) | ymm3
13196; AVX512DQ-FCP-NEXT:    vmovdqa %ymm9, %ymm3
13197; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm18 ^ (ymm3 & (ymm17 ^ ymm18))
13198; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm5
13199; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13]
13200; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero
13201; AVX512DQ-FCP-NEXT:    vpor %xmm5, %xmm3, %xmm3
13202; AVX512DQ-FCP-NEXT:    vinserti32x4 $1, %xmm3, %ymm0, %ymm26
13203; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm26 = ymm26 ^ (ymm21 & (ymm26 ^ ymm4))
13204; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm3
13205; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm14 ^ (ymm3 & (ymm2 ^ ymm14))
13206; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
13207; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15]
13208; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [u,u,u,u,128,128,128,128,128,128,128,128,128,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
13209; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm3, %ymm3
13210; AVX512DQ-FCP-NEXT:    vmovdqa %ymm10, %ymm4
13211; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm16 ^ (ymm4 & (ymm1 ^ ymm16))
13212; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm7
13213; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u]
13214; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,1,8,15],zero,zero,xmm4[4,11],zero,zero,xmm4[u,u,u]
13215; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm4, %xmm7
13216; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = (ymm7 & ~mem) | ymm3
13217; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, %ymm3
13218; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm18 ^ (ymm3 & (ymm17 ^ ymm18))
13219; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero
13220; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm3
13221; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14]
13222; AVX512DQ-FCP-NEXT:    vpor %xmm4, %xmm3, %xmm3
13223; AVX512DQ-FCP-NEXT:    vinserti32x4 $1, %xmm3, %ymm0, %ymm29
13224; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm29 = ymm29 ^ (ymm21 & (ymm29 ^ ymm7))
13225; AVX512DQ-FCP-NEXT:    vmovdqa %ymm10, %ymm13
13226; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm11 ^ (ymm13 & (ymm12 ^ ymm11))
13227; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, %ymm8
13228; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm19 = ymm17 ^ (ymm19 & (ymm18 ^ ymm17))
13229; AVX512DQ-FCP-NEXT:    vmovdqa %ymm9, %ymm3
13230; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm11 ^ (ymm3 & (ymm12 ^ ymm11))
13231; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, %ymm7
13232; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm11 ^ (ymm6 & (ymm12 ^ ymm11))
13233; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm11
13234; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u]
13235; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm13[4,11],zero,zero,xmm13[0,7,14],zero,zero,xmm13[u,u,u,u,u,u,u]
13236; AVX512DQ-FCP-NEXT:    vpor %xmm11, %xmm12, %xmm13
13237; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm31 ^ (ymm8 & (ymm27 ^ ymm31))
13238; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm11
13239; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u]
13240; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u,u,u,u,u,u]
13241; AVX512DQ-FCP-NEXT:    vpor %xmm3, %xmm11, %xmm3
13242; AVX512DQ-FCP-NEXT:    vmovdqa %ymm9, %ymm12
13243; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm16 ^ (ymm9 & (ymm1 ^ ymm16))
13244; AVX512DQ-FCP-NEXT:    vmovdqa %ymm10, %ymm15
13245; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm15 = ymm27 ^ (ymm15 & (ymm31 ^ ymm27))
13246; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm27 ^ (ymm9 & (ymm31 ^ ymm27))
13247; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm6[6,13],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[u,u,u,u,u,u,u]
13248; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm0
13249; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u]
13250; AVX512DQ-FCP-NEXT:    vporq %xmm11, %xmm0, %xmm16
13251; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
13252; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm4[0],ymm8[1,2,3],ymm4[4],ymm8[5,6],ymm4[7,8],ymm8[9,10,11],ymm4[12],ymm8[13,14],ymm4[15]
13253; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
13254; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm13 & ymm23)
13255; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm15[0],ymm4[1],ymm15[2,3],ymm4[4],ymm15[5,6,7,8],ymm4[9],ymm15[10,11],ymm4[12],ymm15[13,14,15]
13256; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
13257; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm3 & ymm23)
13258; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm9[0],ymm4[1],ymm9[2,3,4],ymm4[5],ymm9[6,7,8],ymm4[9],ymm9[10,11,12],ymm4[13],ymm9[14,15]
13259; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
13260; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm16 & ymm23)
13261; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm14 ^ (ymm10 & (ymm2 ^ ymm14))
13262; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm30 ^ (ymm12 & (ymm28 ^ ymm30))
13263; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm30 ^ (ymm7 & (ymm28 ^ ymm30))
13264; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm28 ^ (ymm10 & (ymm30 ^ ymm28))
13265; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm12[u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u]
13266; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm9
13267; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u],zero,zero,xmm9[0,7,14],zero,zero,xmm9[3,10,u,u,u,u,u]
13268; AVX512DQ-FCP-NEXT:    vpor %xmm3, %xmm9, %xmm3
13269; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
13270; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
13271; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm4, %xmm9
13272; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
13273; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm6[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
13274; AVX512DQ-FCP-NEXT:    vpor %xmm9, %xmm12, %xmm9
13275; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
13276; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
13277; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm9 ^ (ymm25 & (ymm9 ^ ymm3))
13278; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm7[u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[u,u,u,u,u]
13279; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm7
13280; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11,u,u,u,u,u]
13281; AVX512DQ-FCP-NEXT:    vpor %xmm3, %xmm7, %xmm3
13282; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm4, %xmm5
13283; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm6[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
13284; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm5, %xmm5
13285; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
13286; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm7
13287; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm25 & (ymm7 ^ ymm3))
13288; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm3
13289; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u]
13290; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm10[u,u,4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u]
13291; AVX512DQ-FCP-NEXT:    vpor %xmm3, %xmm5, %xmm3
13292; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,8,15,22,29,20,27,18,25,128,128,128,128,128,128,128,128,128,128]
13293; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm4, %xmm10
13294; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm6[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
13295; AVX512DQ-FCP-NEXT:    vpor %xmm12, %xmm10, %xmm10
13296; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
13297; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
13298; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm10 ^ (ymm25 & (ymm10 ^ ymm3))
13299; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,2,4,6,0,0,0,0]
13300; AVX512DQ-FCP-NEXT:    vmovdqa 224(%rdi), %ymm12
13301; AVX512DQ-FCP-NEXT:    vpermd %ymm12, %ymm3, %ymm3
13302; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u]
13303; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm3, %zmm9, %zmm3
13304; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
13305; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm9 & (zmm3 ^ zmm11))
13306; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [1,3,4,6,0,0,0,0]
13307; AVX512DQ-FCP-NEXT:    vpermd %ymm12, %ymm11, %ymm11
13308; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u]
13309; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm11, %zmm7, %zmm7
13310; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm9 & (zmm7 ^ zmm8))
13311; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [1,3,5,6,0,0,0,0]
13312; AVX512DQ-FCP-NEXT:    vpermd %ymm12, %ymm8, %ymm8
13313; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u]
13314; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm8, %zmm10, %zmm8
13315; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm9 & (zmm8 ^ zmm0))
13316; AVX512DQ-FCP-NEXT:    movw $-512, %ax # imm = 0xFE00
13317; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
13318; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm26, %zmm0, %zmm3 {%k1}
13319; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm29, %zmm0, %zmm7 {%k1}
13320; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm2[2,3,0,1]
13321; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15]
13322; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm0, %ymm0
13323; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u]
13324; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm1
13325; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u]
13326; AVX512DQ-FCP-NEXT:    vpor %xmm2, %xmm1, %xmm1
13327; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = (ymm1 & mem) | ymm0
13328; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm2
13329; AVX512DQ-FCP-NEXT:    vextracti32x4 $1, %ymm19, %xmm0
13330; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15]
13331; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero
13332; AVX512DQ-FCP-NEXT:    vpor %xmm0, %xmm2, %xmm0
13333; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
13334; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
13335; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
13336; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm8 {%k1}
13337; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
13338; AVX512DQ-FCP-NEXT:    vmovaps %zmm0, (%rsi)
13339; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm24, (%rdx)
13340; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm20, (%rcx)
13341; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, (%r8)
13342; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, (%r9)
13343; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
13344; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, (%rax)
13345; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
13346; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, (%rax)
13347; AVX512DQ-FCP-NEXT:    popq %rax
13348; AVX512DQ-FCP-NEXT:    vzeroupper
13349; AVX512DQ-FCP-NEXT:    retq
13350;
13351; AVX512BW-LABEL: load_i8_stride7_vf64:
13352; AVX512BW:       # %bb.0:
13353; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm25
13354; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [8,1,18,11,4,5,22,15,0,25,10,0,12,29,14,0]
13355; AVX512BW-NEXT:    vpermw %zmm25, %zmm0, %zmm18
13356; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [0,1,18,11,4,21,14,7,8,25,10,0,28,13,0,15]
13357; AVX512BW-NEXT:    vpermw %zmm25, %zmm0, %zmm24
13358; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [0,17,10,3,4,21,14,7,24,9,0,11,28,13,0,31]
13359; AVX512BW-NEXT:    vpermw %zmm25, %zmm0, %zmm9
13360; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [16,17,10,3,20,13,6,23,24,25,0,27,28,0,30,31]
13361; AVX512BW-NEXT:    vpermw %zmm25, %zmm0, %zmm0
13362; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm10
13363; AVX512BW-NEXT:    vmovdqa 32(%rdi), %ymm1
13364; AVX512BW-NEXT:    movw $-28382, %ax # imm = 0x9122
13365; AVX512BW-NEXT:    kmovd %eax, %k1
13366; AVX512BW-NEXT:    vpblendmw %ymm1, %ymm10, %ymm3 {%k1}
13367; AVX512BW-NEXT:    kmovq %k1, %k2
13368; AVX512BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
13369; AVX512BW-NEXT:    vextracti128 $1, %ymm3, %xmm4
13370; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u,u,u,u,u]
13371; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u,u,u,u,u]
13372; AVX512BW-NEXT:    vporq %xmm4, %xmm3, %xmm16
13373; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
13374; AVX512BW-NEXT:    movw $992, %ax # imm = 0x3E0
13375; AVX512BW-NEXT:    kmovd %eax, %k1
13376; AVX512BW-NEXT:    vmovdqu16 %ymm0, %ymm16 {%k1}
13377; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm11
13378; AVX512BW-NEXT:    vmovdqa 160(%rdi), %ymm6
13379; AVX512BW-NEXT:    movw $8772, %ax # imm = 0x2244
13380; AVX512BW-NEXT:    kmovd %eax, %k6
13381; AVX512BW-NEXT:    vpblendmw %ymm11, %ymm6, %ymm0 {%k6}
13382; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm3
13383; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u]
13384; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u]
13385; AVX512BW-NEXT:    vpor %xmm3, %xmm0, %xmm0
13386; AVX512BW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
13387; AVX512BW-NEXT:    vmovdqa 192(%rdi), %xmm7
13388; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm21 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
13389; AVX512BW-NEXT:    vpshufb %xmm21, %xmm7, %xmm3
13390; AVX512BW-NEXT:    vmovdqa 208(%rdi), %xmm8
13391; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm8[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
13392; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
13393; AVX512BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
13394; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
13395; AVX512BW-NEXT:    vmovdqa64 240(%rdi), %xmm26
13396; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm26[5,12,u,u,u,u,u,u,u,u,u,u,u]
13397; AVX512BW-NEXT:    vmovdqa 224(%rdi), %xmm4
13398; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm12 = xmm4[0,7,14],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u]
13399; AVX512BW-NEXT:    vpor %xmm5, %xmm12, %xmm5
13400; AVX512BW-NEXT:    vinserti32x4 $2, %xmm5, %zmm0, %zmm0
13401; AVX512BW-NEXT:    movabsq $137438429184, %rax # imm = 0x1FFFF80000
13402; AVX512BW-NEXT:    kmovq %rax, %k5
13403; AVX512BW-NEXT:    vmovdqu8 %zmm0, %zmm16 {%k5}
13404; AVX512BW-NEXT:    vmovdqa 288(%rdi), %ymm13
13405; AVX512BW-NEXT:    vmovdqa 256(%rdi), %ymm12
13406; AVX512BW-NEXT:    movw $9288, %ax # imm = 0x2448
13407; AVX512BW-NEXT:    kmovd %eax, %k3
13408; AVX512BW-NEXT:    vpblendmw %ymm13, %ymm12, %ymm0 {%k3}
13409; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u]
13410; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm0
13411; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u]
13412; AVX512BW-NEXT:    vporq %xmm5, %xmm0, %xmm19
13413; AVX512BW-NEXT:    vmovdqa64 352(%rdi), %ymm17
13414; AVX512BW-NEXT:    vmovdqa 320(%rdi), %ymm0
13415; AVX512BW-NEXT:    vpblendmw %ymm17, %ymm0, %ymm5 {%k6}
13416; AVX512BW-NEXT:    vpermq {{.*#+}} ymm15 = ymm5[2,3,0,1]
13417; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm15[2],ymm5[3,4,5],ymm15[6],ymm5[7,8,9],ymm15[10],ymm5[11,12,13],ymm15[14],ymm5[15]
13418; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
13419; AVX512BW-NEXT:    movw $3968, %ax # imm = 0xF80
13420; AVX512BW-NEXT:    kmovd %eax, %k7
13421; AVX512BW-NEXT:    vmovdqu16 %ymm5, %ymm19 {%k7}
13422; AVX512BW-NEXT:    vmovdqa 416(%rdi), %ymm15
13423; AVX512BW-NEXT:    vmovdqa 384(%rdi), %ymm5
13424; AVX512BW-NEXT:    movw $4644, %ax # imm = 0x1224
13425; AVX512BW-NEXT:    kmovd %eax, %k4
13426; AVX512BW-NEXT:    vpblendmw %ymm15, %ymm5, %ymm20 {%k4}
13427; AVX512BW-NEXT:    vextracti32x4 $1, %ymm20, %xmm22
13428; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9]
13429; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero
13430; AVX512BW-NEXT:    vporq %xmm22, %xmm20, %xmm20
13431; AVX512BW-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm20
13432; AVX512BW-NEXT:    movl $-8388608, %eax # imm = 0xFF800000
13433; AVX512BW-NEXT:    vpblendmw %ymm1, %ymm10, %ymm22 {%k4}
13434; AVX512BW-NEXT:    vextracti32x4 $1, %ymm22, %xmm23
13435; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u]
13436; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm22 = xmm22[1,8,15],zero,zero,xmm22[4,11],zero,zero,xmm22[u,u,u,u,u,u,u]
13437; AVX512BW-NEXT:    vporq %xmm23, %xmm22, %xmm22
13438; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
13439; AVX512BW-NEXT:    movl $511, %edi # imm = 0x1FF
13440; AVX512BW-NEXT:    kmovd %edi, %k1
13441; AVX512BW-NEXT:    vmovdqu8 %ymm22, %ymm9 {%k1}
13442; AVX512BW-NEXT:    vpblendmw %ymm11, %ymm6, %ymm22 {%k3}
13443; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm23 = xmm22[u,u,u,6,13],zero,zero,xmm22[2,9],zero,zero,zero,xmm22[u,u,u,u]
13444; AVX512BW-NEXT:    vextracti32x4 $1, %ymm22, %xmm22
13445; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm22 = xmm22[u,u,u],zero,zero,xmm22[4,11],zero,zero,xmm22[0,7,14,u,u,u,u]
13446; AVX512BW-NEXT:    vporq %xmm23, %xmm22, %xmm22
13447; AVX512BW-NEXT:    vinserti32x4 $1, %xmm22, %ymm0, %ymm14
13448; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm22 = xmm8[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
13449; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm23 = xmm7[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
13450; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm22 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3]
13451; AVX512BW-NEXT:    vinserti32x4 $1, %xmm22, %ymm0, %ymm2
13452; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,6],ymm2[7]
13453; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm26[6,13,u,u,u,u,u,u,u,u,u,u,u]
13454; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm22 = xmm4[1,8,15],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u]
13455; AVX512BW-NEXT:    vporq %xmm14, %xmm22, %xmm14
13456; AVX512BW-NEXT:    vinserti32x4 $2, %xmm14, %zmm2, %zmm2
13457; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm9 {%k5}
13458; AVX512BW-NEXT:    vpblendmw %ymm1, %ymm10, %ymm2 {%k6}
13459; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u]
13460; AVX512BW-NEXT:    vextracti128 $1, %ymm2, %xmm2
13461; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u,u,u]
13462; AVX512BW-NEXT:    vpor %xmm2, %xmm14, %xmm2
13463; AVX512BW-NEXT:    movl $261632, %edi # imm = 0x3FE00
13464; AVX512BW-NEXT:    kmovd %edi, %k5
13465; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm2 {%k5} = ymm24[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
13466; AVX512BW-NEXT:    vpblendmw %ymm6, %ymm11, %ymm14 {%k2}
13467; AVX512BW-NEXT:    vextracti32x4 $1, %ymm14, %xmm22
13468; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm22 = xmm22[u,u],zero,zero,zero,xmm22[5,12],zero,zero,xmm22[1,8,15,u,u,u,u]
13469; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u]
13470; AVX512BW-NEXT:    vporq %xmm22, %xmm14, %xmm14
13471; AVX512BW-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
13472; AVX512BW-NEXT:    vpshufb %xmm21, %xmm8, %xmm21
13473; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm22 = xmm7[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
13474; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm21 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3]
13475; AVX512BW-NEXT:    vinserti32x4 $1, %xmm21, %ymm0, %ymm3
13476; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5,6],ymm3[7]
13477; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm4[2,9],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u]
13478; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm21 = zero,zero,xmm26[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
13479; AVX512BW-NEXT:    vporq %xmm14, %xmm21, %xmm14
13480; AVX512BW-NEXT:    vinserti32x4 $2, %xmm14, %zmm3, %zmm22
13481; AVX512BW-NEXT:    vmovdqu16 %zmm2, %zmm22 {%k1}
13482; AVX512BW-NEXT:    vpblendmw %ymm1, %ymm10, %ymm2 {%k3}
13483; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u]
13484; AVX512BW-NEXT:    vextracti128 $1, %ymm2, %xmm2
13485; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u]
13486; AVX512BW-NEXT:    vpor %xmm3, %xmm2, %xmm2
13487; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm2 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
13488; AVX512BW-NEXT:    vpblendmw %ymm6, %ymm11, %ymm3 {%k4}
13489; AVX512BW-NEXT:    vextracti32x4 $1, %ymm3, %xmm18
13490; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u],zero,zero,zero,xmm18[6,13],zero,zero,xmm18[2,9,u,u,u,u,u]
13491; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u,u,u]
13492; AVX512BW-NEXT:    vporq %xmm18, %xmm3, %xmm3
13493; AVX512BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
13494; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm18 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[5,12]
13495; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm21 = xmm7[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
13496; AVX512BW-NEXT:    vporq %xmm18, %xmm21, %xmm18
13497; AVX512BW-NEXT:    vinserti32x4 $1, %xmm18, %ymm0, %ymm18
13498; AVX512BW-NEXT:    movl $-134217728, %edi # imm = 0xF8000000
13499; AVX512BW-NEXT:    kmovd %edi, %k2
13500; AVX512BW-NEXT:    kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
13501; AVX512BW-NEXT:    vmovdqu8 %ymm18, %ymm3 {%k2}
13502; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm18 = xmm4[3,10],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u]
13503; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm21 = zero,zero,xmm26[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
13504; AVX512BW-NEXT:    vporq %xmm18, %xmm21, %xmm18
13505; AVX512BW-NEXT:    vinserti32x4 $2, %xmm18, %zmm3, %zmm18
13506; AVX512BW-NEXT:    vmovdqu16 %zmm2, %zmm18 {%k1}
13507; AVX512BW-NEXT:    kmovd %eax, %k2
13508; AVX512BW-NEXT:    vmovdqu8 %ymm20, %ymm19 {%k2}
13509; AVX512BW-NEXT:    vinserti64x4 $1, %ymm19, %zmm0, %zmm2
13510; AVX512BW-NEXT:    movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000
13511; AVX512BW-NEXT:    kmovq %rax, %k1
13512; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm16 {%k1}
13513; AVX512BW-NEXT:    vpblendmw %ymm12, %ymm13, %ymm2 {%k4}
13514; AVX512BW-NEXT:    vextracti128 $1, %ymm2, %xmm3
13515; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u]
13516; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u]
13517; AVX512BW-NEXT:    vpor %xmm3, %xmm2, %xmm2
13518; AVX512BW-NEXT:    vpblendmw %ymm17, %ymm0, %ymm3 {%k3}
13519; AVX512BW-NEXT:    vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1]
13520; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7,8,9,10],ymm14[11],ymm3[12,13],ymm14[14],ymm3[15]
13521; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
13522; AVX512BW-NEXT:    vmovdqu16 %ymm3, %ymm2 {%k7}
13523; AVX512BW-NEXT:    vpblendmw %ymm15, %ymm5, %ymm3 {%k6}
13524; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero
13525; AVX512BW-NEXT:    vextracti128 $1, %ymm3, %xmm3
13526; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10]
13527; AVX512BW-NEXT:    vpor %xmm3, %xmm14, %xmm3
13528; AVX512BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
13529; AVX512BW-NEXT:    vmovdqu8 %ymm3, %ymm2 {%k2}
13530; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm2
13531; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm9 {%k1}
13532; AVX512BW-NEXT:    vpblendmw %ymm12, %ymm13, %ymm2 {%k6}
13533; AVX512BW-NEXT:    vextracti128 $1, %ymm2, %xmm3
13534; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u]
13535; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero,xmm2[u,u]
13536; AVX512BW-NEXT:    vpor %xmm3, %xmm2, %xmm2
13537; AVX512BW-NEXT:    vpblendmw %ymm0, %ymm17, %ymm3 {%k4}
13538; AVX512BW-NEXT:    vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1]
13539; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm3 = ymm14[0],ymm3[1,2],ymm14[3],ymm3[4,5,6],ymm14[7,8],ymm3[9,10],ymm14[11],ymm3[12,13,14],ymm14[15]
13540; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
13541; AVX512BW-NEXT:    vmovdqu16 %ymm3, %ymm2 {%k7}
13542; AVX512BW-NEXT:    vpblendmw %ymm15, %ymm5, %ymm3 {%k3}
13543; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero
13544; AVX512BW-NEXT:    vextracti128 $1, %ymm3, %xmm3
13545; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11]
13546; AVX512BW-NEXT:    vpor %xmm3, %xmm14, %xmm3
13547; AVX512BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
13548; AVX512BW-NEXT:    vmovdqu8 %ymm3, %ymm2 {%k2}
13549; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm2
13550; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm22 {%k1}
13551; AVX512BW-NEXT:    vpblendmw %ymm12, %ymm13, %ymm2 {%k3}
13552; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u]
13553; AVX512BW-NEXT:    vextracti128 $1, %ymm2, %xmm2
13554; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u]
13555; AVX512BW-NEXT:    vpor %xmm3, %xmm2, %xmm2
13556; AVX512BW-NEXT:    vpblendmw %ymm0, %ymm17, %ymm3 {%k6}
13557; AVX512BW-NEXT:    vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1]
13558; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm3 = ymm14[0],ymm3[1,2,3],ymm14[4],ymm3[5,6],ymm14[7,8],ymm3[9,10,11],ymm14[12],ymm3[13,14],ymm14[15]
13559; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
13560; AVX512BW-NEXT:    vmovdqu16 %ymm3, %ymm2 {%k7}
13561; AVX512BW-NEXT:    vpblendmw %ymm5, %ymm15, %ymm3 {%k4}
13562; AVX512BW-NEXT:    vextracti128 $1, %ymm3, %xmm14
13563; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12]
13564; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero
13565; AVX512BW-NEXT:    vpor %xmm3, %xmm14, %xmm3
13566; AVX512BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
13567; AVX512BW-NEXT:    vmovdqu8 %ymm3, %ymm2 {%k2}
13568; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm2
13569; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm18 {%k1}
13570; AVX512BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
13571; AVX512BW-NEXT:    vpblendmw %ymm13, %ymm12, %ymm2 {%k1}
13572; AVX512BW-NEXT:    vextracti128 $1, %ymm2, %xmm3
13573; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u]
13574; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u]
13575; AVX512BW-NEXT:    vporq %xmm3, %xmm2, %xmm19
13576; AVX512BW-NEXT:    vpblendmw %ymm0, %ymm17, %ymm2 {%k3}
13577; AVX512BW-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
13578; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14,15]
13579; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
13580; AVX512BW-NEXT:    vmovdqu16 %ymm2, %ymm19 {%k7}
13581; AVX512BW-NEXT:    vpblendmw %ymm13, %ymm12, %ymm2 {%k4}
13582; AVX512BW-NEXT:    vextracti128 $1, %ymm2, %xmm3
13583; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u]
13584; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u]
13585; AVX512BW-NEXT:    vpor %xmm3, %xmm2, %xmm2
13586; AVX512BW-NEXT:    vpblendmw %ymm17, %ymm0, %ymm3 {%k1}
13587; AVX512BW-NEXT:    kmovq %k1, %k7
13588; AVX512BW-NEXT:    vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1]
13589; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm14[1],ymm3[2,3,4],ymm14[5],ymm3[6,7,8],ymm14[9],ymm3[10,11,12],ymm14[13],ymm3[14,15]
13590; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm20 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
13591; AVX512BW-NEXT:    movl $8176, %eax # imm = 0x1FF0
13592; AVX512BW-NEXT:    kmovd %eax, %k1
13593; AVX512BW-NEXT:    vmovdqu8 %ymm2, %ymm20 {%k1}
13594; AVX512BW-NEXT:    vpblendmw %ymm5, %ymm15, %ymm2 {%k6}
13595; AVX512BW-NEXT:    vextracti128 $1, %ymm2, %xmm3
13596; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13]
13597; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero
13598; AVX512BW-NEXT:    vpor %xmm3, %xmm2, %xmm2
13599; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
13600; AVX512BW-NEXT:    vmovdqu8 %ymm2, %ymm19 {%k2}
13601; AVX512BW-NEXT:    vpblendmw %ymm5, %ymm15, %ymm2 {%k3}
13602; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero
13603; AVX512BW-NEXT:    vextracti128 $1, %ymm2, %xmm2
13604; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14]
13605; AVX512BW-NEXT:    vpor %xmm3, %xmm2, %xmm2
13606; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
13607; AVX512BW-NEXT:    vmovdqu8 %ymm2, %ymm20 {%k2}
13608; AVX512BW-NEXT:    vpblendmw %ymm6, %ymm11, %ymm2 {%k6}
13609; AVX512BW-NEXT:    vpblendmw %ymm10, %ymm1, %ymm21 {%k6}
13610; AVX512BW-NEXT:    vmovdqu16 %ymm17, %ymm0 {%k4}
13611; AVX512BW-NEXT:    vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
13612; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm3 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15]
13613; AVX512BW-NEXT:    vmovdqu16 %ymm13, %ymm12 {%k6}
13614; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm12[u,u,u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u]
13615; AVX512BW-NEXT:    vextracti128 $1, %ymm12, %xmm12
13616; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u],zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u]
13617; AVX512BW-NEXT:    vpor %xmm0, %xmm12, %xmm0
13618; AVX512BW-NEXT:    movl $4186112, %eax # imm = 0x3FE000
13619; AVX512BW-NEXT:    kmovd %eax, %k1
13620; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u]
13621; AVX512BW-NEXT:    vmovdqu16 %ymm15, %ymm5 {%k7}
13622; AVX512BW-NEXT:    vpblendmw %ymm10, %ymm1, %ymm12 {%k4}
13623; AVX512BW-NEXT:    vpblendmw %ymm6, %ymm11, %ymm3 {%k3}
13624; AVX512BW-NEXT:    vmovdqu16 %ymm10, %ymm1 {%k3}
13625; AVX512BW-NEXT:    vmovdqu16 %ymm11, %ymm6 {%k4}
13626; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm10 = xmm2[u,u,2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u]
13627; AVX512BW-NEXT:    vextracti128 $1, %ymm2, %xmm2
13628; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u]
13629; AVX512BW-NEXT:    vpor %xmm2, %xmm10, %xmm2
13630; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
13631; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[6,13]
13632; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
13633; AVX512BW-NEXT:    vpor %xmm10, %xmm11, %xmm10
13634; AVX512BW-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
13635; AVX512BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
13636; AVX512BW-NEXT:    vmovdqu8 %ymm10, %ymm2 {%k1}
13637; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm10 = xmm3[u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[u,u,u,u,u]
13638; AVX512BW-NEXT:    vextracti128 $1, %ymm3, %xmm3
13639; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11,u,u,u,u,u]
13640; AVX512BW-NEXT:    vpor %xmm3, %xmm10, %xmm3
13641; AVX512BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
13642; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm10 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
13643; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm11 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14]
13644; AVX512BW-NEXT:    vpor %xmm10, %xmm11, %xmm10
13645; AVX512BW-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
13646; AVX512BW-NEXT:    vmovdqu8 %ymm10, %ymm3 {%k1}
13647; AVX512BW-NEXT:    vextracti128 $1, %ymm6, %xmm10
13648; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[5,12,u,u,u,u,u]
13649; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u]
13650; AVX512BW-NEXT:    vpor %xmm6, %xmm10, %xmm6
13651; AVX512BW-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
13652; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
13653; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[1,8,15]
13654; AVX512BW-NEXT:    vpor %xmm7, %xmm8, %xmm7
13655; AVX512BW-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
13656; AVX512BW-NEXT:    vmovdqu8 %ymm7, %ymm6 {%k1}
13657; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [16,9,2,19,20,13,6,23,24,0,26,27,28,0,30,31]
13658; AVX512BW-NEXT:    vpermw %zmm25, %zmm7, %zmm7
13659; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [16,9,2,19,12,5,22,23,24,0,26,27,0,29,30,31]
13660; AVX512BW-NEXT:    vpermw %zmm25, %zmm8, %zmm8
13661; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm10 = [8,1,2,19,12,5,22,15,0,9,26,11,0,29,14,0]
13662; AVX512BW-NEXT:    vpermw %zmm25, %zmm10, %zmm10
13663; AVX512BW-NEXT:    vextracti128 $1, %ymm12, %xmm11
13664; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u]
13665; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[4,11],zero,zero,xmm12[0,7,14],zero,zero,xmm12[u,u,u,u,u,u,u]
13666; AVX512BW-NEXT:    vpor %xmm11, %xmm12, %xmm11
13667; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm11 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
13668; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm10 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
13669; AVX512BW-NEXT:    vpshufb %xmm10, %xmm4, %xmm12
13670; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm26[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
13671; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
13672; AVX512BW-NEXT:    vinserti64x4 $1, %ymm12, %zmm2, %zmm2
13673; AVX512BW-NEXT:    vmovdqu16 %zmm2, %zmm11 {%k5}
13674; AVX512BW-NEXT:    vinserti64x4 $1, %ymm19, %zmm0, %zmm2
13675; AVX512BW-NEXT:    movw $-512, %ax # imm = 0xFE00
13676; AVX512BW-NEXT:    vextracti32x4 $1, %ymm21, %xmm12
13677; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u]
13678; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm21[5,12],zero,zero,xmm21[1,8,15],zero,zero,xmm21[u,u,u,u,u,u,u]
13679; AVX512BW-NEXT:    vpor %xmm12, %xmm13, %xmm12
13680; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm12 {%k5} = ymm8[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
13681; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm8 = xmm26[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
13682; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm4[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
13683; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3]
13684; AVX512BW-NEXT:    vinserti64x4 $1, %ymm8, %zmm3, %zmm3
13685; AVX512BW-NEXT:    vmovdqu16 %zmm3, %zmm12 {%k5}
13686; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u]
13687; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm1
13688; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u]
13689; AVX512BW-NEXT:    vpor %xmm3, %xmm1, %xmm1
13690; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm1 {%k5} = ymm7[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
13691; AVX512BW-NEXT:    vpshufb %xmm10, %xmm26, %xmm3
13692; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
13693; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
13694; AVX512BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm6, %zmm3
13695; AVX512BW-NEXT:    vmovdqu16 %zmm3, %zmm1 {%k5}
13696; AVX512BW-NEXT:    kmovd %eax, %k1
13697; AVX512BW-NEXT:    vmovdqa32 %zmm2, %zmm11 {%k1}
13698; AVX512BW-NEXT:    vinserti64x4 $1, %ymm20, %zmm0, %zmm2
13699; AVX512BW-NEXT:    vmovdqa32 %zmm2, %zmm12 {%k1}
13700; AVX512BW-NEXT:    vextracti128 $1, %ymm5, %xmm2
13701; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15]
13702; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u,u,0,7,14],zero,zero,xmm5[3,10],zero,zero,zero
13703; AVX512BW-NEXT:    vpor %xmm2, %xmm3, %xmm2
13704; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
13705; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7],ymm0[8,9,10],ymm2[11,12,13,14,15]
13706; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
13707; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
13708; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
13709; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
13710; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
13711; AVX512BW-NEXT:    vmovdqa64 %zmm16, (%rsi)
13712; AVX512BW-NEXT:    vmovdqa64 %zmm9, (%rdx)
13713; AVX512BW-NEXT:    vmovdqa64 %zmm22, (%rcx)
13714; AVX512BW-NEXT:    vmovdqa64 %zmm18, (%r8)
13715; AVX512BW-NEXT:    vmovdqa64 %zmm11, (%r9)
13716; AVX512BW-NEXT:    vmovdqa64 %zmm12, (%rdi)
13717; AVX512BW-NEXT:    vmovdqa64 %zmm1, (%rax)
13718; AVX512BW-NEXT:    vzeroupper
13719; AVX512BW-NEXT:    retq
13720;
13721; AVX512BW-FCP-LABEL: load_i8_stride7_vf64:
13722; AVX512BW-FCP:       # %bb.0:
13723; AVX512BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm0
13724; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm2
13725; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [24,17,2,19,28,21,6,31,16,9,26,27,20,13,30,23]
13726; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm24
13727; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7]
13728; AVX512BW-FCP-NEXT:    vpermw %zmm2, %zmm1, %zmm13
13729; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23]
13730; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm25
13731; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15]
13732; AVX512BW-FCP-NEXT:    vpermw %zmm2, %zmm1, %zmm12
13733; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,1,2,11,4,5,14,7,8,9,26,19,12,29,22,15]
13734; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm16
13735; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31]
13736; AVX512BW-FCP-NEXT:    vpermw %zmm2, %zmm1, %zmm8
13737; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,1,10,3,4,5,14,7,8,25,18,11,12,29,22,15]
13738; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm4
13739; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31]
13740; AVX512BW-FCP-NEXT:    vpermw %zmm2, %zmm1, %zmm5
13741; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %ymm10
13742; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm3
13743; AVX512BW-FCP-NEXT:    movw $-28382, %ax # imm = 0x9122
13744; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
13745; AVX512BW-FCP-NEXT:    vpblendmw %ymm3, %ymm10, %ymm1 {%k1}
13746; AVX512BW-FCP-NEXT:    kmovq %k1, %k2
13747; AVX512BW-FCP-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
13748; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm6
13749; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u,u,u]
13750; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u]
13751; AVX512BW-FCP-NEXT:    vpor %xmm6, %xmm1, %xmm1
13752; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
13753; AVX512BW-FCP-NEXT:    movw $992, %ax # imm = 0x3E0
13754; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
13755; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm5, %ymm1 {%k1}
13756; AVX512BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm11
13757; AVX512BW-FCP-NEXT:    vmovdqa 160(%rdi), %ymm9
13758; AVX512BW-FCP-NEXT:    movw $8772, %ax # imm = 0x2244
13759; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
13760; AVX512BW-FCP-NEXT:    vpblendmw %ymm11, %ymm9, %ymm5 {%k1}
13761; AVX512BW-FCP-NEXT:    kmovq %k1, %k3
13762; AVX512BW-FCP-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
13763; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm6
13764; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,xmm6[3,10],zero,zero,zero,xmm6[6,13,u,u,u,u]
13765; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,5,12],zero,zero,xmm5[1,8,15],zero,zero,xmm5[u,u,u,u]
13766; AVX512BW-FCP-NEXT:    vpor %xmm6, %xmm5, %xmm5
13767; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
13768; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,1,2,4,6]
13769; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm17
13770; AVX512BW-FCP-NEXT:    vpermd %ymm17, %ymm6, %ymm6
13771; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
13772; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
13773; AVX512BW-FCP-NEXT:    vmovdqa64 240(%rdi), %xmm19
13774; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm19[5,12,u,u,u,u,u,u,u,u,u,u,u]
13775; AVX512BW-FCP-NEXT:    vmovdqa64 224(%rdi), %xmm20
13776; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm20[0,7,14],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
13777; AVX512BW-FCP-NEXT:    vpor %xmm6, %xmm7, %xmm6
13778; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm6, %zmm5, %zmm5
13779; AVX512BW-FCP-NEXT:    movabsq $137438429184, %rax # imm = 0x1FFFF80000
13780; AVX512BW-FCP-NEXT:    kmovq %rax, %k5
13781; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm5, %zmm1 {%k5}
13782; AVX512BW-FCP-NEXT:    vmovdqa 288(%rdi), %ymm6
13783; AVX512BW-FCP-NEXT:    vmovdqa 256(%rdi), %ymm5
13784; AVX512BW-FCP-NEXT:    movw $9288, %ax # imm = 0x2448
13785; AVX512BW-FCP-NEXT:    kmovd %eax, %k6
13786; AVX512BW-FCP-NEXT:    vpblendmw %ymm6, %ymm5, %ymm7 {%k6}
13787; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm7[u,u,u,u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[u,u]
13788; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm7
13789; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11,u,u]
13790; AVX512BW-FCP-NEXT:    vporq %xmm18, %xmm7, %xmm21
13791; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
13792; AVX512BW-FCP-NEXT:    movw $3968, %ax # imm = 0xF80
13793; AVX512BW-FCP-NEXT:    kmovd %eax, %k7
13794; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm4, %ymm21 {%k7}
13795; AVX512BW-FCP-NEXT:    vmovdqa 416(%rdi), %ymm7
13796; AVX512BW-FCP-NEXT:    vmovdqa 384(%rdi), %ymm4
13797; AVX512BW-FCP-NEXT:    movw $4644, %ax # imm = 0x1224
13798; AVX512BW-FCP-NEXT:    kmovd %eax, %k4
13799; AVX512BW-FCP-NEXT:    vpblendmw %ymm7, %ymm4, %ymm18 {%k4}
13800; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm18, %xmm22
13801; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9]
13802; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm18[4,11],zero,zero
13803; AVX512BW-FCP-NEXT:    vporq %xmm22, %xmm18, %xmm18
13804; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm18, %ymm0, %ymm22
13805; AVX512BW-FCP-NEXT:    movl $-8388608, %eax # imm = 0xFF800000
13806; AVX512BW-FCP-NEXT:    vpblendmw %ymm3, %ymm10, %ymm18 {%k4}
13807; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm18, %xmm23
13808; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u]
13809; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[1,8,15],zero,zero,xmm18[4,11],zero,zero,xmm18[u,u,u,u,u,u,u]
13810; AVX512BW-FCP-NEXT:    vporq %xmm23, %xmm18, %xmm18
13811; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
13812; AVX512BW-FCP-NEXT:    movl $511, %r10d # imm = 0x1FF
13813; AVX512BW-FCP-NEXT:    kmovd %r10d, %k1
13814; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm18, %ymm8 {%k1}
13815; AVX512BW-FCP-NEXT:    vpblendmw %ymm11, %ymm9, %ymm18 {%k6}
13816; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm23 = xmm18[u,u,u,6,13],zero,zero,xmm18[2,9],zero,zero,zero,xmm18[u,u,u,u]
13817; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm18, %xmm18
13818; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u,u],zero,zero,xmm18[4,11],zero,zero,xmm18[0,7,14,u,u,u,u]
13819; AVX512BW-FCP-NEXT:    vporq %xmm23, %xmm18, %xmm18
13820; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm18, %ymm0, %ymm14
13821; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm18 = [0,0,0,0,1,3,4,6]
13822; AVX512BW-FCP-NEXT:    vpermd %ymm17, %ymm18, %ymm18
13823; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
13824; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7]
13825; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm19[6,13,u,u,u,u,u,u,u,u,u,u,u]
13826; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm20[1,8,15],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
13827; AVX512BW-FCP-NEXT:    vporq %xmm15, %xmm18, %xmm15
13828; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm15, %zmm14, %zmm14
13829; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm8 {%k5}
13830; AVX512BW-FCP-NEXT:    vpblendmw %ymm3, %ymm10, %ymm14 {%k3}
13831; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm14[2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u,u,u,u,u]
13832; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm14
13833; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[0,7,14],zero,zero,xmm14[3,10,u,u,u,u,u,u,u]
13834; AVX512BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
13835; AVX512BW-FCP-NEXT:    movl $261632, %r10d # imm = 0x3FE00
13836; AVX512BW-FCP-NEXT:    kmovd %r10d, %k5
13837; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 {%k5} = ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
13838; AVX512BW-FCP-NEXT:    vpblendmw %ymm9, %ymm11, %ymm12 {%k2}
13839; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm15
13840; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[5,12],zero,zero,xmm15[1,8,15,u,u,u,u]
13841; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u]
13842; AVX512BW-FCP-NEXT:    vpor %xmm15, %xmm12, %xmm12
13843; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
13844; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm15 = [0,0,0,0,1,3,5,6]
13845; AVX512BW-FCP-NEXT:    vpermd %ymm17, %ymm15, %ymm15
13846; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
13847; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm15[7]
13848; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
13849; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = zero,zero,xmm19[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
13850; AVX512BW-FCP-NEXT:    vporq %xmm15, %xmm17, %xmm15
13851; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm15, %zmm12, %zmm12
13852; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm14, %zmm12 {%k1}
13853; AVX512BW-FCP-NEXT:    vpblendmw %ymm3, %ymm10, %ymm14 {%k6}
13854; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u]
13855; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm14
13856; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u]
13857; AVX512BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
13858; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 {%k5} = ymm13[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
13859; AVX512BW-FCP-NEXT:    vpblendmw %ymm9, %ymm11, %ymm13 {%k4}
13860; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm15
13861; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u]
13862; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u]
13863; AVX512BW-FCP-NEXT:    vpor %xmm15, %xmm13, %xmm13
13864; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
13865; AVX512BW-FCP-NEXT:    vmovdqa64 208(%rdi), %xmm17
13866; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm17[5,12]
13867; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %xmm18
13868; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm23 = xmm18[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
13869; AVX512BW-FCP-NEXT:    vporq %xmm15, %xmm23, %xmm15
13870; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
13871; AVX512BW-FCP-NEXT:    movl $-134217728, %r10d # imm = 0xF8000000
13872; AVX512BW-FCP-NEXT:    kmovd %r10d, %k2
13873; AVX512BW-FCP-NEXT:    kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
13874; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm15, %ymm13 {%k2}
13875; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm20[3,10],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
13876; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = zero,zero,xmm19[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
13877; AVX512BW-FCP-NEXT:    vporq %xmm15, %xmm19, %xmm15
13878; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm15, %zmm13, %zmm13
13879; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm14, %zmm13 {%k1}
13880; AVX512BW-FCP-NEXT:    kmovd %eax, %k3
13881; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm22, %ymm21 {%k3}
13882; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm21, %zmm0, %zmm14
13883; AVX512BW-FCP-NEXT:    movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000
13884; AVX512BW-FCP-NEXT:    kmovq %rax, %k2
13885; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm1 {%k2}
13886; AVX512BW-FCP-NEXT:    vpblendmw %ymm5, %ymm6, %ymm14 {%k4}
13887; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm15
13888; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u]
13889; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u]
13890; AVX512BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
13891; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
13892; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm15, %ymm14 {%k7}
13893; AVX512BW-FCP-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
13894; AVX512BW-FCP-NEXT:    vpblendmw %ymm7, %ymm4, %ymm15 {%k1}
13895; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm15[5,12],zero,zero
13896; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm15
13897; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[0,7,14],zero,zero,xmm15[3,10]
13898; AVX512BW-FCP-NEXT:    vporq %xmm16, %xmm15, %xmm15
13899; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
13900; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm15, %ymm14 {%k3}
13901; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm0, %zmm14
13902; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm8 {%k2}
13903; AVX512BW-FCP-NEXT:    vpblendmw %ymm5, %ymm6, %ymm14 {%k1}
13904; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm15
13905; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u]
13906; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u]
13907; AVX512BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
13908; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm25[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
13909; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm15, %ymm14 {%k7}
13910; AVX512BW-FCP-NEXT:    vpblendmw %ymm7, %ymm4, %ymm15 {%k6}
13911; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm15[6,13],zero,zero
13912; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm15
13913; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[1,8,15],zero,zero,xmm15[4,11]
13914; AVX512BW-FCP-NEXT:    vporq %xmm16, %xmm15, %xmm15
13915; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
13916; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm15, %ymm14 {%k3}
13917; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm0, %zmm14
13918; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm12 {%k2}
13919; AVX512BW-FCP-NEXT:    vpblendmw %ymm5, %ymm6, %ymm14 {%k6}
13920; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,6,13],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[u,u]
13921; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm14
13922; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u],zero,zero,xmm14[4,11],zero,zero,xmm14[0,7,14,u,u]
13923; AVX512BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
13924; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm24[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
13925; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm15, %ymm14 {%k7}
13926; AVX512BW-FCP-NEXT:    vpblendmw %ymm4, %ymm7, %ymm15 {%k4}
13927; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm15, %xmm16
13928; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[2,9],zero,zero,zero,xmm16[5,12]
13929; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u,4,11],zero,zero,xmm15[0,7,14],zero,zero
13930; AVX512BW-FCP-NEXT:    vporq %xmm16, %xmm15, %xmm15
13931; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
13932; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm15, %ymm14 {%k3}
13933; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm0, %zmm14
13934; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm13 {%k2}
13935; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31]
13936; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm14, %zmm15
13937; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31]
13938; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm14, %zmm16
13939; AVX512BW-FCP-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
13940; AVX512BW-FCP-NEXT:    vpblendmw %ymm6, %ymm5, %ymm14 {%k2}
13941; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm14, %xmm19
13942; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[5,12],zero,zero,xmm19[1,8,15,u,u]
13943; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u]
13944; AVX512BW-FCP-NEXT:    vporq %xmm19, %xmm14, %xmm14
13945; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
13946; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm16, %ymm14 {%k7}
13947; AVX512BW-FCP-NEXT:    vpblendmw %ymm6, %ymm5, %ymm16 {%k4}
13948; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm19
13949; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9,u,u,u]
13950; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,1,8,15],zero,zero,xmm16[4,11],zero,zero,xmm16[u,u,u]
13951; AVX512BW-FCP-NEXT:    vporq %xmm19, %xmm16, %xmm16
13952; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
13953; AVX512BW-FCP-NEXT:    movl $8176, %eax # imm = 0x1FF0
13954; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
13955; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm16, %ymm15 {%k1}
13956; AVX512BW-FCP-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
13957; AVX512BW-FCP-NEXT:    vpblendmw %ymm4, %ymm7, %ymm16 {%k1}
13958; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm19
13959; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u],zero,zero,xmm19[3,10],zero,zero,zero,xmm19[6,13]
13960; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,5,12],zero,zero,xmm16[1,8,15],zero,zero
13961; AVX512BW-FCP-NEXT:    vporq %xmm19, %xmm16, %xmm16
13962; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm16, %ymm0, %ymm16
13963; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm16, %ymm14 {%k3}
13964; AVX512BW-FCP-NEXT:    vpblendmw %ymm4, %ymm7, %ymm16 {%k6}
13965; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm16[u,u,u,u,u,u,u,6,13],zero,zero,xmm16[2,9],zero,zero,zero
13966; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm16
13967; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[4,11],zero,zero,xmm16[0,7,14]
13968; AVX512BW-FCP-NEXT:    vporq %xmm19, %xmm16, %xmm16
13969; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm16, %ymm0, %ymm16
13970; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm16, %ymm15 {%k3}
13971; AVX512BW-FCP-NEXT:    vpblendmw %ymm10, %ymm3, %ymm19 {%k4}
13972; AVX512BW-FCP-NEXT:    vpblendmw %ymm10, %ymm3, %ymm16 {%k1}
13973; AVX512BW-FCP-NEXT:    vpblendmw %ymm9, %ymm11, %ymm20 {%k6}
13974; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm10, %ymm3 {%k6}
13975; AVX512BW-FCP-NEXT:    vpblendmw %ymm9, %ymm11, %ymm10 {%k1}
13976; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm11, %ymm9 {%k4}
13977; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm10[u,u,2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u,u,u]
13978; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm10
13979; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[0,7,14],zero,zero,xmm10[3,10,u,u,u,u,u]
13980; AVX512BW-FCP-NEXT:    vpor %xmm11, %xmm10, %xmm10
13981; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
13982; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm17[6,13]
13983; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = xmm18[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
13984; AVX512BW-FCP-NEXT:    vporq %xmm11, %xmm21, %xmm11
13985; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
13986; AVX512BW-FCP-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload
13987; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm11, %ymm10 {%k3}
13988; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm20[u,u,3,10],zero,zero,zero,xmm20[6,13],zero,zero,xmm20[u,u,u,u,u]
13989; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm20, %xmm20
13990; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,xmm20[1,8,15],zero,zero,xmm20[4,11,u,u,u,u,u]
13991; AVX512BW-FCP-NEXT:    vporq %xmm11, %xmm20, %xmm11
13992; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
13993; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm20 = xmm18[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
13994; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[0,7,14]
13995; AVX512BW-FCP-NEXT:    vporq %xmm20, %xmm21, %xmm20
13996; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm20
13997; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm20, %ymm11 {%k3}
13998; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm9, %xmm20
13999; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,xmm20[2,9],zero,zero,zero,xmm20[5,12,u,u,u,u,u]
14000; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,4,11],zero,zero,xmm9[0,7,14],zero,zero,xmm9[u,u,u,u,u]
14001; AVX512BW-FCP-NEXT:    vporq %xmm20, %xmm9, %xmm9
14002; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
14003; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
14004; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15]
14005; AVX512BW-FCP-NEXT:    vporq %xmm18, %xmm17, %xmm17
14006; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm17, %ymm0, %ymm17
14007; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm17, %ymm9 {%k3}
14008; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm17 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31]
14009; AVX512BW-FCP-NEXT:    vpermw %zmm2, %zmm17, %zmm17
14010; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm18 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31]
14011; AVX512BW-FCP-NEXT:    vpermw %zmm2, %zmm18, %zmm18
14012; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm20 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7]
14013; AVX512BW-FCP-NEXT:    vpermw %zmm2, %zmm20, %zmm20
14014; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm19, %xmm2
14015; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u]
14016; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm19[4,11],zero,zero,xmm19[0,7,14],zero,zero,xmm19[u,u,u,u,u,u,u]
14017; AVX512BW-FCP-NEXT:    vporq %xmm2, %xmm19, %xmm2
14018; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm2 {%k5} = ymm20[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
14019; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm19 = [1,2,4,6,0,0,0,0]
14020; AVX512BW-FCP-NEXT:    vmovdqa64 224(%rdi), %ymm20
14021; AVX512BW-FCP-NEXT:    vpermd %ymm20, %ymm19, %ymm19
14022; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm19[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u]
14023; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm19, %zmm10, %zmm10
14024; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm10, %zmm2 {%k5}
14025; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm10
14026; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[3,10],zero,zero,zero,xmm10[6,13,u,u,u,u,u,u,u]
14027; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[5,12],zero,zero,xmm16[1,8,15],zero,zero,xmm16[u,u,u,u,u,u,u]
14028; AVX512BW-FCP-NEXT:    vporq %xmm10, %xmm16, %xmm10
14029; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm10 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
14030; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [1,3,4,6,0,0,0,0]
14031; AVX512BW-FCP-NEXT:    vpermd %ymm20, %ymm16, %ymm16
14032; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u]
14033; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm16, %zmm11, %zmm11
14034; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm11, %zmm10 {%k5}
14035; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u]
14036; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm3
14037; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u,u,u,u]
14038; AVX512BW-FCP-NEXT:    vpor %xmm3, %xmm11, %xmm3
14039; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm3 {%k5} = ymm17[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
14040; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [1,3,5,6,0,0,0,0]
14041; AVX512BW-FCP-NEXT:    vpermd %ymm20, %ymm11, %ymm11
14042; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u]
14043; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm11, %zmm9, %zmm9
14044; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm9, %zmm3 {%k5}
14045; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm9 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15]
14046; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm9, %zmm0
14047; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm6, %ymm5 {%k1}
14048; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,2,9],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[u,u,u]
14049; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm5
14050; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,10,u,u,u]
14051; AVX512BW-FCP-NEXT:    vpor %xmm6, %xmm5, %xmm5
14052; AVX512BW-FCP-NEXT:    movl $4186112, %eax # imm = 0x3FE000
14053; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
14054; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm5 {%k1} = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u]
14055; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm0, %zmm0
14056; AVX512BW-FCP-NEXT:    movw $-512, %ax # imm = 0xFE00
14057; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm7, %ymm4 {%k2}
14058; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
14059; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
14060; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm0, %zmm0
14061; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm10 {%k1}
14062; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm0
14063; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15]
14064; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,0,7,14],zero,zero,xmm4[3,10],zero,zero,zero
14065; AVX512BW-FCP-NEXT:    vpor %xmm0, %xmm4, %xmm0
14066; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
14067; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7],ymm5[8,9,10],ymm0[11,12,13,14,15]
14068; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
14069; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
14070; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm3 {%k1}
14071; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
14072; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
14073; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, (%rsi)
14074; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, (%rdx)
14075; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm12, (%rcx)
14076; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm13, (%r8)
14077; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, (%r9)
14078; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, (%rdi)
14079; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, (%rax)
14080; AVX512BW-FCP-NEXT:    vzeroupper
14081; AVX512BW-FCP-NEXT:    retq
14082;
14083; AVX512DQ-BW-LABEL: load_i8_stride7_vf64:
14084; AVX512DQ-BW:       # %bb.0:
14085; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm25
14086; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [8,1,18,11,4,5,22,15,0,25,10,0,12,29,14,0]
14087; AVX512DQ-BW-NEXT:    vpermw %zmm25, %zmm0, %zmm18
14088; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [0,1,18,11,4,21,14,7,8,25,10,0,28,13,0,15]
14089; AVX512DQ-BW-NEXT:    vpermw %zmm25, %zmm0, %zmm24
14090; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [0,17,10,3,4,21,14,7,24,9,0,11,28,13,0,31]
14091; AVX512DQ-BW-NEXT:    vpermw %zmm25, %zmm0, %zmm10
14092; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [16,17,10,3,20,13,6,23,24,25,0,27,28,0,30,31]
14093; AVX512DQ-BW-NEXT:    vpermw %zmm25, %zmm0, %zmm0
14094; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %ymm9
14095; AVX512DQ-BW-NEXT:    vmovdqa 32(%rdi), %ymm1
14096; AVX512DQ-BW-NEXT:    movw $-28382, %ax # imm = 0x9122
14097; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
14098; AVX512DQ-BW-NEXT:    vpblendmw %ymm1, %ymm9, %ymm3 {%k1}
14099; AVX512DQ-BW-NEXT:    kmovq %k1, %k2
14100; AVX512DQ-BW-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
14101; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm3, %xmm4
14102; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u,u,u,u,u]
14103; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u,u,u,u,u]
14104; AVX512DQ-BW-NEXT:    vporq %xmm4, %xmm3, %xmm16
14105; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
14106; AVX512DQ-BW-NEXT:    movw $992, %ax # imm = 0x3E0
14107; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
14108; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm0, %ymm16 {%k1}
14109; AVX512DQ-BW-NEXT:    vmovdqa 128(%rdi), %ymm11
14110; AVX512DQ-BW-NEXT:    vmovdqa 160(%rdi), %ymm6
14111; AVX512DQ-BW-NEXT:    movw $8772, %ax # imm = 0x2244
14112; AVX512DQ-BW-NEXT:    kmovd %eax, %k6
14113; AVX512DQ-BW-NEXT:    vpblendmw %ymm11, %ymm6, %ymm0 {%k6}
14114; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm0, %xmm3
14115; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u]
14116; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u]
14117; AVX512DQ-BW-NEXT:    vpor %xmm3, %xmm0, %xmm0
14118; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
14119; AVX512DQ-BW-NEXT:    vmovdqa 192(%rdi), %xmm7
14120; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm21 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
14121; AVX512DQ-BW-NEXT:    vpshufb %xmm21, %xmm7, %xmm3
14122; AVX512DQ-BW-NEXT:    vmovdqa 208(%rdi), %xmm8
14123; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm8[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
14124; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
14125; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
14126; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
14127; AVX512DQ-BW-NEXT:    vmovdqa64 240(%rdi), %xmm26
14128; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm26[5,12,u,u,u,u,u,u,u,u,u,u,u]
14129; AVX512DQ-BW-NEXT:    vmovdqa 224(%rdi), %xmm4
14130; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm12 = xmm4[0,7,14],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u]
14131; AVX512DQ-BW-NEXT:    vpor %xmm5, %xmm12, %xmm5
14132; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm5, %zmm0, %zmm0
14133; AVX512DQ-BW-NEXT:    movabsq $137438429184, %rax # imm = 0x1FFFF80000
14134; AVX512DQ-BW-NEXT:    kmovq %rax, %k5
14135; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm0, %zmm16 {%k5}
14136; AVX512DQ-BW-NEXT:    vmovdqa 288(%rdi), %ymm13
14137; AVX512DQ-BW-NEXT:    vmovdqa 256(%rdi), %ymm12
14138; AVX512DQ-BW-NEXT:    movw $9288, %ax # imm = 0x2448
14139; AVX512DQ-BW-NEXT:    kmovd %eax, %k3
14140; AVX512DQ-BW-NEXT:    vpblendmw %ymm13, %ymm12, %ymm0 {%k3}
14141; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u]
14142; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm0, %xmm0
14143; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u]
14144; AVX512DQ-BW-NEXT:    vporq %xmm5, %xmm0, %xmm19
14145; AVX512DQ-BW-NEXT:    vmovdqa64 352(%rdi), %ymm17
14146; AVX512DQ-BW-NEXT:    vmovdqa 320(%rdi), %ymm0
14147; AVX512DQ-BW-NEXT:    vpblendmw %ymm17, %ymm0, %ymm5 {%k6}
14148; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm15 = ymm5[2,3,0,1]
14149; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm15[2],ymm5[3,4,5],ymm15[6],ymm5[7,8,9],ymm15[10],ymm5[11,12,13],ymm15[14],ymm5[15]
14150; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
14151; AVX512DQ-BW-NEXT:    movw $3968, %ax # imm = 0xF80
14152; AVX512DQ-BW-NEXT:    kmovd %eax, %k7
14153; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm5, %ymm19 {%k7}
14154; AVX512DQ-BW-NEXT:    vmovdqa 416(%rdi), %ymm15
14155; AVX512DQ-BW-NEXT:    vmovdqa 384(%rdi), %ymm5
14156; AVX512DQ-BW-NEXT:    movw $4644, %ax # imm = 0x1224
14157; AVX512DQ-BW-NEXT:    kmovd %eax, %k4
14158; AVX512DQ-BW-NEXT:    vpblendmw %ymm15, %ymm5, %ymm20 {%k4}
14159; AVX512DQ-BW-NEXT:    vextracti32x4 $1, %ymm20, %xmm22
14160; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9]
14161; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero
14162; AVX512DQ-BW-NEXT:    vporq %xmm22, %xmm20, %xmm20
14163; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm20
14164; AVX512DQ-BW-NEXT:    movl $-8388608, %eax # imm = 0xFF800000
14165; AVX512DQ-BW-NEXT:    vpblendmw %ymm1, %ymm9, %ymm22 {%k4}
14166; AVX512DQ-BW-NEXT:    vextracti32x4 $1, %ymm22, %xmm23
14167; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u]
14168; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm22 = xmm22[1,8,15],zero,zero,xmm22[4,11],zero,zero,xmm22[u,u,u,u,u,u,u]
14169; AVX512DQ-BW-NEXT:    vporq %xmm23, %xmm22, %xmm22
14170; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
14171; AVX512DQ-BW-NEXT:    movl $511, %edi # imm = 0x1FF
14172; AVX512DQ-BW-NEXT:    kmovd %edi, %k1
14173; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm22, %ymm10 {%k1}
14174; AVX512DQ-BW-NEXT:    vpblendmw %ymm11, %ymm6, %ymm22 {%k3}
14175; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm23 = xmm22[u,u,u,6,13],zero,zero,xmm22[2,9],zero,zero,zero,xmm22[u,u,u,u]
14176; AVX512DQ-BW-NEXT:    vextracti32x4 $1, %ymm22, %xmm22
14177; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm22 = xmm22[u,u,u],zero,zero,xmm22[4,11],zero,zero,xmm22[0,7,14,u,u,u,u]
14178; AVX512DQ-BW-NEXT:    vporq %xmm23, %xmm22, %xmm22
14179; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm22, %ymm0, %ymm14
14180; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm22 = xmm8[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
14181; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm23 = xmm7[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
14182; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm22 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3]
14183; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm22, %ymm0, %ymm2
14184; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,6],ymm2[7]
14185; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm26[6,13,u,u,u,u,u,u,u,u,u,u,u]
14186; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm22 = xmm4[1,8,15],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u]
14187; AVX512DQ-BW-NEXT:    vporq %xmm14, %xmm22, %xmm14
14188; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm14, %zmm2, %zmm2
14189; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm2, %zmm10 {%k5}
14190; AVX512DQ-BW-NEXT:    vpblendmw %ymm1, %ymm9, %ymm2 {%k6}
14191; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u]
14192; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm2, %xmm2
14193; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u,u,u]
14194; AVX512DQ-BW-NEXT:    vpor %xmm2, %xmm14, %xmm2
14195; AVX512DQ-BW-NEXT:    movl $261632, %edi # imm = 0x3FE00
14196; AVX512DQ-BW-NEXT:    kmovd %edi, %k5
14197; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm2 {%k5} = ymm24[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
14198; AVX512DQ-BW-NEXT:    vpblendmw %ymm6, %ymm11, %ymm14 {%k2}
14199; AVX512DQ-BW-NEXT:    vextracti32x4 $1, %ymm14, %xmm22
14200; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm22 = xmm22[u,u],zero,zero,zero,xmm22[5,12],zero,zero,xmm22[1,8,15,u,u,u,u]
14201; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u]
14202; AVX512DQ-BW-NEXT:    vporq %xmm22, %xmm14, %xmm14
14203; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
14204; AVX512DQ-BW-NEXT:    vpshufb %xmm21, %xmm8, %xmm21
14205; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm22 = xmm7[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
14206; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm21 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3]
14207; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm21, %ymm0, %ymm3
14208; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5,6],ymm3[7]
14209; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm4[2,9],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u]
14210; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm21 = zero,zero,xmm26[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
14211; AVX512DQ-BW-NEXT:    vporq %xmm14, %xmm21, %xmm14
14212; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm14, %zmm3, %zmm23
14213; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm2, %zmm23 {%k1}
14214; AVX512DQ-BW-NEXT:    vpblendmw %ymm1, %ymm9, %ymm2 {%k3}
14215; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u]
14216; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm2, %xmm2
14217; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u]
14218; AVX512DQ-BW-NEXT:    vpor %xmm3, %xmm2, %xmm2
14219; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm2 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
14220; AVX512DQ-BW-NEXT:    vpblendmw %ymm6, %ymm11, %ymm3 {%k4}
14221; AVX512DQ-BW-NEXT:    vextracti32x4 $1, %ymm3, %xmm18
14222; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u],zero,zero,zero,xmm18[6,13],zero,zero,xmm18[2,9,u,u,u,u,u]
14223; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u,u,u]
14224; AVX512DQ-BW-NEXT:    vporq %xmm18, %xmm3, %xmm3
14225; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
14226; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm18 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[5,12]
14227; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm21 = xmm7[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
14228; AVX512DQ-BW-NEXT:    vporq %xmm18, %xmm21, %xmm18
14229; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm18, %ymm0, %ymm18
14230; AVX512DQ-BW-NEXT:    movl $-134217728, %edi # imm = 0xF8000000
14231; AVX512DQ-BW-NEXT:    kmovd %edi, %k2
14232; AVX512DQ-BW-NEXT:    kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
14233; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm18, %ymm3 {%k2}
14234; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm18 = xmm4[3,10],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u]
14235; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm21 = zero,zero,xmm26[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
14236; AVX512DQ-BW-NEXT:    vporq %xmm18, %xmm21, %xmm18
14237; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm18, %zmm3, %zmm18
14238; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm2, %zmm18 {%k1}
14239; AVX512DQ-BW-NEXT:    kmovd %eax, %k2
14240; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm20, %ymm19 {%k2}
14241; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm19, %zmm0, %zmm2
14242; AVX512DQ-BW-NEXT:    movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000
14243; AVX512DQ-BW-NEXT:    kmovq %rax, %k1
14244; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm2, %zmm16 {%k1}
14245; AVX512DQ-BW-NEXT:    vpblendmw %ymm12, %ymm13, %ymm2 {%k4}
14246; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm2, %xmm3
14247; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u]
14248; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u]
14249; AVX512DQ-BW-NEXT:    vpor %xmm3, %xmm2, %xmm2
14250; AVX512DQ-BW-NEXT:    vpblendmw %ymm17, %ymm0, %ymm3 {%k3}
14251; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1]
14252; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7,8,9,10],ymm14[11],ymm3[12,13],ymm14[14],ymm3[15]
14253; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
14254; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm3, %ymm2 {%k7}
14255; AVX512DQ-BW-NEXT:    vpblendmw %ymm15, %ymm5, %ymm3 {%k6}
14256; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero
14257; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm3, %xmm3
14258; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10]
14259; AVX512DQ-BW-NEXT:    vpor %xmm3, %xmm14, %xmm3
14260; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
14261; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm3, %ymm2 {%k2}
14262; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm2
14263; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm2, %zmm10 {%k1}
14264; AVX512DQ-BW-NEXT:    vpblendmw %ymm12, %ymm13, %ymm2 {%k6}
14265; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm2, %xmm3
14266; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u]
14267; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero,xmm2[u,u]
14268; AVX512DQ-BW-NEXT:    vpor %xmm3, %xmm2, %xmm2
14269; AVX512DQ-BW-NEXT:    vpblendmw %ymm0, %ymm17, %ymm3 {%k4}
14270; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1]
14271; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm3 = ymm14[0],ymm3[1,2],ymm14[3],ymm3[4,5,6],ymm14[7,8],ymm3[9,10],ymm14[11],ymm3[12,13,14],ymm14[15]
14272; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
14273; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm3, %ymm2 {%k7}
14274; AVX512DQ-BW-NEXT:    vpblendmw %ymm15, %ymm5, %ymm3 {%k3}
14275; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero
14276; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm3, %xmm3
14277; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11]
14278; AVX512DQ-BW-NEXT:    vpor %xmm3, %xmm14, %xmm3
14279; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
14280; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm3, %ymm2 {%k2}
14281; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm2
14282; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm2, %zmm23 {%k1}
14283; AVX512DQ-BW-NEXT:    vpblendmw %ymm12, %ymm13, %ymm2 {%k3}
14284; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u]
14285; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm2, %xmm2
14286; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u]
14287; AVX512DQ-BW-NEXT:    vpor %xmm3, %xmm2, %xmm2
14288; AVX512DQ-BW-NEXT:    vpblendmw %ymm0, %ymm17, %ymm3 {%k6}
14289; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1]
14290; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm3 = ymm14[0],ymm3[1,2,3],ymm14[4],ymm3[5,6],ymm14[7,8],ymm3[9,10,11],ymm14[12],ymm3[13,14],ymm14[15]
14291; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
14292; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm3, %ymm2 {%k7}
14293; AVX512DQ-BW-NEXT:    vpblendmw %ymm5, %ymm15, %ymm3 {%k4}
14294; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm3, %xmm14
14295; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12]
14296; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero
14297; AVX512DQ-BW-NEXT:    vpor %xmm3, %xmm14, %xmm3
14298; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
14299; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm3, %ymm2 {%k2}
14300; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm2
14301; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm2, %zmm18 {%k1}
14302; AVX512DQ-BW-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
14303; AVX512DQ-BW-NEXT:    vpblendmw %ymm13, %ymm12, %ymm2 {%k1}
14304; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm2, %xmm3
14305; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u]
14306; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u]
14307; AVX512DQ-BW-NEXT:    vporq %xmm3, %xmm2, %xmm19
14308; AVX512DQ-BW-NEXT:    vpblendmw %ymm0, %ymm17, %ymm2 {%k3}
14309; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
14310; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14,15]
14311; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
14312; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm2, %ymm19 {%k7}
14313; AVX512DQ-BW-NEXT:    vpblendmw %ymm13, %ymm12, %ymm2 {%k4}
14314; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm2, %xmm3
14315; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u]
14316; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u]
14317; AVX512DQ-BW-NEXT:    vpor %xmm3, %xmm2, %xmm2
14318; AVX512DQ-BW-NEXT:    vpblendmw %ymm17, %ymm0, %ymm3 {%k1}
14319; AVX512DQ-BW-NEXT:    kmovq %k1, %k7
14320; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1]
14321; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm14[1],ymm3[2,3,4],ymm14[5],ymm3[6,7,8],ymm14[9],ymm3[10,11,12],ymm14[13],ymm3[14,15]
14322; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm20 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
14323; AVX512DQ-BW-NEXT:    movl $8176, %eax # imm = 0x1FF0
14324; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
14325; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm2, %ymm20 {%k1}
14326; AVX512DQ-BW-NEXT:    vpblendmw %ymm5, %ymm15, %ymm2 {%k6}
14327; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm2, %xmm3
14328; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13]
14329; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero
14330; AVX512DQ-BW-NEXT:    vpor %xmm3, %xmm2, %xmm2
14331; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
14332; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm2, %ymm19 {%k2}
14333; AVX512DQ-BW-NEXT:    vpblendmw %ymm5, %ymm15, %ymm2 {%k3}
14334; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero
14335; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm2, %xmm2
14336; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14]
14337; AVX512DQ-BW-NEXT:    vpor %xmm3, %xmm2, %xmm2
14338; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
14339; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm2, %ymm20 {%k2}
14340; AVX512DQ-BW-NEXT:    vpblendmw %ymm6, %ymm11, %ymm22 {%k6}
14341; AVX512DQ-BW-NEXT:    vpblendmw %ymm9, %ymm1, %ymm21 {%k6}
14342; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm17, %ymm0 {%k4}
14343; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
14344; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15]
14345; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm13, %ymm12 {%k6}
14346; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm12[u,u,u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u]
14347; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm12, %xmm3
14348; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u]
14349; AVX512DQ-BW-NEXT:    vpor %xmm0, %xmm3, %xmm0
14350; AVX512DQ-BW-NEXT:    movl $4186112, %eax # imm = 0x3FE000
14351; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
14352; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u]
14353; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm15, %ymm5 {%k7}
14354; AVX512DQ-BW-NEXT:    vpblendmw %ymm9, %ymm1, %ymm2 {%k4}
14355; AVX512DQ-BW-NEXT:    vpblendmw %ymm6, %ymm11, %ymm3 {%k3}
14356; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm9, %ymm1 {%k3}
14357; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm11, %ymm6 {%k4}
14358; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm9 = xmm22[u,u,2,9],zero,zero,zero,xmm22[5,12],zero,zero,xmm22[u,u,u,u,u]
14359; AVX512DQ-BW-NEXT:    vextracti32x4 $1, %ymm22, %xmm11
14360; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[0,7,14],zero,zero,xmm11[3,10,u,u,u,u,u]
14361; AVX512DQ-BW-NEXT:    vpor %xmm9, %xmm11, %xmm9
14362; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
14363; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm11 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[6,13]
14364; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm12 = xmm7[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
14365; AVX512DQ-BW-NEXT:    vpor %xmm11, %xmm12, %xmm11
14366; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
14367; AVX512DQ-BW-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
14368; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm11, %ymm9 {%k1}
14369; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm11 = xmm3[u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[u,u,u,u,u]
14370; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm3, %xmm3
14371; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11,u,u,u,u,u]
14372; AVX512DQ-BW-NEXT:    vpor %xmm3, %xmm11, %xmm3
14373; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
14374; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
14375; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14]
14376; AVX512DQ-BW-NEXT:    vpor %xmm11, %xmm12, %xmm11
14377; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
14378; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm11, %ymm3 {%k1}
14379; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm6, %xmm11
14380; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u]
14381; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u]
14382; AVX512DQ-BW-NEXT:    vpor %xmm6, %xmm11, %xmm6
14383; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
14384; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
14385; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[1,8,15]
14386; AVX512DQ-BW-NEXT:    vpor %xmm7, %xmm8, %xmm7
14387; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
14388; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm7, %ymm6 {%k1}
14389; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [16,9,2,19,20,13,6,23,24,0,26,27,28,0,30,31]
14390; AVX512DQ-BW-NEXT:    vpermw %zmm25, %zmm7, %zmm7
14391; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [16,9,2,19,12,5,22,23,24,0,26,27,0,29,30,31]
14392; AVX512DQ-BW-NEXT:    vpermw %zmm25, %zmm8, %zmm8
14393; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [8,1,2,19,12,5,22,15,0,9,26,11,0,29,14,0]
14394; AVX512DQ-BW-NEXT:    vpermw %zmm25, %zmm11, %zmm11
14395; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm2, %xmm12
14396; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u]
14397; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u]
14398; AVX512DQ-BW-NEXT:    vpor %xmm2, %xmm12, %xmm2
14399; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm2 {%k5} = ymm11[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
14400; AVX512DQ-BW-NEXT:    vpbroadcastw {{.*#+}} xmm11 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
14401; AVX512DQ-BW-NEXT:    vpshufb %xmm11, %xmm4, %xmm12
14402; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm26[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
14403; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
14404; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm12, %zmm9, %zmm9
14405; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm9, %zmm2 {%k5}
14406; AVX512DQ-BW-NEXT:    movw $-512, %ax # imm = 0xFE00
14407; AVX512DQ-BW-NEXT:    vextracti32x4 $1, %ymm21, %xmm9
14408; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u,u,u,u,u,u]
14409; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm12 = xmm21[5,12],zero,zero,xmm21[1,8,15],zero,zero,xmm21[u,u,u,u,u,u,u]
14410; AVX512DQ-BW-NEXT:    vpor %xmm9, %xmm12, %xmm9
14411; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm9 {%k5} = ymm8[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
14412; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm8 = xmm26[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
14413; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm12 = xmm4[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
14414; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3]
14415; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm8, %zmm3, %zmm3
14416; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm3, %zmm9 {%k5}
14417; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u]
14418; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm1, %xmm1
14419; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u]
14420; AVX512DQ-BW-NEXT:    vpor %xmm3, %xmm1, %xmm1
14421; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm1 {%k5} = ymm7[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
14422; AVX512DQ-BW-NEXT:    vpshufb %xmm11, %xmm26, %xmm3
14423; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
14424; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
14425; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm6, %zmm3
14426; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm3, %zmm1 {%k5}
14427; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
14428; AVX512DQ-BW-NEXT:    vinserti32x8 $1, %ymm19, %zmm0, %zmm2 {%k1}
14429; AVX512DQ-BW-NEXT:    vinserti32x8 $1, %ymm20, %zmm0, %zmm9 {%k1}
14430; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm5, %xmm3
14431; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15]
14432; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,0,7,14],zero,zero,xmm5[3,10],zero,zero,zero
14433; AVX512DQ-BW-NEXT:    vpor %xmm3, %xmm4, %xmm3
14434; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
14435; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15]
14436; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
14437; AVX512DQ-BW-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1}
14438; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
14439; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
14440; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm16, (%rsi)
14441; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm10, (%rdx)
14442; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm23, (%rcx)
14443; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm18, (%r8)
14444; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, (%r9)
14445; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, (%rdi)
14446; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, (%rax)
14447; AVX512DQ-BW-NEXT:    vzeroupper
14448; AVX512DQ-BW-NEXT:    retq
14449;
14450; AVX512DQ-BW-FCP-LABEL: load_i8_stride7_vf64:
14451; AVX512DQ-BW-FCP:       # %bb.0:
14452; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm0
14453; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm2
14454; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [24,17,2,19,28,21,6,31,16,9,26,27,20,13,30,23]
14455; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm24
14456; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7]
14457; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm2, %zmm1, %zmm13
14458; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23]
14459; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm25
14460; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15]
14461; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm2, %zmm1, %zmm12
14462; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,1,2,11,4,5,14,7,8,9,26,19,12,29,22,15]
14463; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm16
14464; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31]
14465; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm2, %zmm1, %zmm8
14466; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,1,10,3,4,5,14,7,8,25,18,11,12,29,22,15]
14467; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm5
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31]
; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm2, %zmm1, %zmm4
; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %ymm10
; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm3
; AVX512DQ-BW-FCP-NEXT:    movw $-28382, %ax # imm = 0x9122
; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm3, %ymm10, %ymm1 {%k1}
; AVX512DQ-BW-FCP-NEXT:    kmovq %k1, %k2
; AVX512DQ-BW-FCP-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm6
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vpor %xmm6, %xmm1, %xmm1
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    movw $992, %ax # imm = 0x3E0
; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm4, %ymm1 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm11
; AVX512DQ-BW-FCP-NEXT:    vmovdqa 160(%rdi), %ymm9
; AVX512DQ-BW-FCP-NEXT:    movw $8772, %ax # imm = 0x2244
; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm11, %ymm9, %ymm4 {%k1}
; AVX512DQ-BW-FCP-NEXT:    kmovq %k1, %k3
; AVX512DQ-BW-FCP-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm6
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,xmm6[3,10],zero,zero,zero,xmm6[6,13,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,5,12],zero,zero,xmm4[1,8,15],zero,zero,xmm4[u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vpor %xmm6, %xmm4, %xmm4
; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,1,2,4,6]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm17
; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm17, %ymm6, %ymm6
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 240(%rdi), %xmm19
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm19[5,12,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 224(%rdi), %xmm20
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm20[0,7,14],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vpor %xmm6, %xmm7, %xmm6
; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm6, %zmm4, %zmm4
; AVX512DQ-BW-FCP-NEXT:    movabsq $137438429184, %rax # imm = 0x1FFFF80000
; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k5
; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm4, %zmm1 {%k5}
; AVX512DQ-BW-FCP-NEXT:    vmovdqa 288(%rdi), %ymm6
; AVX512DQ-BW-FCP-NEXT:    vmovdqa 256(%rdi), %ymm4
; AVX512DQ-BW-FCP-NEXT:    movw $9288, %ax # imm = 0x2448
; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k6
; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm6, %ymm4, %ymm7 {%k6}
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm7[u,u,u,u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[u,u]
; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm7
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11,u,u]
; AVX512DQ-BW-FCP-NEXT:    vporq %xmm18, %xmm7, %xmm21
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    movw $3968, %ax # imm = 0xF80
; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k7
; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm5, %ymm21 {%k7}
; AVX512DQ-BW-FCP-NEXT:    vmovdqa 416(%rdi), %ymm7
; AVX512DQ-BW-FCP-NEXT:    vmovdqa 384(%rdi), %ymm5
; AVX512DQ-BW-FCP-NEXT:    movw $4644, %ax # imm = 0x1224
; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k4
; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm7, %ymm5, %ymm18 {%k4}
; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm18, %xmm22
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9]
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm18[4,11],zero,zero
; AVX512DQ-BW-FCP-NEXT:    vporq %xmm22, %xmm18, %xmm18
; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm18, %ymm0, %ymm22
; AVX512DQ-BW-FCP-NEXT:    movl $-8388608, %eax # imm = 0xFF800000
; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm3, %ymm10, %ymm18 {%k4}
; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm18, %xmm23
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[1,8,15],zero,zero,xmm18[4,11],zero,zero,xmm18[u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vporq %xmm23, %xmm18, %xmm18
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    movl $511, %r10d # imm = 0x1FF
; AVX512DQ-BW-FCP-NEXT:    kmovd %r10d, %k1
; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm18, %ymm8 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm11, %ymm9, %ymm18 {%k6}
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm23 = xmm18[u,u,u,6,13],zero,zero,xmm18[2,9],zero,zero,zero,xmm18[u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm18, %xmm18
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u,u],zero,zero,xmm18[4,11],zero,zero,xmm18[0,7,14,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vporq %xmm23, %xmm18, %xmm18
; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm18, %ymm0, %ymm14
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm18 = [0,0,0,0,1,3,4,6]
; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm17, %ymm18, %ymm18
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7]
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm19[6,13,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm20[1,8,15],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vporq %xmm15, %xmm18, %xmm15
; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm15, %zmm14, %zmm14
; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm8 {%k5}
; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm3, %ymm10, %ymm14 {%k3}
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm14[2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm14
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[0,7,14],zero,zero,xmm14[3,10,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
; AVX512DQ-BW-FCP-NEXT:    movl $261632, %r10d # imm = 0x3FE00
; AVX512DQ-BW-FCP-NEXT:    kmovd %r10d, %k5
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 {%k5} = ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm9, %ymm11, %ymm12 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm15
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[5,12],zero,zero,xmm15[1,8,15,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vpor %xmm15, %xmm12, %xmm12
; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm15 = [0,0,0,0,1,3,5,6]
; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm17, %ymm15, %ymm15
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm15[7]
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = zero,zero,xmm19[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vporq %xmm15, %xmm17, %xmm15
; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm15, %zmm12, %zmm12
; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm14, %zmm12 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm3, %ymm10, %ymm14 {%k6}
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm14
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 {%k5} = ymm13[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm9, %ymm11, %ymm13 {%k4}
; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm15
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vpor %xmm15, %xmm13, %xmm13
; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 208(%rdi), %xmm17
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm17[5,12]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %xmm18
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm23 = xmm18[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
; AVX512DQ-BW-FCP-NEXT:    vporq %xmm15, %xmm23, %xmm15
; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
; AVX512DQ-BW-FCP-NEXT:    movl $-134217728, %r10d # imm = 0xF8000000
; AVX512DQ-BW-FCP-NEXT:    kmovd %r10d, %k2
; AVX512DQ-BW-FCP-NEXT:    kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm15, %ymm13 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm20[3,10],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = zero,zero,xmm19[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vporq %xmm15, %xmm19, %xmm15
; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm15, %zmm13, %zmm13
; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm14, %zmm13 {%k1}
; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k3
; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm22, %ymm21 {%k3}
; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm21, %zmm0, %zmm14
; AVX512DQ-BW-FCP-NEXT:    movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000
; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k2
; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm1 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm4, %ymm6, %ymm14 {%k4}
; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm15
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u]
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u]
; AVX512DQ-BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm15, %ymm14 {%k7}
; AVX512DQ-BW-FCP-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm7, %ymm5, %ymm15 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm15[5,12],zero,zero
; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm15
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[0,7,14],zero,zero,xmm15[3,10]
; AVX512DQ-BW-FCP-NEXT:    vporq %xmm16, %xmm15, %xmm15
; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm15, %ymm14 {%k3}
; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm0, %zmm14
; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm8 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm4, %ymm6, %ymm14 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm15
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u]
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u]
; AVX512DQ-BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm25[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm15, %ymm14 {%k7}
; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm7, %ymm5, %ymm15 {%k6}
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm15[6,13],zero,zero
; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm15
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[1,8,15],zero,zero,xmm15[4,11]
; AVX512DQ-BW-FCP-NEXT:    vporq %xmm16, %xmm15, %xmm15
; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm15, %ymm14 {%k3}
; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm0, %zmm14
; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm12 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm4, %ymm6, %ymm14 {%k6}
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,6,13],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[u,u]
; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm14
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u],zero,zero,xmm14[4,11],zero,zero,xmm14[0,7,14,u,u]
; AVX512DQ-BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm24[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm15, %ymm14 {%k7}
; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm5, %ymm7, %ymm15 {%k4}
; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm15, %xmm16
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[2,9],zero,zero,zero,xmm16[5,12]
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u,4,11],zero,zero,xmm15[0,7,14],zero,zero
; AVX512DQ-BW-FCP-NEXT:    vporq %xmm16, %xmm15, %xmm15
; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm15, %ymm14 {%k3}
; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm0, %zmm14
; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm13 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31]
; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm14, %zmm15
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31]
; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm14, %zmm16
; AVX512DQ-BW-FCP-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm6, %ymm4, %ymm14 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm14, %xmm19
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[5,12],zero,zero,xmm19[1,8,15,u,u]
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u]
; AVX512DQ-BW-FCP-NEXT:    vporq %xmm19, %xmm14, %xmm14
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm16, %ymm14 {%k7}
; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm6, %ymm4, %ymm16 {%k4}
; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm19
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,1,8,15],zero,zero,xmm16[4,11],zero,zero,xmm16[u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vporq %xmm19, %xmm16, %xmm16
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    movl $8176, %eax # imm = 0x1FF0
; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm16, %ymm15 {%k1}
; AVX512DQ-BW-FCP-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm5, %ymm7, %ymm16 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm19
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u],zero,zero,xmm19[3,10],zero,zero,zero,xmm19[6,13]
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,5,12],zero,zero,xmm16[1,8,15],zero,zero
; AVX512DQ-BW-FCP-NEXT:    vporq %xmm19, %xmm16, %xmm16
; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm16, %ymm0, %ymm16
; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm16, %ymm14 {%k3}
; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm5, %ymm7, %ymm16 {%k6}
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm16[u,u,u,u,u,u,u,6,13],zero,zero,xmm16[2,9],zero,zero,zero
; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm16
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[4,11],zero,zero,xmm16[0,7,14]
; AVX512DQ-BW-FCP-NEXT:    vporq %xmm19, %xmm16, %xmm16
; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm16, %ymm0, %ymm16
; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm16, %ymm15 {%k3}
; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm10, %ymm3, %ymm19 {%k4}
; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm10, %ymm3, %ymm16 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm9, %ymm11, %ymm20 {%k6}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm10, %ymm3 {%k6}
; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm9, %ymm11, %ymm10 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm11, %ymm9 {%k4}
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm10[u,u,2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm10
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[0,7,14],zero,zero,xmm10[3,10,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vpor %xmm11, %xmm10, %xmm10
; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm17[6,13]
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = xmm18[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
; AVX512DQ-BW-FCP-NEXT:    vporq %xmm11, %xmm21, %xmm11
; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
; AVX512DQ-BW-FCP-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm11, %ymm10 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm20[u,u,3,10],zero,zero,zero,xmm20[6,13],zero,zero,xmm20[u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm20, %xmm20
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,xmm20[1,8,15],zero,zero,xmm20[4,11,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vporq %xmm11, %xmm20, %xmm11
; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm20 = xmm18[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[0,7,14]
; AVX512DQ-BW-FCP-NEXT:    vporq %xmm20, %xmm21, %xmm20
; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm20
; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm20, %ymm11 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm9, %xmm20
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,xmm20[2,9],zero,zero,zero,xmm20[5,12,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,4,11],zero,zero,xmm9[0,7,14],zero,zero,xmm9[u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vporq %xmm20, %xmm9, %xmm9
; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15]
; AVX512DQ-BW-FCP-NEXT:    vporq %xmm18, %xmm17, %xmm17
; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm17, %ymm0, %ymm17
; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm17, %ymm9 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm17 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31]
; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm2, %zmm17, %zmm17
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm18 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31]
; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm2, %zmm18, %zmm18
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm20 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7]
; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm2, %zmm20, %zmm20
; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm19, %xmm2
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm19[4,11],zero,zero,xmm19[0,7,14],zero,zero,xmm19[u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vporq %xmm2, %xmm19, %xmm2
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm2 {%k5} = ymm20[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm19 = [1,2,4,6,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 224(%rdi), %ymm20
; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm20, %ymm19, %ymm19
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm19[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm19, %zmm10, %zmm10
; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm10, %zmm2 {%k5}
; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm10
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[3,10],zero,zero,zero,xmm10[6,13,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[5,12],zero,zero,xmm16[1,8,15],zero,zero,xmm16[u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vporq %xmm10, %xmm16, %xmm10
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm10 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [1,3,4,6,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm20, %ymm16, %ymm16
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm16, %zmm11, %zmm11
; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm11, %zmm10 {%k5}
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm3
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vpor %xmm3, %xmm11, %xmm3
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm3 {%k5} = ymm17[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [1,3,5,6,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm20, %ymm11, %ymm11
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm11, %zmm9, %zmm9
; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm9, %zmm3 {%k5}
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm9 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15]
; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm9, %zmm0
; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm6, %ymm4 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm4
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    vpor %xmm6, %xmm4, %xmm4
; AVX512DQ-BW-FCP-NEXT:    movl $4186112, %eax # imm = 0x3FE000
; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 {%k1} = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT:    movw $-512, %ax # imm = 0xFE00
; AVX512DQ-BW-FCP-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm7, %ymm5 {%k1}
; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
; AVX512DQ-BW-FCP-NEXT:    vinserti32x8 $1, %ymm14, %zmm0, %zmm2 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vinserti32x8 $1, %ymm15, %zmm0, %zmm10 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm0
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15]
; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,0,7,14],zero,zero,xmm5[3,10],zero,zero,zero
; AVX512DQ-BW-FCP-NEXT:    vpor %xmm0, %xmm5, %xmm0
; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15]
; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm3 {%k1}
; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, (%rsi)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, (%rdx)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm12, (%rcx)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm13, (%r8)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, (%r9)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, (%rdi)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, (%rax)
; AVX512DQ-BW-FCP-NEXT:    vzeroupper
; AVX512DQ-BW-FCP-NEXT:    retq
  %wide.vec = load <448 x i8>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <448 x i8> %wide.vec, <448 x i8> poison, <64 x i32> <i32 0, i32 7, i32 14, i32 21, i32 28, i32 35, i32 42, i32 49, i32 56, i32 63, i32 70, i32 77, i32 84, i32 91, i32 98, i32 105, i32 112, i32 119, i32 126, i32 133, i32 140, i32 147, i32 154, i32 161, i32 168, i32 175, i32 182, i32 189, i32 196, i32 203, i32 210, i32 217, i32 224, i32 231, i32 238, i32 245, i32 252, i32 259, i32 266, i32 273, i32 280, i32 287, i32 294, i32 301, i32 308, i32 315, i32 322, i32 329, i32 336, i32 343, i32 350, i32 357, i32 364, i32 371, i32 378, i32 385, i32 392, i32 399, i32 406, i32 413, i32 420, i32 427, i32 434, i32 441>
  %strided.vec1 = shufflevector <448 x i8> %wide.vec, <448 x i8> poison, <64 x i32> <i32 1, i32 8, i32 15, i32 22, i32 29, i32 36, i32 43, i32 50, i32 57, i32 64, i32 71, i32 78, i32 85, i32 92, i32 99, i32 106, i32 113, i32 120, i32 127, i32 134, i32 141, i32 148, i32 155, i32 162, i32 169, i32 176, i32 183, i32 190, i32 197, i32 204, i32 211, i32 218, i32 225, i32 232, i32 239, i32 246, i32 253, i32 260, i32 267, i32 274, i32 281, i32 288, i32 295, i32 302, i32 309, i32 316, i32 323, i32 330, i32 337, i32 344, i32 351, i32 358, i32 365, i32 372, i32 379, i32 386, i32 393, i32 400, i32 407, i32 414, i32 421, i32 428, i32 435, i32 442>
  %strided.vec2 = shufflevector <448 x i8> %wide.vec, <448 x i8> poison, <64 x i32> <i32 2, i32 9, i32 16, i32 23, i32 30, i32 37, i32 44, i32 51, i32 58, i32 65, i32 72, i32 79, i32 86, i32 93, i32 100, i32 107, i32 114, i32 121, i32 128, i32 135, i32 142, i32 149, i32 156, i32 163, i32 170, i32 177, i32 184, i32 191, i32 198, i32 205, i32 212, i32 219, i32 226, i32 233, i32 240, i32 247, i32 254, i32 261, i32 268, i32 275, i32 282, i32 289, i32 296, i32 303, i32 310, i32 317, i32 324, i32 331, i32 338, i32 345, i32 352, i32 359, i32 366, i32 373, i32 380, i32 387, i32 394, i32 401, i32 408, i32 415, i32 422, i32 429, i32 436, i32 443>
  %strided.vec3 = shufflevector <448 x i8> %wide.vec, <448 x i8> poison, <64 x i32> <i32 3, i32 10, i32 17, i32 24, i32 31, i32 38, i32 45, i32 52, i32 59, i32 66, i32 73, i32 80, i32 87, i32 94, i32 101, i32 108, i32 115, i32 122, i32 129, i32 136, i32 143, i32 150, i32 157, i32 164, i32 171, i32 178, i32 185, i32 192, i32 199, i32 206, i32 213, i32 220, i32 227, i32 234, i32 241, i32 248, i32 255, i32 262, i32 269, i32 276, i32 283, i32 290, i32 297, i32 304, i32 311, i32 318, i32 325, i32 332, i32 339, i32 346, i32 353, i32 360, i32 367, i32 374, i32 381, i32 388, i32 395, i32 402, i32 409, i32 416, i32 423, i32 430, i32 437, i32 444>
  %strided.vec4 = shufflevector <448 x i8> %wide.vec, <448 x i8> poison, <64 x i32> <i32 4, i32 11, i32 18, i32 25, i32 32, i32 39, i32 46, i32 53, i32 60, i32 67, i32 74, i32 81, i32 88, i32 95, i32 102, i32 109, i32 116, i32 123, i32 130, i32 137, i32 144, i32 151, i32 158, i32 165, i32 172, i32 179, i32 186, i32 193, i32 200, i32 207, i32 214, i32 221, i32 228, i32 235, i32 242, i32 249, i32 256, i32 263, i32 270, i32 277, i32 284, i32 291, i32 298, i32 305, i32 312, i32 319, i32 326, i32 333, i32 340, i32 347, i32 354, i32 361, i32 368, i32 375, i32 382, i32 389, i32 396, i32 403, i32 410, i32 417, i32 424, i32 431, i32 438, i32 445>
  %strided.vec5 = shufflevector <448 x i8> %wide.vec, <448 x i8> poison, <64 x i32> <i32 5, i32 12, i32 19, i32 26, i32 33, i32 40, i32 47, i32 54, i32 61, i32 68, i32 75, i32 82, i32 89, i32 96, i32 103, i32 110, i32 117, i32 124, i32 131, i32 138, i32 145, i32 152, i32 159, i32 166, i32 173, i32 180, i32 187, i32 194, i32 201, i32 208, i32 215, i32 222, i32 229, i32 236, i32 243, i32 250, i32 257, i32 264, i32 271, i32 278, i32 285, i32 292, i32 299, i32 306, i32 313, i32 320, i32 327, i32 334, i32 341, i32 348, i32 355, i32 362, i32 369, i32 376, i32 383, i32 390, i32 397, i32 404, i32 411, i32 418, i32 425, i32 432, i32 439, i32 446>
  %strided.vec6 = shufflevector <448 x i8> %wide.vec, <448 x i8> poison, <64 x i32> <i32 6, i32 13, i32 20, i32 27, i32 34, i32 41, i32 48, i32 55, i32 62, i32 69, i32 76, i32 83, i32 90, i32 97, i32 104, i32 111, i32 118, i32 125, i32 132, i32 139, i32 146, i32 153, i32 160, i32 167, i32 174, i32 181, i32 188, i32 195, i32 202, i32 209, i32 216, i32 223, i32 230, i32 237, i32 244, i32 251, i32 258, i32 265, i32 272, i32 279, i32 286, i32 293, i32 300, i32 307, i32 314, i32 321, i32 328, i32 335, i32 342, i32 349, i32 356, i32 363, i32 370, i32 377, i32 384, i32 391, i32 398, i32 405, i32 412, i32 419, i32 426, i32 433, i32 440, i32 447>
  store <64 x i8> %strided.vec0, ptr %out.vec0, align 64
  store <64 x i8> %strided.vec1, ptr %out.vec1, align 64
  store <64 x i8> %strided.vec2, ptr %out.vec2, align 64
  store <64 x i8> %strided.vec3, ptr %out.vec3, align 64
  store <64 x i8> %strided.vec4, ptr %out.vec4, align 64
  store <64 x i8> %strided.vec5, ptr %out.vec5, align 64
  store <64 x i8> %strided.vec6, ptr %out.vec6, align 64
  ret void
}