; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx  | FileCheck %s --check-prefixes=AVX
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP

; These patterns are produced by LoopVectorizer for interleaved loads.

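; As an illustrative sketch only (the struct, function, and variable names here
; are hypothetical), a scalar loop of the following shape is the kind of source
; the LoopVectorizer turns into the wide-load-plus-shufflevector IR at the end
; of each test function below:
;
;   struct S { char a, b, c, d, e; };   /* five interleaved i8 fields */
;   void split(struct S *in, char *o0, char *o1, char *o2, char *o3, char *o4, int n) {
;     for (int i = 0; i < n; ++i) {     /* one stride-5 access per field */
;       o0[i] = in[i].a; o1[i] = in[i].b; o2[i] = in[i].c;
;       o3[i] = in[i].d; o4[i] = in[i].e;
;     }
;   }
;
; At vectorization factor VF the interleave group becomes a single <5*VF x i8>
; load followed by five shufflevectors selecting elements {k, k+5, k+10, ...}
; for k = 0..4, e.g. the <10 x i8> load and the <2 x i32> <i32 0, i32 5> mask
; in load_i8_stride5_vf2.
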
18define void @load_i8_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind {
19; SSE-LABEL: load_i8_stride5_vf2:
20; SSE:       # %bb.0:
21; SSE-NEXT:    movdqa (%rdi), %xmm1
22; SSE-NEXT:    pxor %xmm2, %xmm2
23; SSE-NEXT:    movdqa %xmm1, %xmm0
24; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
25; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
26; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
27; SSE-NEXT:    packuswb %xmm3, %xmm3
28; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[0,3,2,3]
29; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7]
30; SSE-NEXT:    packuswb %xmm4, %xmm4
31; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[3,1,2,3]
32; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7]
33; SSE-NEXT:    packuswb %xmm5, %xmm5
34; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[2,3,2,3]
35; SSE-NEXT:    psrlq $48, %xmm0
36; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
37; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
38; SSE-NEXT:    packuswb %xmm0, %xmm0
39; SSE-NEXT:    psrld $16, %xmm1
40; SSE-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3]
41; SSE-NEXT:    packuswb %xmm6, %xmm6
42; SSE-NEXT:    movd %xmm3, %eax
43; SSE-NEXT:    movw %ax, (%rsi)
44; SSE-NEXT:    movd %xmm4, %eax
45; SSE-NEXT:    movw %ax, (%rdx)
46; SSE-NEXT:    movd %xmm5, %eax
47; SSE-NEXT:    movw %ax, (%rcx)
48; SSE-NEXT:    movd %xmm0, %eax
49; SSE-NEXT:    movw %ax, (%r8)
50; SSE-NEXT:    movd %xmm6, %eax
51; SSE-NEXT:    movw %ax, (%r9)
52; SSE-NEXT:    retq
53;
54; AVX-LABEL: load_i8_stride5_vf2:
55; AVX:       # %bb.0:
56; AVX-NEXT:    vmovdqa (%rdi), %xmm0
57; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
58; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[1,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
59; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[2,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
60; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[3,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
61; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
62; AVX-NEXT:    vpextrw $0, %xmm1, (%rsi)
63; AVX-NEXT:    vpextrw $0, %xmm2, (%rdx)
64; AVX-NEXT:    vpextrw $0, %xmm3, (%rcx)
65; AVX-NEXT:    vpextrw $0, %xmm4, (%r8)
66; AVX-NEXT:    vpextrw $0, %xmm0, (%r9)
67; AVX-NEXT:    retq
68;
69; AVX2-LABEL: load_i8_stride5_vf2:
70; AVX2:       # %bb.0:
71; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
72; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
73; AVX2-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[1,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
74; AVX2-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[2,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
75; AVX2-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[3,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
76; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
77; AVX2-NEXT:    vpextrw $0, %xmm1, (%rsi)
78; AVX2-NEXT:    vpextrw $0, %xmm2, (%rdx)
79; AVX2-NEXT:    vpextrw $0, %xmm3, (%rcx)
80; AVX2-NEXT:    vpextrw $0, %xmm4, (%r8)
81; AVX2-NEXT:    vpextrw $0, %xmm0, (%r9)
82; AVX2-NEXT:    retq
83;
84; AVX2-FP-LABEL: load_i8_stride5_vf2:
85; AVX2-FP:       # %bb.0:
86; AVX2-FP-NEXT:    vmovdqa (%rdi), %xmm0
87; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
88; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[1,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
89; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[2,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
90; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[3,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
91; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
92; AVX2-FP-NEXT:    vpextrw $0, %xmm1, (%rsi)
93; AVX2-FP-NEXT:    vpextrw $0, %xmm2, (%rdx)
94; AVX2-FP-NEXT:    vpextrw $0, %xmm3, (%rcx)
95; AVX2-FP-NEXT:    vpextrw $0, %xmm4, (%r8)
96; AVX2-FP-NEXT:    vpextrw $0, %xmm0, (%r9)
97; AVX2-FP-NEXT:    retq
98;
99; AVX2-FCP-LABEL: load_i8_stride5_vf2:
100; AVX2-FCP:       # %bb.0:
101; AVX2-FCP-NEXT:    vmovdqa (%rdi), %xmm0
102; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
103; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[1,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
104; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[2,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
105; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[3,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
106; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
107; AVX2-FCP-NEXT:    vpextrw $0, %xmm1, (%rsi)
108; AVX2-FCP-NEXT:    vpextrw $0, %xmm2, (%rdx)
109; AVX2-FCP-NEXT:    vpextrw $0, %xmm3, (%rcx)
110; AVX2-FCP-NEXT:    vpextrw $0, %xmm4, (%r8)
111; AVX2-FCP-NEXT:    vpextrw $0, %xmm0, (%r9)
112; AVX2-FCP-NEXT:    retq
113;
114; AVX512-LABEL: load_i8_stride5_vf2:
115; AVX512:       # %bb.0:
116; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
117; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
118; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[1,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
119; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[2,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
120; AVX512-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[3,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
121; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
122; AVX512-NEXT:    vpextrw $0, %xmm1, (%rsi)
123; AVX512-NEXT:    vpextrw $0, %xmm2, (%rdx)
124; AVX512-NEXT:    vpextrw $0, %xmm3, (%rcx)
125; AVX512-NEXT:    vpextrw $0, %xmm4, (%r8)
126; AVX512-NEXT:    vpextrw $0, %xmm0, (%r9)
127; AVX512-NEXT:    retq
128;
129; AVX512-FCP-LABEL: load_i8_stride5_vf2:
130; AVX512-FCP:       # %bb.0:
131; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm0
132; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
133; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[1,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
134; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[2,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
135; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[3,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
136; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
137; AVX512-FCP-NEXT:    vpextrw $0, %xmm1, (%rsi)
138; AVX512-FCP-NEXT:    vpextrw $0, %xmm2, (%rdx)
139; AVX512-FCP-NEXT:    vpextrw $0, %xmm3, (%rcx)
140; AVX512-FCP-NEXT:    vpextrw $0, %xmm4, (%r8)
141; AVX512-FCP-NEXT:    vpextrw $0, %xmm0, (%r9)
142; AVX512-FCP-NEXT:    retq
143;
144; AVX512DQ-LABEL: load_i8_stride5_vf2:
145; AVX512DQ:       # %bb.0:
146; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
147; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
148; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[1,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
149; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[2,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
150; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[3,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
151; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
152; AVX512DQ-NEXT:    vpextrw $0, %xmm1, (%rsi)
153; AVX512DQ-NEXT:    vpextrw $0, %xmm2, (%rdx)
154; AVX512DQ-NEXT:    vpextrw $0, %xmm3, (%rcx)
155; AVX512DQ-NEXT:    vpextrw $0, %xmm4, (%r8)
156; AVX512DQ-NEXT:    vpextrw $0, %xmm0, (%r9)
157; AVX512DQ-NEXT:    retq
158;
159; AVX512DQ-FCP-LABEL: load_i8_stride5_vf2:
160; AVX512DQ-FCP:       # %bb.0:
161; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm0
162; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
163; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[1,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
164; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[2,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
165; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[3,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
166; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
167; AVX512DQ-FCP-NEXT:    vpextrw $0, %xmm1, (%rsi)
168; AVX512DQ-FCP-NEXT:    vpextrw $0, %xmm2, (%rdx)
169; AVX512DQ-FCP-NEXT:    vpextrw $0, %xmm3, (%rcx)
170; AVX512DQ-FCP-NEXT:    vpextrw $0, %xmm4, (%r8)
171; AVX512DQ-FCP-NEXT:    vpextrw $0, %xmm0, (%r9)
172; AVX512DQ-FCP-NEXT:    retq
173;
174; AVX512BW-LABEL: load_i8_stride5_vf2:
175; AVX512BW:       # %bb.0:
176; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
177; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
178; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[1,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
179; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[2,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
180; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[3,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
181; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
182; AVX512BW-NEXT:    vpextrw $0, %xmm1, (%rsi)
183; AVX512BW-NEXT:    vpextrw $0, %xmm2, (%rdx)
184; AVX512BW-NEXT:    vpextrw $0, %xmm3, (%rcx)
185; AVX512BW-NEXT:    vpextrw $0, %xmm4, (%r8)
186; AVX512BW-NEXT:    vpextrw $0, %xmm0, (%r9)
187; AVX512BW-NEXT:    retq
188;
189; AVX512BW-FCP-LABEL: load_i8_stride5_vf2:
190; AVX512BW-FCP:       # %bb.0:
191; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
192; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
193; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[1,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
194; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[2,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
195; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[3,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
196; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
197; AVX512BW-FCP-NEXT:    vpextrw $0, %xmm1, (%rsi)
198; AVX512BW-FCP-NEXT:    vpextrw $0, %xmm2, (%rdx)
199; AVX512BW-FCP-NEXT:    vpextrw $0, %xmm3, (%rcx)
200; AVX512BW-FCP-NEXT:    vpextrw $0, %xmm4, (%r8)
201; AVX512BW-FCP-NEXT:    vpextrw $0, %xmm0, (%r9)
202; AVX512BW-FCP-NEXT:    retq
203;
204; AVX512DQ-BW-LABEL: load_i8_stride5_vf2:
205; AVX512DQ-BW:       # %bb.0:
206; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %xmm0
207; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
208; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[1,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
209; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[2,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
210; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[3,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
211; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
212; AVX512DQ-BW-NEXT:    vpextrw $0, %xmm1, (%rsi)
213; AVX512DQ-BW-NEXT:    vpextrw $0, %xmm2, (%rdx)
214; AVX512DQ-BW-NEXT:    vpextrw $0, %xmm3, (%rcx)
215; AVX512DQ-BW-NEXT:    vpextrw $0, %xmm4, (%r8)
216; AVX512DQ-BW-NEXT:    vpextrw $0, %xmm0, (%r9)
217; AVX512DQ-BW-NEXT:    retq
218;
219; AVX512DQ-BW-FCP-LABEL: load_i8_stride5_vf2:
220; AVX512DQ-BW-FCP:       # %bb.0:
221; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
222; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
223; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[1,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
224; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[2,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
225; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[3,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
226; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
227; AVX512DQ-BW-FCP-NEXT:    vpextrw $0, %xmm1, (%rsi)
228; AVX512DQ-BW-FCP-NEXT:    vpextrw $0, %xmm2, (%rdx)
229; AVX512DQ-BW-FCP-NEXT:    vpextrw $0, %xmm3, (%rcx)
230; AVX512DQ-BW-FCP-NEXT:    vpextrw $0, %xmm4, (%r8)
231; AVX512DQ-BW-FCP-NEXT:    vpextrw $0, %xmm0, (%r9)
232; AVX512DQ-BW-FCP-NEXT:    retq
233  %wide.vec = load <10 x i8>, ptr %in.vec, align 64
234  %strided.vec0 = shufflevector <10 x i8> %wide.vec, <10 x i8> poison, <2 x i32> <i32 0, i32 5>
235  %strided.vec1 = shufflevector <10 x i8> %wide.vec, <10 x i8> poison, <2 x i32> <i32 1, i32 6>
236  %strided.vec2 = shufflevector <10 x i8> %wide.vec, <10 x i8> poison, <2 x i32> <i32 2, i32 7>
237  %strided.vec3 = shufflevector <10 x i8> %wide.vec, <10 x i8> poison, <2 x i32> <i32 3, i32 8>
238  %strided.vec4 = shufflevector <10 x i8> %wide.vec, <10 x i8> poison, <2 x i32> <i32 4, i32 9>
239  store <2 x i8> %strided.vec0, ptr %out.vec0, align 64
240  store <2 x i8> %strided.vec1, ptr %out.vec1, align 64
241  store <2 x i8> %strided.vec2, ptr %out.vec2, align 64
242  store <2 x i8> %strided.vec3, ptr %out.vec3, align 64
243  store <2 x i8> %strided.vec4, ptr %out.vec4, align 64
244  ret void
245}
246
247define void @load_i8_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind {
248; SSE-LABEL: load_i8_stride5_vf4:
249; SSE:       # %bb.0:
250; SSE-NEXT:    movdqa (%rdi), %xmm5
251; SSE-NEXT:    movdqa 16(%rdi), %xmm0
252; SSE-NEXT:    pxor %xmm4, %xmm4
253; SSE-NEXT:    movdqa %xmm5, %xmm2
254; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
255; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3]
256; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm1[2,1,2,3,4,5,6,7]
257; SSE-NEXT:    movdqa %xmm5, %xmm3
258; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
259; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
260; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
261; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
262; SSE-NEXT:    packuswb %xmm1, %xmm1
263; SSE-NEXT:    movdqa {{.*#+}} xmm6 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
264; SSE-NEXT:    movdqa %xmm5, %xmm7
265; SSE-NEXT:    pand %xmm6, %xmm7
266; SSE-NEXT:    pandn %xmm0, %xmm6
267; SSE-NEXT:    por %xmm7, %xmm6
268; SSE-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
269; SSE-NEXT:    movdqa %xmm2, %xmm7
270; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[1,0],xmm6[0,0]
271; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[2,0],xmm6[2,3]
272; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,1,1,3]
273; SSE-NEXT:    pshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,6,5,6,7]
274; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
275; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[1,2,3,0,4,5,6,7]
276; SSE-NEXT:    packuswb %xmm6, %xmm6
277; SSE-NEXT:    movdqa {{.*#+}} xmm7 = [0,65535,65535,65535,65535,65535,65535,65535]
278; SSE-NEXT:    movdqa %xmm5, %xmm8
279; SSE-NEXT:    pand %xmm7, %xmm8
280; SSE-NEXT:    pandn %xmm0, %xmm7
281; SSE-NEXT:    por %xmm8, %xmm7
282; SSE-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
283; SSE-NEXT:    movdqa %xmm2, %xmm8
284; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[2,0],xmm7[3,0]
285; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,1],xmm8[0,2]
286; SSE-NEXT:    pshuflw {{.*#+}} xmm7 = xmm7[2,1,2,3,4,5,6,7]
287; SSE-NEXT:    pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7]
288; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
289; SSE-NEXT:    pshuflw {{.*#+}} xmm7 = xmm7[0,3,2,1,4,5,6,7]
290; SSE-NEXT:    packuswb %xmm7, %xmm7
291; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
292; SSE-NEXT:    pand %xmm8, %xmm5
293; SSE-NEXT:    pandn %xmm0, %xmm8
294; SSE-NEXT:    por %xmm5, %xmm8
295; SSE-NEXT:    punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
296; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[1,1],xmm2[2,0]
297; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,6,5,6,7]
298; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
299; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,0,1,2,4,5,6,7]
300; SSE-NEXT:    packuswb %xmm4, %xmm4
301; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[3,0]
302; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0,2]
303; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
304; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
305; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7]
306; SSE-NEXT:    packuswb %xmm2, %xmm2
307; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
308; SSE-NEXT:    pand %xmm3, %xmm2
309; SSE-NEXT:    pandn %xmm0, %xmm3
310; SSE-NEXT:    por %xmm2, %xmm3
311; SSE-NEXT:    movd %xmm1, (%rsi)
312; SSE-NEXT:    movd %xmm6, (%rdx)
313; SSE-NEXT:    movd %xmm7, (%rcx)
314; SSE-NEXT:    movd %xmm4, (%r8)
315; SSE-NEXT:    movd %xmm3, (%r9)
316; SSE-NEXT:    retq
317;
318; AVX-LABEL: load_i8_stride5_vf4:
319; AVX:       # %bb.0:
320; AVX-NEXT:    vmovd {{.*#+}} xmm0 = [0,5,10,15,0,0,0,0,0,0,0,0,0,0,0,0]
321; AVX-NEXT:    vmovdqa (%rdi), %xmm1
322; AVX-NEXT:    vmovdqa 16(%rdi), %xmm2
323; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm3
324; AVX-NEXT:    vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
325; AVX-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
326; AVX-NEXT:    vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1]
327; AVX-NEXT:    vpshufb %xmm0, %xmm5, %xmm5
328; AVX-NEXT:    vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2]
329; AVX-NEXT:    vpshufb %xmm0, %xmm6, %xmm6
330; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
331; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
332; AVX-NEXT:    vmovd %xmm3, (%rsi)
333; AVX-NEXT:    vmovd %xmm4, (%rdx)
334; AVX-NEXT:    vmovd %xmm5, (%rcx)
335; AVX-NEXT:    vmovd %xmm6, (%r8)
336; AVX-NEXT:    vmovd %xmm0, (%r9)
337; AVX-NEXT:    retq
338;
339; AVX2-LABEL: load_i8_stride5_vf4:
340; AVX2:       # %bb.0:
341; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = [0,5,10,15,0,0,0,0,0,0,0,0,0,0,0,0]
342; AVX2-NEXT:    vmovdqa (%rdi), %xmm1
343; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm2
344; AVX2-NEXT:    vpshufb %xmm0, %xmm1, %xmm3
345; AVX2-NEXT:    vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
346; AVX2-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
347; AVX2-NEXT:    vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1]
348; AVX2-NEXT:    vpshufb %xmm0, %xmm5, %xmm5
349; AVX2-NEXT:    vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2]
350; AVX2-NEXT:    vpshufb %xmm0, %xmm6, %xmm6
351; AVX2-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
352; AVX2-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
353; AVX2-NEXT:    vmovd %xmm3, (%rsi)
354; AVX2-NEXT:    vmovd %xmm4, (%rdx)
355; AVX2-NEXT:    vmovd %xmm5, (%rcx)
356; AVX2-NEXT:    vmovd %xmm6, (%r8)
357; AVX2-NEXT:    vmovd %xmm0, (%r9)
358; AVX2-NEXT:    retq
359;
360; AVX2-FP-LABEL: load_i8_stride5_vf4:
361; AVX2-FP:       # %bb.0:
362; AVX2-FP-NEXT:    vmovd {{.*#+}} xmm0 = [0,5,10,15,0,0,0,0,0,0,0,0,0,0,0,0]
363; AVX2-FP-NEXT:    vmovdqa (%rdi), %xmm1
364; AVX2-FP-NEXT:    vmovdqa 16(%rdi), %xmm2
365; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm1, %xmm3
366; AVX2-FP-NEXT:    vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
367; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
368; AVX2-FP-NEXT:    vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1]
369; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm5, %xmm5
370; AVX2-FP-NEXT:    vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2]
371; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm6, %xmm6
372; AVX2-FP-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
373; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
374; AVX2-FP-NEXT:    vmovd %xmm3, (%rsi)
375; AVX2-FP-NEXT:    vmovd %xmm4, (%rdx)
376; AVX2-FP-NEXT:    vmovd %xmm5, (%rcx)
377; AVX2-FP-NEXT:    vmovd %xmm6, (%r8)
378; AVX2-FP-NEXT:    vmovd %xmm0, (%r9)
379; AVX2-FP-NEXT:    retq
380;
381; AVX2-FCP-LABEL: load_i8_stride5_vf4:
382; AVX2-FCP:       # %bb.0:
383; AVX2-FCP-NEXT:    vmovd {{.*#+}} xmm0 = [0,5,10,15,0,0,0,0,0,0,0,0,0,0,0,0]
384; AVX2-FCP-NEXT:    vmovdqa (%rdi), %xmm1
385; AVX2-FCP-NEXT:    vmovdqa 16(%rdi), %xmm2
386; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm3
387; AVX2-FCP-NEXT:    vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
388; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
389; AVX2-FCP-NEXT:    vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1]
390; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm5, %xmm5
391; AVX2-FCP-NEXT:    vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2]
392; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm6, %xmm6
393; AVX2-FCP-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
394; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
395; AVX2-FCP-NEXT:    vmovd %xmm3, (%rsi)
396; AVX2-FCP-NEXT:    vmovd %xmm4, (%rdx)
397; AVX2-FCP-NEXT:    vmovd %xmm5, (%rcx)
398; AVX2-FCP-NEXT:    vmovd %xmm6, (%r8)
399; AVX2-FCP-NEXT:    vmovd %xmm0, (%r9)
400; AVX2-FCP-NEXT:    retq
401;
402; AVX512-LABEL: load_i8_stride5_vf4:
403; AVX512:       # %bb.0:
404; AVX512-NEXT:    vmovd {{.*#+}} xmm0 = [0,5,10,15,0,0,0,0,0,0,0,0,0,0,0,0]
405; AVX512-NEXT:    vmovdqa (%rdi), %xmm1
406; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm2
407; AVX512-NEXT:    vpshufb %xmm0, %xmm1, %xmm3
408; AVX512-NEXT:    vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
409; AVX512-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
410; AVX512-NEXT:    vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1]
411; AVX512-NEXT:    vpshufb %xmm0, %xmm5, %xmm5
412; AVX512-NEXT:    vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2]
413; AVX512-NEXT:    vpshufb %xmm0, %xmm6, %xmm6
414; AVX512-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
415; AVX512-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
416; AVX512-NEXT:    vmovd %xmm3, (%rsi)
417; AVX512-NEXT:    vmovd %xmm4, (%rdx)
418; AVX512-NEXT:    vmovd %xmm5, (%rcx)
419; AVX512-NEXT:    vmovd %xmm6, (%r8)
420; AVX512-NEXT:    vmovd %xmm0, (%r9)
421; AVX512-NEXT:    retq
422;
423; AVX512-FCP-LABEL: load_i8_stride5_vf4:
424; AVX512-FCP:       # %bb.0:
425; AVX512-FCP-NEXT:    vmovd {{.*#+}} xmm0 = [0,5,10,15,0,0,0,0,0,0,0,0,0,0,0,0]
426; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm1
427; AVX512-FCP-NEXT:    vmovdqa 16(%rdi), %xmm2
428; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm3
429; AVX512-FCP-NEXT:    vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
430; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
431; AVX512-FCP-NEXT:    vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1]
432; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm5, %xmm5
433; AVX512-FCP-NEXT:    vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2]
434; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm6, %xmm6
435; AVX512-FCP-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
436; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
437; AVX512-FCP-NEXT:    vmovd %xmm3, (%rsi)
438; AVX512-FCP-NEXT:    vmovd %xmm4, (%rdx)
439; AVX512-FCP-NEXT:    vmovd %xmm5, (%rcx)
440; AVX512-FCP-NEXT:    vmovd %xmm6, (%r8)
441; AVX512-FCP-NEXT:    vmovd %xmm0, (%r9)
442; AVX512-FCP-NEXT:    retq
443;
444; AVX512DQ-LABEL: load_i8_stride5_vf4:
445; AVX512DQ:       # %bb.0:
446; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm0 = [0,5,10,15,0,0,0,0,0,0,0,0,0,0,0,0]
447; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm1
448; AVX512DQ-NEXT:    vmovdqa 16(%rdi), %xmm2
449; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm3
450; AVX512DQ-NEXT:    vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
451; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
452; AVX512DQ-NEXT:    vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1]
453; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm5, %xmm5
454; AVX512DQ-NEXT:    vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2]
455; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm6, %xmm6
456; AVX512DQ-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
457; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
458; AVX512DQ-NEXT:    vmovd %xmm3, (%rsi)
459; AVX512DQ-NEXT:    vmovd %xmm4, (%rdx)
460; AVX512DQ-NEXT:    vmovd %xmm5, (%rcx)
461; AVX512DQ-NEXT:    vmovd %xmm6, (%r8)
462; AVX512DQ-NEXT:    vmovd %xmm0, (%r9)
463; AVX512DQ-NEXT:    retq
464;
465; AVX512DQ-FCP-LABEL: load_i8_stride5_vf4:
466; AVX512DQ-FCP:       # %bb.0:
467; AVX512DQ-FCP-NEXT:    vmovd {{.*#+}} xmm0 = [0,5,10,15,0,0,0,0,0,0,0,0,0,0,0,0]
468; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm1
469; AVX512DQ-FCP-NEXT:    vmovdqa 16(%rdi), %xmm2
470; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm3
471; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
472; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
473; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1]
474; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm5, %xmm5
475; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2]
476; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm6, %xmm6
477; AVX512DQ-FCP-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
478; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
479; AVX512DQ-FCP-NEXT:    vmovd %xmm3, (%rsi)
480; AVX512DQ-FCP-NEXT:    vmovd %xmm4, (%rdx)
481; AVX512DQ-FCP-NEXT:    vmovd %xmm5, (%rcx)
482; AVX512DQ-FCP-NEXT:    vmovd %xmm6, (%r8)
483; AVX512DQ-FCP-NEXT:    vmovd %xmm0, (%r9)
484; AVX512DQ-FCP-NEXT:    retq
485;
486; AVX512BW-LABEL: load_i8_stride5_vf4:
487; AVX512BW:       # %bb.0:
488; AVX512BW-NEXT:    vmovd {{.*#+}} xmm0 = [0,5,10,15,0,0,0,0,0,0,0,0,0,0,0,0]
489; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm1
490; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm2
491; AVX512BW-NEXT:    vpshufb %xmm0, %xmm1, %xmm3
492; AVX512BW-NEXT:    vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
493; AVX512BW-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
494; AVX512BW-NEXT:    vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1]
495; AVX512BW-NEXT:    vpshufb %xmm0, %xmm5, %xmm5
496; AVX512BW-NEXT:    vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2]
497; AVX512BW-NEXT:    vpshufb %xmm0, %xmm6, %xmm6
498; AVX512BW-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
499; AVX512BW-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
500; AVX512BW-NEXT:    vmovd %xmm3, (%rsi)
501; AVX512BW-NEXT:    vmovd %xmm4, (%rdx)
502; AVX512BW-NEXT:    vmovd %xmm5, (%rcx)
503; AVX512BW-NEXT:    vmovd %xmm6, (%r8)
504; AVX512BW-NEXT:    vmovd %xmm0, (%r9)
505; AVX512BW-NEXT:    retq
506;
507; AVX512BW-FCP-LABEL: load_i8_stride5_vf4:
508; AVX512BW-FCP:       # %bb.0:
509; AVX512BW-FCP-NEXT:    vmovd {{.*#+}} xmm0 = [0,5,10,15,0,0,0,0,0,0,0,0,0,0,0,0]
510; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %xmm1
511; AVX512BW-FCP-NEXT:    vmovdqa 16(%rdi), %xmm2
512; AVX512BW-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm3
513; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
514; AVX512BW-FCP-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
515; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1]
516; AVX512BW-FCP-NEXT:    vpshufb %xmm0, %xmm5, %xmm5
517; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2]
518; AVX512BW-FCP-NEXT:    vpshufb %xmm0, %xmm6, %xmm6
519; AVX512BW-FCP-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
520; AVX512BW-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
521; AVX512BW-FCP-NEXT:    vmovd %xmm3, (%rsi)
522; AVX512BW-FCP-NEXT:    vmovd %xmm4, (%rdx)
523; AVX512BW-FCP-NEXT:    vmovd %xmm5, (%rcx)
524; AVX512BW-FCP-NEXT:    vmovd %xmm6, (%r8)
525; AVX512BW-FCP-NEXT:    vmovd %xmm0, (%r9)
526; AVX512BW-FCP-NEXT:    retq
527;
528; AVX512DQ-BW-LABEL: load_i8_stride5_vf4:
529; AVX512DQ-BW:       # %bb.0:
530; AVX512DQ-BW-NEXT:    vmovd {{.*#+}} xmm0 = [0,5,10,15,0,0,0,0,0,0,0,0,0,0,0,0]
531; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %xmm1
532; AVX512DQ-BW-NEXT:    vmovdqa 16(%rdi), %xmm2
533; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm1, %xmm3
534; AVX512DQ-BW-NEXT:    vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
535; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
536; AVX512DQ-BW-NEXT:    vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1]
537; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm5, %xmm5
538; AVX512DQ-BW-NEXT:    vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2]
539; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm6, %xmm6
540; AVX512DQ-BW-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
541; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
542; AVX512DQ-BW-NEXT:    vmovd %xmm3, (%rsi)
543; AVX512DQ-BW-NEXT:    vmovd %xmm4, (%rdx)
544; AVX512DQ-BW-NEXT:    vmovd %xmm5, (%rcx)
545; AVX512DQ-BW-NEXT:    vmovd %xmm6, (%r8)
546; AVX512DQ-BW-NEXT:    vmovd %xmm0, (%r9)
547; AVX512DQ-BW-NEXT:    retq
548;
549; AVX512DQ-BW-FCP-LABEL: load_i8_stride5_vf4:
550; AVX512DQ-BW-FCP:       # %bb.0:
551; AVX512DQ-BW-FCP-NEXT:    vmovd {{.*#+}} xmm0 = [0,5,10,15,0,0,0,0,0,0,0,0,0,0,0,0]
552; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %xmm1
553; AVX512DQ-BW-FCP-NEXT:    vmovdqa 16(%rdi), %xmm2
554; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm3
555; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
556; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
557; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1]
558; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm0, %xmm5, %xmm5
559; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2]
560; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm0, %xmm6, %xmm6
561; AVX512DQ-BW-FCP-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
562; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
563; AVX512DQ-BW-FCP-NEXT:    vmovd %xmm3, (%rsi)
564; AVX512DQ-BW-FCP-NEXT:    vmovd %xmm4, (%rdx)
565; AVX512DQ-BW-FCP-NEXT:    vmovd %xmm5, (%rcx)
566; AVX512DQ-BW-FCP-NEXT:    vmovd %xmm6, (%r8)
567; AVX512DQ-BW-FCP-NEXT:    vmovd %xmm0, (%r9)
568; AVX512DQ-BW-FCP-NEXT:    retq
569  %wide.vec = load <20 x i8>, ptr %in.vec, align 64
570  %strided.vec0 = shufflevector <20 x i8> %wide.vec, <20 x i8> poison, <4 x i32> <i32 0, i32 5, i32 10, i32 15>
571  %strided.vec1 = shufflevector <20 x i8> %wide.vec, <20 x i8> poison, <4 x i32> <i32 1, i32 6, i32 11, i32 16>
572  %strided.vec2 = shufflevector <20 x i8> %wide.vec, <20 x i8> poison, <4 x i32> <i32 2, i32 7, i32 12, i32 17>
573  %strided.vec3 = shufflevector <20 x i8> %wide.vec, <20 x i8> poison, <4 x i32> <i32 3, i32 8, i32 13, i32 18>
574  %strided.vec4 = shufflevector <20 x i8> %wide.vec, <20 x i8> poison, <4 x i32> <i32 4, i32 9, i32 14, i32 19>
575  store <4 x i8> %strided.vec0, ptr %out.vec0, align 64
576  store <4 x i8> %strided.vec1, ptr %out.vec1, align 64
577  store <4 x i8> %strided.vec2, ptr %out.vec2, align 64
578  store <4 x i8> %strided.vec3, ptr %out.vec3, align 64
579  store <4 x i8> %strided.vec4, ptr %out.vec4, align 64
580  ret void
581}
582
583define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind {
584; SSE-LABEL: load_i8_stride5_vf8:
585; SSE:       # %bb.0:
586; SSE-NEXT:    movdqa (%rdi), %xmm4
587; SSE-NEXT:    movdqa 16(%rdi), %xmm3
588; SSE-NEXT:    movdqa 32(%rdi), %xmm0
589; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
590; SSE-NEXT:    movdqa %xmm1, %xmm2
591; SSE-NEXT:    pandn %xmm3, %xmm2
592; SSE-NEXT:    movdqa %xmm4, %xmm5
593; SSE-NEXT:    pand %xmm1, %xmm5
594; SSE-NEXT:    por %xmm2, %xmm5
595; SSE-NEXT:    pxor %xmm6, %xmm6
596; SSE-NEXT:    movdqa %xmm5, %xmm2
597; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
598; SSE-NEXT:    movdqa {{.*#+}} xmm7 = [0,65535,65535,65535,0,0,65535,65535]
599; SSE-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
600; SSE-NEXT:    pand %xmm7, %xmm5
601; SSE-NEXT:    pandn %xmm2, %xmm7
602; SSE-NEXT:    por %xmm5, %xmm7
603; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm7[0,2,1,3,4,5,6,7]
604; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7]
605; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
606; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,2,1,3,4,5,6,7]
607; SSE-NEXT:    pshufhw {{.*#+}} xmm7 = xmm2[0,1,2,3,4,6,5,7]
608; SSE-NEXT:    packuswb %xmm7, %xmm7
609; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
610; SSE-NEXT:    pand %xmm2, %xmm7
611; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm0[0,0,1,1]
612; SSE-NEXT:    movdqa %xmm2, %xmm5
613; SSE-NEXT:    pandn %xmm8, %xmm5
614; SSE-NEXT:    por %xmm7, %xmm5
615; SSE-NEXT:    movdqa {{.*#+}} xmm7 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
616; SSE-NEXT:    movdqa %xmm3, %xmm8
617; SSE-NEXT:    pand %xmm7, %xmm8
618; SSE-NEXT:    pandn %xmm4, %xmm7
619; SSE-NEXT:    por %xmm8, %xmm7
620; SSE-NEXT:    movdqa %xmm7, %xmm8
621; SSE-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15]
622; SSE-NEXT:    movdqa {{.*#+}} xmm9 = [65535,65535,0,0,65535,65535,65535,0]
623; SSE-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
624; SSE-NEXT:    pand %xmm9, %xmm7
625; SSE-NEXT:    pandn %xmm8, %xmm9
626; SSE-NEXT:    por %xmm7, %xmm9
627; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm9[0,2,1,3]
628; SSE-NEXT:    pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,5]
629; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[0,3,2,1]
630; SSE-NEXT:    pshuflw {{.*#+}} xmm7 = xmm7[1,2,3,0,4,5,6,7]
631; SSE-NEXT:    pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,4,5,7]
632; SSE-NEXT:    packuswb %xmm7, %xmm7
633; SSE-NEXT:    pand %xmm2, %xmm7
634; SSE-NEXT:    movdqa %xmm0, %xmm8
635; SSE-NEXT:    pslld $24, %xmm8
636; SSE-NEXT:    pandn %xmm8, %xmm2
637; SSE-NEXT:    por %xmm7, %xmm2
638; SSE-NEXT:    movdqa {{.*#+}} xmm7 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255]
639; SSE-NEXT:    movdqa %xmm3, %xmm8
640; SSE-NEXT:    pand %xmm7, %xmm8
641; SSE-NEXT:    pandn %xmm4, %xmm7
642; SSE-NEXT:    por %xmm8, %xmm7
643; SSE-NEXT:    movdqa %xmm7, %xmm9
644; SSE-NEXT:    punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15]
645; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,0,65535,65535,65535]
646; SSE-NEXT:    movdqa %xmm8, %xmm10
647; SSE-NEXT:    pandn %xmm9, %xmm10
648; SSE-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
649; SSE-NEXT:    pand %xmm8, %xmm7
650; SSE-NEXT:    por %xmm10, %xmm7
651; SSE-NEXT:    pshuflw {{.*#+}} xmm7 = xmm7[2,1,2,3,4,5,6,7]
652; SSE-NEXT:    pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7]
653; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[0,2,1,3]
654; SSE-NEXT:    pshuflw {{.*#+}} xmm7 = xmm7[0,3,2,1,4,5,6,7]
655; SSE-NEXT:    pshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,6,5,6,7]
656; SSE-NEXT:    packuswb %xmm10, %xmm10
657; SSE-NEXT:    movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,65535,65535]
658; SSE-NEXT:    pand %xmm7, %xmm10
659; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
660; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm0[0,1,2,0]
661; SSE-NEXT:    pshufhw {{.*#+}} xmm11 = xmm9[0,1,2,3,4,5,6,5]
662; SSE-NEXT:    packuswb %xmm11, %xmm11
663; SSE-NEXT:    movdqa %xmm7, %xmm9
664; SSE-NEXT:    pandn %xmm11, %xmm9
665; SSE-NEXT:    por %xmm10, %xmm9
666; SSE-NEXT:    movdqa {{.*#+}} xmm10 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
667; SSE-NEXT:    movdqa %xmm3, %xmm11
668; SSE-NEXT:    pand %xmm10, %xmm11
669; SSE-NEXT:    pandn %xmm4, %xmm10
670; SSE-NEXT:    por %xmm11, %xmm10
671; SSE-NEXT:    movdqa %xmm10, %xmm11
672; SSE-NEXT:    punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm6[8],xmm11[9],xmm6[9],xmm11[10],xmm6[10],xmm11[11],xmm6[11],xmm11[12],xmm6[12],xmm11[13],xmm6[13],xmm11[14],xmm6[14],xmm11[15],xmm6[15]
673; SSE-NEXT:    punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3],xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7]
674; SSE-NEXT:    shufps {{.*#+}} xmm10 = xmm10[3,1],xmm11[2,0]
675; SSE-NEXT:    pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,6,5]
676; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[3,1,2,0]
677; SSE-NEXT:    pshuflw {{.*#+}} xmm10 = xmm10[3,0,1,2,4,5,6,7]
678; SSE-NEXT:    pshufhw {{.*#+}} xmm11 = xmm10[0,1,2,3,7,4,6,7]
679; SSE-NEXT:    packuswb %xmm11, %xmm11
680; SSE-NEXT:    pand %xmm7, %xmm11
681; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm0[0,1,0,3]
682; SSE-NEXT:    pshufhw {{.*#+}} xmm12 = xmm10[0,1,2,3,4,5,5,6]
683; SSE-NEXT:    packuswb %xmm12, %xmm12
684; SSE-NEXT:    movdqa %xmm7, %xmm10
685; SSE-NEXT:    pandn %xmm12, %xmm10
686; SSE-NEXT:    por %xmm11, %xmm10
687; SSE-NEXT:    pand %xmm1, %xmm3
688; SSE-NEXT:    pandn %xmm4, %xmm1
689; SSE-NEXT:    por %xmm3, %xmm1
690; SSE-NEXT:    movdqa %xmm1, %xmm3
691; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
692; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15]
693; SSE-NEXT:    pand %xmm8, %xmm1
694; SSE-NEXT:    pandn %xmm3, %xmm8
695; SSE-NEXT:    por %xmm1, %xmm8
696; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm8[3,1,0,3,4,5,6,7]
697; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4]
698; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
699; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7]
700; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
701; SSE-NEXT:    packuswb %xmm1, %xmm1
702; SSE-NEXT:    pand %xmm7, %xmm1
703; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
704; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
705; SSE-NEXT:    packuswb %xmm0, %xmm0
706; SSE-NEXT:    pandn %xmm0, %xmm7
707; SSE-NEXT:    por %xmm1, %xmm7
708; SSE-NEXT:    movq %xmm5, (%rsi)
709; SSE-NEXT:    movq %xmm2, (%rdx)
710; SSE-NEXT:    movq %xmm9, (%rcx)
711; SSE-NEXT:    movq %xmm10, (%r8)
712; SSE-NEXT:    movq %xmm7, (%r9)
713; SSE-NEXT:    retq
714;
715; AVX-LABEL: load_i8_stride5_vf8:
716; AVX:       # %bb.0:
717; AVX-NEXT:    vmovdqa (%rdi), %xmm0
718; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
719; AVX-NEXT:    vmovdqa 32(%rdi), %xmm2
720; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u]
721; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u]
722; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
723; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
724; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,2,4,6,8,10,12,7,u,u,u,u,u,u,u,u]
725; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[1,6,11],zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u]
726; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[0,5,10,15,u,u,u,u,u,u,u,u,u]
727; AVX-NEXT:    vpor %xmm4, %xmm5, %xmm4
728; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
729; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[0,2,4,6,8,10,12,9,u,u,u,u,u,u,u,u]
730; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11,u,u,u,u,u,u,u,u,u,u]
731; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm0[2,7,12],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u]
732; AVX-NEXT:    vpor %xmm5, %xmm6, %xmm5
733; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
734; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[0,2,4,6,8,10,1,11,u,u,u,u,u,u,u,u]
735; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12,u,u,u,u,u,u,u,u,u,u]
736; AVX-NEXT:    vpshufb {{.*#+}} xmm7 = xmm0[3,8,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u]
737; AVX-NEXT:    vpor %xmm6, %xmm7, %xmm6
738; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
739; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[0,2,4,6,8,10,3,13,u,u,u,u,u,u,u,u]
740; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13,u,u,u,u,u,u,u,u,u,u]
741; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u]
742; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
743; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
744; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,5,15,u,u,u,u,u,u,u,u]
745; AVX-NEXT:    vmovq %xmm3, (%rsi)
746; AVX-NEXT:    vmovq %xmm4, (%rdx)
747; AVX-NEXT:    vmovq %xmm5, (%rcx)
748; AVX-NEXT:    vmovq %xmm6, (%r8)
749; AVX-NEXT:    vmovq %xmm0, (%r9)
750; AVX-NEXT:    retq
751;
752; AVX2-LABEL: load_i8_stride5_vf8:
753; AVX2:       # %bb.0:
754; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
755; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
756; AVX2-NEXT:    vmovdqa 32(%rdi), %xmm2
757; AVX2-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[4,9,14],zero,xmm1[u,u,u,u,u,u,u,u]
758; AVX2-NEXT:    vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
759; AVX2-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u]
760; AVX2-NEXT:    vpor %xmm3, %xmm4, %xmm3
761; AVX2-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u]
762; AVX2-NEXT:    vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
763; AVX2-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u]
764; AVX2-NEXT:    vpor %xmm4, %xmm5, %xmm4
765; AVX2-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u]
766; AVX2-NEXT:    vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7]
767; AVX2-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u]
768; AVX2-NEXT:    vpor %xmm5, %xmm6, %xmm5
769; AVX2-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u]
770; AVX2-NEXT:    vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
771; AVX2-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u]
772; AVX2-NEXT:    vpor %xmm6, %xmm7, %xmm6
773; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u]
774; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7]
775; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u]
776; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
777; AVX2-NEXT:    vmovq %xmm3, (%rsi)
778; AVX2-NEXT:    vmovq %xmm4, (%rdx)
779; AVX2-NEXT:    vmovq %xmm5, (%rcx)
780; AVX2-NEXT:    vmovq %xmm6, (%r8)
781; AVX2-NEXT:    vmovq %xmm0, (%r9)
782; AVX2-NEXT:    retq
783;
784; AVX2-FP-LABEL: load_i8_stride5_vf8:
785; AVX2-FP:       # %bb.0:
786; AVX2-FP-NEXT:    vmovdqa (%rdi), %xmm0
787; AVX2-FP-NEXT:    vmovdqa 16(%rdi), %xmm1
788; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %xmm2
789; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[4,9,14],zero,xmm1[u,u,u,u,u,u,u,u]
790; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
791; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u]
792; AVX2-FP-NEXT:    vpor %xmm3, %xmm4, %xmm3
793; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u]
794; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
795; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u]
796; AVX2-FP-NEXT:    vpor %xmm4, %xmm5, %xmm4
797; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u]
798; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7]
799; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u]
800; AVX2-FP-NEXT:    vpor %xmm5, %xmm6, %xmm5
801; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u]
802; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
803; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u]
804; AVX2-FP-NEXT:    vpor %xmm6, %xmm7, %xmm6
805; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u]
806; AVX2-FP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7]
807; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u]
808; AVX2-FP-NEXT:    vpor %xmm1, %xmm0, %xmm0
809; AVX2-FP-NEXT:    vmovq %xmm3, (%rsi)
810; AVX2-FP-NEXT:    vmovq %xmm4, (%rdx)
811; AVX2-FP-NEXT:    vmovq %xmm5, (%rcx)
812; AVX2-FP-NEXT:    vmovq %xmm6, (%r8)
813; AVX2-FP-NEXT:    vmovq %xmm0, (%r9)
814; AVX2-FP-NEXT:    retq
815;
816; AVX2-FCP-LABEL: load_i8_stride5_vf8:
817; AVX2-FCP:       # %bb.0:
818; AVX2-FCP-NEXT:    vmovdqa (%rdi), %xmm0
819; AVX2-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
820; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %xmm2
821; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[4,9,14],zero,xmm1[u,u,u,u,u,u,u,u]
822; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
823; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u]
824; AVX2-FCP-NEXT:    vpor %xmm3, %xmm4, %xmm3
825; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u]
826; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
827; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u]
828; AVX2-FCP-NEXT:    vpor %xmm4, %xmm5, %xmm4
829; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u]
830; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7]
831; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u]
832; AVX2-FCP-NEXT:    vpor %xmm5, %xmm6, %xmm5
833; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u]
834; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
835; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u]
836; AVX2-FCP-NEXT:    vpor %xmm6, %xmm7, %xmm6
837; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u]
838; AVX2-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7]
839; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u]
840; AVX2-FCP-NEXT:    vpor %xmm1, %xmm0, %xmm0
841; AVX2-FCP-NEXT:    vmovq %xmm3, (%rsi)
842; AVX2-FCP-NEXT:    vmovq %xmm4, (%rdx)
843; AVX2-FCP-NEXT:    vmovq %xmm5, (%rcx)
844; AVX2-FCP-NEXT:    vmovq %xmm6, (%r8)
845; AVX2-FCP-NEXT:    vmovq %xmm0, (%r9)
846; AVX2-FCP-NEXT:    retq
847;
848; AVX512-LABEL: load_i8_stride5_vf8:
849; AVX512:       # %bb.0:
850; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
851; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
852; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm2
853; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[4,9,14],zero,xmm1[u,u,u,u,u,u,u,u]
854; AVX512-NEXT:    vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
855; AVX512-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u]
856; AVX512-NEXT:    vpor %xmm3, %xmm4, %xmm3
857; AVX512-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u]
858; AVX512-NEXT:    vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
859; AVX512-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u]
860; AVX512-NEXT:    vpor %xmm4, %xmm5, %xmm4
861; AVX512-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u]
862; AVX512-NEXT:    vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7]
863; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u]
864; AVX512-NEXT:    vpor %xmm5, %xmm6, %xmm5
865; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u]
866; AVX512-NEXT:    vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
867; AVX512-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u]
868; AVX512-NEXT:    vpor %xmm6, %xmm7, %xmm6
869; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u]
870; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7]
871; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u]
872; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
873; AVX512-NEXT:    vmovq %xmm3, (%rsi)
874; AVX512-NEXT:    vmovq %xmm4, (%rdx)
875; AVX512-NEXT:    vmovq %xmm5, (%rcx)
876; AVX512-NEXT:    vmovq %xmm6, (%r8)
877; AVX512-NEXT:    vmovq %xmm0, (%r9)
878; AVX512-NEXT:    retq
879;
880; AVX512-FCP-LABEL: load_i8_stride5_vf8:
881; AVX512-FCP:       # %bb.0:
882; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm0
883; AVX512-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
884; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm2
885; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[4,9,14],zero,xmm1[u,u,u,u,u,u,u,u]
886; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
887; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u]
888; AVX512-FCP-NEXT:    vpor %xmm3, %xmm4, %xmm3
889; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u]
890; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
891; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u]
892; AVX512-FCP-NEXT:    vpor %xmm4, %xmm5, %xmm4
893; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u]
894; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7]
895; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u]
896; AVX512-FCP-NEXT:    vpor %xmm5, %xmm6, %xmm5
897; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u]
898; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
899; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u]
900; AVX512-FCP-NEXT:    vpor %xmm6, %xmm7, %xmm6
901; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u]
902; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7]
903; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u]
904; AVX512-FCP-NEXT:    vpor %xmm1, %xmm0, %xmm0
905; AVX512-FCP-NEXT:    vmovq %xmm3, (%rsi)
906; AVX512-FCP-NEXT:    vmovq %xmm4, (%rdx)
907; AVX512-FCP-NEXT:    vmovq %xmm5, (%rcx)
908; AVX512-FCP-NEXT:    vmovq %xmm6, (%r8)
909; AVX512-FCP-NEXT:    vmovq %xmm0, (%r9)
910; AVX512-FCP-NEXT:    retq
911;
912; AVX512DQ-LABEL: load_i8_stride5_vf8:
913; AVX512DQ:       # %bb.0:
914; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
915; AVX512DQ-NEXT:    vmovdqa 16(%rdi), %xmm1
916; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm2
917; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[4,9,14],zero,xmm1[u,u,u,u,u,u,u,u]
918; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
919; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u]
920; AVX512DQ-NEXT:    vpor %xmm3, %xmm4, %xmm3
921; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u]
922; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
923; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u]
924; AVX512DQ-NEXT:    vpor %xmm4, %xmm5, %xmm4
925; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u]
926; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7]
927; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u]
928; AVX512DQ-NEXT:    vpor %xmm5, %xmm6, %xmm5
929; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u]
930; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
931; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u]
932; AVX512DQ-NEXT:    vpor %xmm6, %xmm7, %xmm6
933; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u]
934; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7]
935; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u]
936; AVX512DQ-NEXT:    vpor %xmm1, %xmm0, %xmm0
937; AVX512DQ-NEXT:    vmovq %xmm3, (%rsi)
938; AVX512DQ-NEXT:    vmovq %xmm4, (%rdx)
939; AVX512DQ-NEXT:    vmovq %xmm5, (%rcx)
940; AVX512DQ-NEXT:    vmovq %xmm6, (%r8)
941; AVX512DQ-NEXT:    vmovq %xmm0, (%r9)
942; AVX512DQ-NEXT:    retq
943;
944; AVX512DQ-FCP-LABEL: load_i8_stride5_vf8:
945; AVX512DQ-FCP:       # %bb.0:
946; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm0
947; AVX512DQ-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
948; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm2
949; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[4,9,14],zero,xmm1[u,u,u,u,u,u,u,u]
950; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
951; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u]
952; AVX512DQ-FCP-NEXT:    vpor %xmm3, %xmm4, %xmm3
953; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u]
954; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
955; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u]
956; AVX512DQ-FCP-NEXT:    vpor %xmm4, %xmm5, %xmm4
957; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u]
958; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7]
959; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u]
960; AVX512DQ-FCP-NEXT:    vpor %xmm5, %xmm6, %xmm5
961; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u]
962; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
963; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u]
964; AVX512DQ-FCP-NEXT:    vpor %xmm6, %xmm7, %xmm6
965; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u]
966; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7]
967; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u]
968; AVX512DQ-FCP-NEXT:    vpor %xmm1, %xmm0, %xmm0
969; AVX512DQ-FCP-NEXT:    vmovq %xmm3, (%rsi)
970; AVX512DQ-FCP-NEXT:    vmovq %xmm4, (%rdx)
971; AVX512DQ-FCP-NEXT:    vmovq %xmm5, (%rcx)
972; AVX512DQ-FCP-NEXT:    vmovq %xmm6, (%r8)
973; AVX512DQ-FCP-NEXT:    vmovq %xmm0, (%r9)
974; AVX512DQ-FCP-NEXT:    retq
975;
976; AVX512BW-LABEL: load_i8_stride5_vf8:
977; AVX512BW:       # %bb.0:
978; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
979; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
980; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm2
981; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[4,9,14],zero,xmm1[u,u,u,u,u,u,u,u]
982; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
983; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u]
984; AVX512BW-NEXT:    vpor %xmm3, %xmm4, %xmm3
985; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u]
986; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
987; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u]
988; AVX512BW-NEXT:    vpor %xmm4, %xmm5, %xmm4
989; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u]
990; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7]
991; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u]
992; AVX512BW-NEXT:    vpor %xmm5, %xmm6, %xmm5
993; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u]
994; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
995; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u]
996; AVX512BW-NEXT:    vpor %xmm6, %xmm7, %xmm6
997; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u]
998; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7]
999; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u]
1000; AVX512BW-NEXT:    vpor %xmm1, %xmm0, %xmm0
1001; AVX512BW-NEXT:    vmovq %xmm3, (%rsi)
1002; AVX512BW-NEXT:    vmovq %xmm4, (%rdx)
1003; AVX512BW-NEXT:    vmovq %xmm5, (%rcx)
1004; AVX512BW-NEXT:    vmovq %xmm6, (%r8)
1005; AVX512BW-NEXT:    vmovq %xmm0, (%r9)
1006; AVX512BW-NEXT:    retq
1007;
1008; AVX512BW-FCP-LABEL: load_i8_stride5_vf8:
1009; AVX512BW-FCP:       # %bb.0:
1010; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
1011; AVX512BW-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
1012; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdi), %xmm2
1013; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[4,9,14],zero,xmm1[u,u,u,u,u,u,u,u]
1014; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
1015; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u]
1016; AVX512BW-FCP-NEXT:    vpor %xmm3, %xmm4, %xmm3
1017; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u]
1018; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
1019; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u]
1020; AVX512BW-FCP-NEXT:    vpor %xmm4, %xmm5, %xmm4
1021; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u]
1022; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7]
1023; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u]
1024; AVX512BW-FCP-NEXT:    vpor %xmm5, %xmm6, %xmm5
1025; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u]
1026; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
1027; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u]
1028; AVX512BW-FCP-NEXT:    vpor %xmm6, %xmm7, %xmm6
1029; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u]
1030; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7]
1031; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u]
1032; AVX512BW-FCP-NEXT:    vpor %xmm1, %xmm0, %xmm0
1033; AVX512BW-FCP-NEXT:    vmovq %xmm3, (%rsi)
1034; AVX512BW-FCP-NEXT:    vmovq %xmm4, (%rdx)
1035; AVX512BW-FCP-NEXT:    vmovq %xmm5, (%rcx)
1036; AVX512BW-FCP-NEXT:    vmovq %xmm6, (%r8)
1037; AVX512BW-FCP-NEXT:    vmovq %xmm0, (%r9)
1038; AVX512BW-FCP-NEXT:    retq
1039;
1040; AVX512DQ-BW-LABEL: load_i8_stride5_vf8:
1041; AVX512DQ-BW:       # %bb.0:
1042; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %xmm0
1043; AVX512DQ-BW-NEXT:    vmovdqa 16(%rdi), %xmm1
1044; AVX512DQ-BW-NEXT:    vmovdqa 32(%rdi), %xmm2
1045; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[4,9,14],zero,xmm1[u,u,u,u,u,u,u,u]
1046; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
1047; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u]
1048; AVX512DQ-BW-NEXT:    vpor %xmm3, %xmm4, %xmm3
1049; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u]
1050; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
1051; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u]
1052; AVX512DQ-BW-NEXT:    vpor %xmm4, %xmm5, %xmm4
1053; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u]
1054; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7]
1055; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u]
1056; AVX512DQ-BW-NEXT:    vpor %xmm5, %xmm6, %xmm5
1057; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u]
1058; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
1059; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u]
1060; AVX512DQ-BW-NEXT:    vpor %xmm6, %xmm7, %xmm6
1061; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u]
1062; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7]
1063; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u]
1064; AVX512DQ-BW-NEXT:    vpor %xmm1, %xmm0, %xmm0
1065; AVX512DQ-BW-NEXT:    vmovq %xmm3, (%rsi)
1066; AVX512DQ-BW-NEXT:    vmovq %xmm4, (%rdx)
1067; AVX512DQ-BW-NEXT:    vmovq %xmm5, (%rcx)
1068; AVX512DQ-BW-NEXT:    vmovq %xmm6, (%r8)
1069; AVX512DQ-BW-NEXT:    vmovq %xmm0, (%r9)
1070; AVX512DQ-BW-NEXT:    retq
1071;
1072; AVX512DQ-BW-FCP-LABEL: load_i8_stride5_vf8:
1073; AVX512DQ-BW-FCP:       # %bb.0:
1074; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
1075; AVX512DQ-BW-FCP-NEXT:    vmovdqa 16(%rdi), %xmm1
1076; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdi), %xmm2
1077; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[4,9,14],zero,xmm1[u,u,u,u,u,u,u,u]
1078; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
1079; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u]
1080; AVX512DQ-BW-FCP-NEXT:    vpor %xmm3, %xmm4, %xmm3
1081; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u]
1082; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
1083; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u]
1084; AVX512DQ-BW-FCP-NEXT:    vpor %xmm4, %xmm5, %xmm4
1085; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u]
1086; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7]
1087; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u]
1088; AVX512DQ-BW-FCP-NEXT:    vpor %xmm5, %xmm6, %xmm5
1089; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u]
1090; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
1091; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u]
1092; AVX512DQ-BW-FCP-NEXT:    vpor %xmm6, %xmm7, %xmm6
1093; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u]
1094; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7]
1095; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u]
1096; AVX512DQ-BW-FCP-NEXT:    vpor %xmm1, %xmm0, %xmm0
1097; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm3, (%rsi)
1098; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm4, (%rdx)
1099; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm5, (%rcx)
1100; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm6, (%r8)
1101; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm0, (%r9)
1102; AVX512DQ-BW-FCP-NEXT:    retq
1103  %wide.vec = load <40 x i8>, ptr %in.vec, align 64
1104  %strided.vec0 = shufflevector <40 x i8> %wide.vec, <40 x i8> poison, <8 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35>
1105  %strided.vec1 = shufflevector <40 x i8> %wide.vec, <40 x i8> poison, <8 x i32> <i32 1, i32 6, i32 11, i32 16, i32 21, i32 26, i32 31, i32 36>
1106  %strided.vec2 = shufflevector <40 x i8> %wide.vec, <40 x i8> poison, <8 x i32> <i32 2, i32 7, i32 12, i32 17, i32 22, i32 27, i32 32, i32 37>
1107  %strided.vec3 = shufflevector <40 x i8> %wide.vec, <40 x i8> poison, <8 x i32> <i32 3, i32 8, i32 13, i32 18, i32 23, i32 28, i32 33, i32 38>
1108  %strided.vec4 = shufflevector <40 x i8> %wide.vec, <40 x i8> poison, <8 x i32> <i32 4, i32 9, i32 14, i32 19, i32 24, i32 29, i32 34, i32 39>
1109  store <8 x i8> %strided.vec0, ptr %out.vec0, align 64
1110  store <8 x i8> %strided.vec1, ptr %out.vec1, align 64
1111  store <8 x i8> %strided.vec2, ptr %out.vec2, align 64
1112  store <8 x i8> %strided.vec3, ptr %out.vec3, align 64
1113  store <8 x i8> %strided.vec4, ptr %out.vec4, align 64
1114  ret void
1115}
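;
; The vf8 body above deinterleaves a single <40 x i8> load into five <8 x i8>
; results by selecting every fifth byte starting at offsets 0 through 4.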
1116
1117define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind {
1118; SSE-LABEL: load_i8_stride5_vf16:
1119; SSE:       # %bb.0:
1120; SSE-NEXT:    movdqa 64(%rdi), %xmm9
1121; SSE-NEXT:    movdqa (%rdi), %xmm1
1122; SSE-NEXT:    movdqa 16(%rdi), %xmm15
1123; SSE-NEXT:    movdqa 32(%rdi), %xmm10
1124; SSE-NEXT:    movdqa 48(%rdi), %xmm2
1125; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
1126; SSE-NEXT:    movdqa %xmm3, %xmm0
1127; SSE-NEXT:    pandn %xmm10, %xmm0
1128; SSE-NEXT:    movdqa %xmm2, %xmm4
1129; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1130; SSE-NEXT:    pand %xmm3, %xmm4
1131; SSE-NEXT:    por %xmm0, %xmm4
1132; SSE-NEXT:    pxor %xmm8, %xmm8
1133; SSE-NEXT:    movdqa %xmm4, %xmm0
1134; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
1135; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[0,1,1,3]
1136; SSE-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7]
1137; SSE-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15]
1138; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0]
1139; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7]
1140; SSE-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3]
1141; SSE-NEXT:    packuswb %xmm4, %xmm0
1142; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,3]
1143; SSE-NEXT:    movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255]
1144; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
1145; SSE-NEXT:    movdqa %xmm4, %xmm5
1146; SSE-NEXT:    pandn %xmm15, %xmm5
1147; SSE-NEXT:    movdqa %xmm1, %xmm6
1148; SSE-NEXT:    movdqa %xmm1, %xmm13
1149; SSE-NEXT:    pand %xmm4, %xmm6
1150; SSE-NEXT:    por %xmm5, %xmm6
1151; SSE-NEXT:    movdqa %xmm6, %xmm5
1152; SSE-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
1153; SSE-NEXT:    movdqa {{.*#+}} xmm7 = [0,65535,65535,65535,0,0,65535,65535]
1154; SSE-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15]
1155; SSE-NEXT:    pand %xmm7, %xmm6
1156; SSE-NEXT:    pandn %xmm5, %xmm7
1157; SSE-NEXT:    por %xmm6, %xmm7
1158; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm7[0,2,1,3,4,5,6,7]
1159; SSE-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,7]
1160; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,3,2,1]
1161; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[0,2,1,3,4,5,6,7]
1162; SSE-NEXT:    pshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,6,5,7]
1163; SSE-NEXT:    packuswb %xmm7, %xmm7
1164; SSE-NEXT:    pand %xmm11, %xmm7
1165; SSE-NEXT:    movdqa %xmm11, %xmm5
1166; SSE-NEXT:    pandn %xmm0, %xmm5
1167; SSE-NEXT:    por %xmm5, %xmm7
1168; SSE-NEXT:    movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0]
1169; SSE-NEXT:    pand %xmm6, %xmm7
1170; SSE-NEXT:    movdqa %xmm9, %xmm1
1171; SSE-NEXT:    punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
1172; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15]
1173; SSE-NEXT:    movdqa %xmm1, %xmm0
1174; SSE-NEXT:    movdqa %xmm1, %xmm5
1175; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1176; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm9[0,0]
1177; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[2,3]
1178; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
1179; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
1180; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
1181; SSE-NEXT:    packuswb %xmm0, %xmm0
1182; SSE-NEXT:    movdqa %xmm6, %xmm1
1183; SSE-NEXT:    pandn %xmm0, %xmm1
1184; SSE-NEXT:    por %xmm7, %xmm1
1185; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1186; SSE-NEXT:    movdqa %xmm4, %xmm7
1187; SSE-NEXT:    pandn %xmm10, %xmm7
1188; SSE-NEXT:    movdqa %xmm2, %xmm0
1189; SSE-NEXT:    pand %xmm4, %xmm0
1190; SSE-NEXT:    por %xmm7, %xmm0
1191; SSE-NEXT:    movdqa %xmm0, %xmm12
1192; SSE-NEXT:    punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15]
1193; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
1194; SSE-NEXT:    movdqa %xmm0, %xmm7
1195; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[1,0],xmm12[0,0]
1196; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[2,0],xmm12[2,3]
1197; SSE-NEXT:    movdqa {{.*#+}} xmm14 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
1198; SSE-NEXT:    movdqa %xmm14, %xmm12
1199; SSE-NEXT:    movdqa %xmm13, %xmm1
1200; SSE-NEXT:    pandn %xmm13, %xmm12
1201; SSE-NEXT:    movdqa %xmm15, %xmm13
1202; SSE-NEXT:    movdqa %xmm15, %xmm2
1203; SSE-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1204; SSE-NEXT:    pand %xmm14, %xmm13
1205; SSE-NEXT:    por %xmm12, %xmm13
1206; SSE-NEXT:    movdqa %xmm13, %xmm12
1207; SSE-NEXT:    punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15]
1208; SSE-NEXT:    punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3],xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7]
1209; SSE-NEXT:    movdqa {{.*#+}} xmm15 = [65535,65535,0,0,65535,65535,65535,0]
1210; SSE-NEXT:    pand %xmm15, %xmm13
1211; SSE-NEXT:    pandn %xmm12, %xmm15
1212; SSE-NEXT:    por %xmm13, %xmm15
1213; SSE-NEXT:    pshufd {{.*#+}} xmm12 = xmm15[0,2,1,3]
1214; SSE-NEXT:    pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,7,6,5]
1215; SSE-NEXT:    pshufd {{.*#+}} xmm12 = xmm12[0,3,2,1]
1216; SSE-NEXT:    pshuflw {{.*#+}} xmm12 = xmm12[1,2,3,0,4,5,6,7]
1217; SSE-NEXT:    pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,4,5,7]
1218; SSE-NEXT:    packuswb %xmm12, %xmm12
1219; SSE-NEXT:    pand %xmm11, %xmm12
1220; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,2,1,3]
1221; SSE-NEXT:    pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,6,7]
1222; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[0,2,2,1]
1223; SSE-NEXT:    pshuflw {{.*#+}} xmm7 = xmm7[1,2,3,0,4,5,6,7]
1224; SSE-NEXT:    pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,7,7,7]
1225; SSE-NEXT:    psllq $48, %xmm0
1226; SSE-NEXT:    packuswb %xmm7, %xmm0
1227; SSE-NEXT:    movdqa %xmm5, %xmm7
1228; SSE-NEXT:    pandn %xmm0, %xmm11
1229; SSE-NEXT:    por %xmm11, %xmm12
1230; SSE-NEXT:    pand %xmm6, %xmm12
1231; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[2,0],xmm9[3,0]
1232; SSE-NEXT:    movaps %xmm9, %xmm0
1233; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm7[0,2]
1234; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
1235; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
1236; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
1237; SSE-NEXT:    packuswb %xmm0, %xmm0
1238; SSE-NEXT:    movdqa %xmm6, %xmm5
1239; SSE-NEXT:    pandn %xmm0, %xmm5
1240; SSE-NEXT:    por %xmm12, %xmm5
1241; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1242; SSE-NEXT:    movdqa {{.*#+}} xmm12 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255]
1243; SSE-NEXT:    movdqa %xmm12, %xmm0
1244; SSE-NEXT:    pandn %xmm1, %xmm0
1245; SSE-NEXT:    movdqa %xmm1, %xmm5
1246; SSE-NEXT:    pand %xmm12, %xmm2
1247; SSE-NEXT:    por %xmm0, %xmm2
1248; SSE-NEXT:    movdqa %xmm2, %xmm0
1249; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
1250; SSE-NEXT:    movdqa {{.*#+}} xmm13 = [65535,65535,65535,0,0,65535,65535,65535]
1251; SSE-NEXT:    movdqa %xmm13, %xmm15
1252; SSE-NEXT:    pandn %xmm0, %xmm15
1253; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
1254; SSE-NEXT:    pand %xmm13, %xmm2
1255; SSE-NEXT:    por %xmm15, %xmm2
1256; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm2[2,1,2,3,4,5,6,7]
1257; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
1258; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1259; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
1260; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
1261; SSE-NEXT:    packuswb %xmm0, %xmm0
1262; SSE-NEXT:    movdqa {{.*#+}} xmm15 = [0,0,0,65535,65535,65535,65535,65535]
1263; SSE-NEXT:    pandn %xmm0, %xmm15
1264; SSE-NEXT:    movdqa %xmm4, %xmm0
1265; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
1266; SSE-NEXT:    pandn %xmm11, %xmm0
1267; SSE-NEXT:    movdqa %xmm11, %xmm7
1268; SSE-NEXT:    pand %xmm14, %xmm7
1269; SSE-NEXT:    pandn %xmm10, %xmm14
1270; SSE-NEXT:    pand %xmm12, %xmm11
1271; SSE-NEXT:    pandn %xmm10, %xmm12
1272; SSE-NEXT:    pand %xmm4, %xmm10
1273; SSE-NEXT:    por %xmm0, %xmm10
1274; SSE-NEXT:    movdqa %xmm10, %xmm0
1275; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
1276; SSE-NEXT:    punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
1277; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm10[0,1,2,0]
1278; SSE-NEXT:    shufps {{.*#+}} xmm10 = xmm10[2,0],xmm0[3,0]
1279; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm10[0,2]
1280; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
1281; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
1282; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1283; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
1284; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
1285; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5]
1286; SSE-NEXT:    packuswb %xmm0, %xmm1
1287; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
1288; SSE-NEXT:    por %xmm15, %xmm1
1289; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1]
1290; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1291; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm2[0,2,2,3]
1292; SSE-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
1293; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm10[0,3,2,3,4,5,6,7]
1294; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
1295; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
1296; SSE-NEXT:    packuswb %xmm0, %xmm0
1297; SSE-NEXT:    movdqa %xmm6, %xmm10
1298; SSE-NEXT:    pandn %xmm0, %xmm10
1299; SSE-NEXT:    pand %xmm6, %xmm1
1300; SSE-NEXT:    por %xmm1, %xmm10
1301; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
1302; SSE-NEXT:    movdqa %xmm15, %xmm0
1303; SSE-NEXT:    pand %xmm3, %xmm0
1304; SSE-NEXT:    pandn %xmm5, %xmm3
1305; SSE-NEXT:    por %xmm0, %xmm3
1306; SSE-NEXT:    movdqa %xmm3, %xmm0
1307; SSE-NEXT:    pxor %xmm1, %xmm1
1308; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
1309; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
1310; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[2,0]
1311; SSE-NEXT:    por %xmm7, %xmm14
1312; SSE-NEXT:    movdqa %xmm14, %xmm0
1313; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1314; SSE-NEXT:    punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm1[8],xmm14[9],xmm1[9],xmm14[10],xmm1[10],xmm14[11],xmm1[11],xmm14[12],xmm1[12],xmm14[13],xmm1[13],xmm14[14],xmm1[14],xmm14[15],xmm1[15]
1315; SSE-NEXT:    shufps {{.*#+}} xmm14 = xmm14[3,1],xmm0[2,0]
1316; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,6,5,6,7]
1317; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3]
1318; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,0,1,2,4,5,6,7]
1319; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
1320; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
1321; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6]
1322; SSE-NEXT:    packuswb %xmm1, %xmm0
1323; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,0,0,65535,65535,65535,65535,65535]
1324; SSE-NEXT:    pand %xmm1, %xmm0
1325; SSE-NEXT:    pshufhw {{.*#+}} xmm7 = xmm3[0,1,2,3,4,5,6,5]
1326; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[3,1,2,0]
1327; SSE-NEXT:    pshuflw {{.*#+}} xmm7 = xmm7[3,0,1,2,4,5,6,7]
1328; SSE-NEXT:    pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,4,6,7]
1329; SSE-NEXT:    packuswb %xmm7, %xmm7
1330; SSE-NEXT:    pandn %xmm7, %xmm1
1331; SSE-NEXT:    movaps %xmm9, %xmm7
1332; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[2,0],xmm2[3,0]
1333; SSE-NEXT:    por %xmm1, %xmm0
1334; SSE-NEXT:    movaps %xmm2, %xmm1
1335; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm7[0,2]
1336; SSE-NEXT:    pand %xmm6, %xmm0
1337; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
1338; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
1339; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,5]
1340; SSE-NEXT:    packuswb %xmm1, %xmm1
1341; SSE-NEXT:    pandn %xmm1, %xmm6
1342; SSE-NEXT:    por %xmm0, %xmm6
1343; SSE-NEXT:    por %xmm11, %xmm12
1344; SSE-NEXT:    movdqa %xmm12, %xmm1
1345; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15]
1346; SSE-NEXT:    punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7]
1347; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm12[0,1,1,3]
1348; SSE-NEXT:    shufps {{.*#+}} xmm12 = xmm12[0,3],xmm1[1,2]
1349; SSE-NEXT:    movdqa %xmm15, %xmm1
1350; SSE-NEXT:    pand %xmm4, %xmm1
1351; SSE-NEXT:    pandn %xmm5, %xmm4
1352; SSE-NEXT:    por %xmm1, %xmm4
1353; SSE-NEXT:    movdqa %xmm4, %xmm1
1354; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
1355; SSE-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15]
1356; SSE-NEXT:    pand %xmm13, %xmm4
1357; SSE-NEXT:    pandn %xmm1, %xmm13
1358; SSE-NEXT:    shufps {{.*#+}} xmm12 = xmm12[0,2,3,1]
1359; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm12[3,1,2,3,4,5,6,7]
1360; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
1361; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1362; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[2,1,3,0,4,5,6,7]
1363; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
1364; SSE-NEXT:    packuswb %xmm1, %xmm0
1365; SSE-NEXT:    por %xmm4, %xmm13
1366; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,0,0,65535,65535]
1367; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm13[3,1,0,3,4,5,6,7]
1368; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,4]
1369; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1]
1370; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,0,4,5,6,7]
1371; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7]
1372; SSE-NEXT:    packuswb %xmm4, %xmm4
1373; SSE-NEXT:    pand %xmm3, %xmm4
1374; SSE-NEXT:    pandn %xmm0, %xmm3
1375; SSE-NEXT:    por %xmm3, %xmm4
1376; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
1377; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm9[0,2,2,3]
1378; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
1379; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7]
1380; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1381; SSE-NEXT:    packuswb %xmm1, %xmm2
1382; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,1]
1383; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1384; SSE-NEXT:    movaps %xmm0, (%rsi)
1385; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1386; SSE-NEXT:    movaps %xmm0, (%rdx)
1387; SSE-NEXT:    movdqa %xmm10, (%rcx)
1388; SSE-NEXT:    movdqa %xmm6, (%r8)
1389; SSE-NEXT:    movaps %xmm4, (%r9)
1390; SSE-NEXT:    retq
1391;
1392; AVX-LABEL: load_i8_stride5_vf16:
1393; AVX:       # %bb.0:
1394; AVX-NEXT:    vmovdqa (%rdi), %xmm0
1395; AVX-NEXT:    vmovdqa 16(%rdi), %xmm2
1396; AVX-NEXT:    vmovdqa 32(%rdi), %xmm1
1397; AVX-NEXT:    vmovdqa 48(%rdi), %xmm3
1398; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u]
1399; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u]
1400; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
1401; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm3[u,u,u,u,u,u,u],zero,zero,zero,xmm3[2,7,12,u,u,u]
1402; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,u,u,u,3,8,13],zero,zero,zero,xmm1[u,u,u]
1403; AVX-NEXT:    vpor %xmm5, %xmm6, %xmm5
1404; AVX-NEXT:    vmovq {{.*#+}} xmm6 = [255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
1405; AVX-NEXT:    vpblendvb %xmm6, %xmm4, %xmm5, %xmm4
1406; AVX-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128]
1407; AVX-NEXT:    vpshufb %xmm7, %xmm4, %xmm5
1408; AVX-NEXT:    vmovdqa 64(%rdi), %xmm4
1409; AVX-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[1,6,11]
1410; AVX-NEXT:    vpor %xmm5, %xmm8, %xmm5
1411; AVX-NEXT:    vpshufb {{.*#+}} xmm8 = xmm0[1,6,11],zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u]
1412; AVX-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm2[0,5,10,15,u,u,u,u,u,u,u,u,u]
1413; AVX-NEXT:    vpor %xmm8, %xmm9, %xmm8
1414; AVX-NEXT:    vpshufb {{.*#+}} xmm9 = xmm3[u,u,u,u,u,u,u],zero,zero,zero,xmm3[3,8,13,u,u,u]
1415; AVX-NEXT:    vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,u,4,9,14],zero,zero,zero,xmm1[u,u,u]
1416; AVX-NEXT:    vpor %xmm9, %xmm10, %xmm9
1417; AVX-NEXT:    vpblendvb %xmm6, %xmm8, %xmm9, %xmm6
1418; AVX-NEXT:    vpshufb %xmm7, %xmm6, %xmm6
1419; AVX-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[2,7,12]
1420; AVX-NEXT:    vpor %xmm6, %xmm8, %xmm6
1421; AVX-NEXT:    vmovdqa {{.*#+}} xmm8 = [128,128,128,1,6,11,128,128,128,128,4,9,14,u,u,u]
1422; AVX-NEXT:    vpshufb %xmm8, %xmm2, %xmm9
1423; AVX-NEXT:    vpshufb %xmm8, %xmm3, %xmm8
1424; AVX-NEXT:    vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3,4,5,6,7]
1425; AVX-NEXT:    vmovdqa {{.*#+}} xmm9 = [2,7,12,128,128,128,0,5,10,15,128,128,128,u,u,u]
1426; AVX-NEXT:    vpshufb %xmm9, %xmm0, %xmm10
1427; AVX-NEXT:    vpshufb %xmm9, %xmm1, %xmm9
1428; AVX-NEXT:    vpblendw {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3,4,5,6,7]
1429; AVX-NEXT:    vpor %xmm8, %xmm9, %xmm8
1430; AVX-NEXT:    vpshufb %xmm7, %xmm8, %xmm8
1431; AVX-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[3,8,13]
1432; AVX-NEXT:    vpor %xmm9, %xmm8, %xmm8
1433; AVX-NEXT:    vmovdqa {{.*#+}} xmm9 = [128,128,128,2,7,12,1,6,11,128,128,128,128,u,u,u]
1434; AVX-NEXT:    vpshufb %xmm9, %xmm2, %xmm10
1435; AVX-NEXT:    vpshufb %xmm9, %xmm1, %xmm9
1436; AVX-NEXT:    vpblendw {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3,4,5,6,7]
1437; AVX-NEXT:    vmovdqa {{.*#+}} xmm10 = [3,8,13,128,128,128,128,128,128,0,5,10,15,u,u,u]
1438; AVX-NEXT:    vpshufb %xmm10, %xmm0, %xmm11
1439; AVX-NEXT:    vpshufb %xmm10, %xmm3, %xmm10
1440; AVX-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3,4,5,6,7]
1441; AVX-NEXT:    vpor %xmm9, %xmm10, %xmm9
1442; AVX-NEXT:    vpshufb %xmm7, %xmm9, %xmm7
1443; AVX-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[4,9,14]
1444; AVX-NEXT:    vpor %xmm7, %xmm9, %xmm7
1445; AVX-NEXT:    vmovdqa {{.*#+}} xmm9 = [128,128,128,3,8,13,128,128,128,1,6,11,u,u,u,u]
1446; AVX-NEXT:    vpshufb %xmm9, %xmm3, %xmm3
1447; AVX-NEXT:    vpshufb %xmm9, %xmm2, %xmm2
1448; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5],xmm2[6,7]
1449; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,9,14,128,128,128,2,7,12,128,128,128,u,u,u,u]
1450; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
1451; AVX-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
1452; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5],xmm0[6,7]
1453; AVX-NEXT:    vpor %xmm2, %xmm0, %xmm0
1454; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15]
1455; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
1456; AVX-NEXT:    vmovdqa %xmm5, (%rsi)
1457; AVX-NEXT:    vmovdqa %xmm6, (%rdx)
1458; AVX-NEXT:    vmovdqa %xmm8, (%rcx)
1459; AVX-NEXT:    vmovdqa %xmm7, (%r8)
1460; AVX-NEXT:    vmovdqa %xmm0, (%r9)
1461; AVX-NEXT:    retq
1462;
1463; AVX2-LABEL: load_i8_stride5_vf16:
1464; AVX2:       # %bb.0:
1465; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
1466; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
1467; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
1468; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm2
1469; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
1470; AVX2-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[4,9,14],zero,zero,zero,xmm3[2,7,12,u,u,u]
1471; AVX2-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13],zero,zero,zero,xmm2[u,u,u]
1472; AVX2-NEXT:    vpor %xmm3, %xmm2, %xmm2
1473; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128]
1474; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm3
1475; AVX2-NEXT:    vmovdqa 64(%rdi), %xmm2
1476; AVX2-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[1,6,11]
1477; AVX2-NEXT:    vpor %xmm5, %xmm3, %xmm3
1478; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535]
1479; AVX2-NEXT:    vpblendvb %ymm5, %ymm0, %ymm1, %ymm5
1480; AVX2-NEXT:    vpshufb {{.*#+}} xmm6 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14],zero,zero,zero,xmm5[u,u,u]
1481; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm5
1482; AVX2-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13,u,u,u]
1483; AVX2-NEXT:    vpor %xmm6, %xmm5, %xmm5
1484; AVX2-NEXT:    vpshufb %xmm4, %xmm5, %xmm5
1485; AVX2-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[2,7,12]
1486; AVX2-NEXT:    vpor %xmm6, %xmm5, %xmm5
1487; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
1488; AVX2-NEXT:    vpblendvb %ymm6, %ymm1, %ymm0, %ymm6
1489; AVX2-NEXT:    vextracti128 $1, %ymm6, %xmm7
1490; AVX2-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u]
1491; AVX2-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,10,15],zero,zero,zero,xmm6[u,u,u]
1492; AVX2-NEXT:    vpor %xmm7, %xmm6, %xmm6
1493; AVX2-NEXT:    vpshufb %xmm4, %xmm6, %xmm6
1494; AVX2-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[3,8,13]
1495; AVX2-NEXT:    vpor %xmm7, %xmm6, %xmm6
1496; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535]
1497; AVX2-NEXT:    vpblendvb %ymm7, %ymm1, %ymm0, %ymm7
1498; AVX2-NEXT:    vpshufb {{.*#+}} xmm8 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u]
1499; AVX2-NEXT:    vextracti128 $1, %ymm7, %xmm7
1500; AVX2-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[2,7,12],zero,zero,zero,xmm7[0,5,10,15,u,u,u]
1501; AVX2-NEXT:    vpor %xmm7, %xmm8, %xmm7
1502; AVX2-NEXT:    vpshufb %xmm4, %xmm7, %xmm4
1503; AVX2-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[4,9,14]
1504; AVX2-NEXT:    vpor %xmm7, %xmm4, %xmm4
1505; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0]
1506; AVX2-NEXT:    vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
1507; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1508; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u]
1509; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,12],zero,zero,zero,xmm0[u,u,u,u]
1510; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
1511; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm2[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15]
1512; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
1513; AVX2-NEXT:    vmovdqa %xmm3, (%rsi)
1514; AVX2-NEXT:    vmovdqa %xmm5, (%rdx)
1515; AVX2-NEXT:    vmovdqa %xmm6, (%rcx)
1516; AVX2-NEXT:    vmovdqa %xmm4, (%r8)
1517; AVX2-NEXT:    vmovdqa %xmm0, (%r9)
1518; AVX2-NEXT:    vzeroupper
1519; AVX2-NEXT:    retq
1520;
1521; AVX2-FP-LABEL: load_i8_stride5_vf16:
1522; AVX2-FP:       # %bb.0:
1523; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm0
1524; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm1
1525; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
1526; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm2
1527; AVX2-FP-NEXT:    vextracti128 $1, %ymm2, %xmm3
1528; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[4,9,14],zero,zero,zero,xmm3[2,7,12,u,u,u]
1529; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13],zero,zero,zero,xmm2[u,u,u]
1530; AVX2-FP-NEXT:    vpor %xmm3, %xmm2, %xmm2
1531; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128]
1532; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm2, %xmm3
1533; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %xmm2
1534; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[1,6,11]
1535; AVX2-FP-NEXT:    vpor %xmm5, %xmm3, %xmm3
1536; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535]
1537; AVX2-FP-NEXT:    vpblendvb %ymm5, %ymm0, %ymm1, %ymm5
1538; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14],zero,zero,zero,xmm5[u,u,u]
1539; AVX2-FP-NEXT:    vextracti128 $1, %ymm5, %xmm5
1540; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13,u,u,u]
1541; AVX2-FP-NEXT:    vpor %xmm6, %xmm5, %xmm5
1542; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm5, %xmm5
1543; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[2,7,12]
1544; AVX2-FP-NEXT:    vpor %xmm6, %xmm5, %xmm5
1545; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
1546; AVX2-FP-NEXT:    vpblendvb %ymm6, %ymm1, %ymm0, %ymm6
1547; AVX2-FP-NEXT:    vextracti128 $1, %ymm6, %xmm7
1548; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u]
1549; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,10,15],zero,zero,zero,xmm6[u,u,u]
1550; AVX2-FP-NEXT:    vpor %xmm7, %xmm6, %xmm6
1551; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm6, %xmm6
1552; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[3,8,13]
1553; AVX2-FP-NEXT:    vpor %xmm7, %xmm6, %xmm6
1554; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535]
1555; AVX2-FP-NEXT:    vpblendvb %ymm7, %ymm1, %ymm0, %ymm7
1556; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u]
1557; AVX2-FP-NEXT:    vextracti128 $1, %ymm7, %xmm7
1558; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[2,7,12],zero,zero,zero,xmm7[0,5,10,15,u,u,u]
1559; AVX2-FP-NEXT:    vpor %xmm7, %xmm8, %xmm7
1560; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm7, %xmm4
1561; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[4,9,14]
1562; AVX2-FP-NEXT:    vpor %xmm7, %xmm4, %xmm4
1563; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0]
1564; AVX2-FP-NEXT:    vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
1565; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm1
1566; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u]
1567; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,12],zero,zero,zero,xmm0[u,u,u,u]
1568; AVX2-FP-NEXT:    vpor %xmm1, %xmm0, %xmm0
1569; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm2[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15]
1570; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
1571; AVX2-FP-NEXT:    vmovdqa %xmm3, (%rsi)
1572; AVX2-FP-NEXT:    vmovdqa %xmm5, (%rdx)
1573; AVX2-FP-NEXT:    vmovdqa %xmm6, (%rcx)
1574; AVX2-FP-NEXT:    vmovdqa %xmm4, (%r8)
1575; AVX2-FP-NEXT:    vmovdqa %xmm0, (%r9)
1576; AVX2-FP-NEXT:    vzeroupper
1577; AVX2-FP-NEXT:    retq
1578;
1579; AVX2-FCP-LABEL: load_i8_stride5_vf16:
1580; AVX2-FCP:       # %bb.0:
1581; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm0
1582; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
1583; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
1584; AVX2-FCP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm2
1585; AVX2-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
1586; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[4,9,14],zero,zero,zero,xmm3[2,7,12,u,u,u]
1587; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13],zero,zero,zero,xmm2[u,u,u]
1588; AVX2-FCP-NEXT:    vpor %xmm3, %xmm2, %xmm2
1589; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128]
1590; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm2, %xmm3
1591; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %xmm2
1592; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[1,6,11]
1593; AVX2-FCP-NEXT:    vpor %xmm5, %xmm3, %xmm3
1594; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535]
1595; AVX2-FCP-NEXT:    vpblendvb %ymm5, %ymm0, %ymm1, %ymm5
1596; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14],zero,zero,zero,xmm5[u,u,u]
1597; AVX2-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm5
1598; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13,u,u,u]
1599; AVX2-FCP-NEXT:    vpor %xmm6, %xmm5, %xmm5
1600; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm5, %xmm5
1601; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[2,7,12]
1602; AVX2-FCP-NEXT:    vpor %xmm6, %xmm5, %xmm5
1603; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
1604; AVX2-FCP-NEXT:    vpblendvb %ymm6, %ymm1, %ymm0, %ymm6
1605; AVX2-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm7
1606; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u]
1607; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,10,15],zero,zero,zero,xmm6[u,u,u]
1608; AVX2-FCP-NEXT:    vpor %xmm7, %xmm6, %xmm6
1609; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm6, %xmm6
1610; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[3,8,13]
1611; AVX2-FCP-NEXT:    vpor %xmm7, %xmm6, %xmm6
1612; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535]
1613; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm1, %ymm0, %ymm7
1614; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u]
1615; AVX2-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm7
1616; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[2,7,12],zero,zero,zero,xmm7[0,5,10,15,u,u,u]
1617; AVX2-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm7
1618; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm7, %xmm4
1619; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[4,9,14]
1620; AVX2-FCP-NEXT:    vpor %xmm7, %xmm4, %xmm4
1621; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0]
1622; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
1623; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
1624; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u]
1625; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,12],zero,zero,zero,xmm0[u,u,u,u]
1626; AVX2-FCP-NEXT:    vpor %xmm1, %xmm0, %xmm0
1627; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm2[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15]
1628; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
1629; AVX2-FCP-NEXT:    vmovdqa %xmm3, (%rsi)
1630; AVX2-FCP-NEXT:    vmovdqa %xmm5, (%rdx)
1631; AVX2-FCP-NEXT:    vmovdqa %xmm6, (%rcx)
1632; AVX2-FCP-NEXT:    vmovdqa %xmm4, (%r8)
1633; AVX2-FCP-NEXT:    vmovdqa %xmm0, (%r9)
1634; AVX2-FCP-NEXT:    vzeroupper
1635; AVX2-FCP-NEXT:    retq
1636;
1637; AVX512-LABEL: load_i8_stride5_vf16:
1638; AVX512:       # %bb.0:
1639; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535]
1640; AVX512-NEXT:    vmovdqa (%rdi), %ymm4
1641; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm5
1642; AVX512-NEXT:    vmovdqa %ymm1, %ymm0
1643; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm5 ^ (ymm0 & (ymm4 ^ ymm5))
1644; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
1645; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12,u,u,u]
1646; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[u,u,u]
1647; AVX512-NEXT:    vpor %xmm2, %xmm0, %xmm0
1648; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128]
1649; AVX512-NEXT:    vpshufb %xmm3, %xmm0, %xmm2
1650; AVX512-NEXT:    vmovdqa 64(%rdi), %xmm0
1651; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1,6,11]
1652; AVX512-NEXT:    vpor %xmm6, %xmm2, %xmm6
1653; AVX512-NEXT:    vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535]
1654; AVX512-NEXT:    vmovdqa %ymm2, %ymm7
1655; AVX512-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm5 ^ (ymm7 & (ymm4 ^ ymm5))
1656; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14],zero,zero,zero,xmm7[u,u,u]
1657; AVX512-NEXT:    vextracti128 $1, %ymm7, %xmm7
1658; AVX512-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13,u,u,u]
1659; AVX512-NEXT:    vpor %xmm7, %xmm8, %xmm7
1660; AVX512-NEXT:    vpshufb %xmm3, %xmm7, %xmm7
1661; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[2,7,12]
1662; AVX512-NEXT:    vpor %xmm7, %xmm8, %xmm7
1663; AVX512-NEXT:    vmovdqa {{.*#+}} ymm8 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535]
1664; AVX512-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm4 ^ (ymm8 & (ymm5 ^ ymm4))
1665; AVX512-NEXT:    vextracti128 $1, %ymm8, %xmm9
1666; AVX512-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[1,6,11],zero,zero,zero,zero,xmm9[4,9,14,u,u,u]
1667; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[2,7,12],zero,zero,zero,xmm8[0,5,10,15],zero,zero,zero,xmm8[u,u,u]
1668; AVX512-NEXT:    vpor %xmm9, %xmm8, %xmm8
1669; AVX512-NEXT:    vpshufb %xmm3, %xmm8, %xmm8
1670; AVX512-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[3,8,13]
1671; AVX512-NEXT:    vpor %xmm9, %xmm8, %xmm8
1672; AVX512-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm4 ^ (ymm1 & (ymm5 ^ ymm4))
1673; AVX512-NEXT:    vpshufb {{.*#+}} xmm9 = xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[u,u,u]
1674; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm1
1675; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[2,7,12],zero,zero,zero,xmm1[0,5,10,15,u,u,u]
1676; AVX512-NEXT:    vpor %xmm1, %xmm9, %xmm1
1677; AVX512-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
1678; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[4,9,14]
1679; AVX512-NEXT:    vpor %xmm3, %xmm1, %xmm1
1680; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm4 ^ (ymm2 & (ymm5 ^ ymm4))
1681; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
1682; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u]
1683; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u]
1684; AVX512-NEXT:    vpor %xmm3, %xmm2, %xmm2
1685; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15]
1686; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
1687; AVX512-NEXT:    vmovdqa %xmm6, (%rsi)
1688; AVX512-NEXT:    vmovdqa %xmm7, (%rdx)
1689; AVX512-NEXT:    vmovdqa %xmm8, (%rcx)
1690; AVX512-NEXT:    vmovdqa %xmm1, (%r8)
1691; AVX512-NEXT:    vmovdqa %xmm0, (%r9)
1692; AVX512-NEXT:    vzeroupper
1693; AVX512-NEXT:    retq
1694;
1695; AVX512-FCP-LABEL: load_i8_stride5_vf16:
1696; AVX512-FCP:       # %bb.0:
1697; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535]
1698; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm4
1699; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm5
1700; AVX512-FCP-NEXT:    vmovdqa %ymm1, %ymm0
1701; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm5 ^ (ymm0 & (ymm4 ^ ymm5))
1702; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm2
1703; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12,u,u,u]
1704; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[u,u,u]
1705; AVX512-FCP-NEXT:    vpor %xmm2, %xmm0, %xmm0
1706; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128]
1707; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm0, %xmm2
1708; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %xmm0
1709; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1,6,11]
1710; AVX512-FCP-NEXT:    vpor %xmm6, %xmm2, %xmm6
1711; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535]
1712; AVX512-FCP-NEXT:    vmovdqa %ymm2, %ymm7
1713; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm5 ^ (ymm7 & (ymm4 ^ ymm5))
1714; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14],zero,zero,zero,xmm7[u,u,u]
1715; AVX512-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm7
1716; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13,u,u,u]
1717; AVX512-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm7
1718; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm7, %xmm7
1719; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[2,7,12]
1720; AVX512-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm7
1721; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535]
1722; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm4 ^ (ymm8 & (ymm5 ^ ymm4))
1723; AVX512-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm9
1724; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[1,6,11],zero,zero,zero,zero,xmm9[4,9,14,u,u,u]
1725; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[2,7,12],zero,zero,zero,xmm8[0,5,10,15],zero,zero,zero,xmm8[u,u,u]
1726; AVX512-FCP-NEXT:    vpor %xmm9, %xmm8, %xmm8
1727; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm8, %xmm8
1728; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[3,8,13]
1729; AVX512-FCP-NEXT:    vpor %xmm9, %xmm8, %xmm8
1730; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm4 ^ (ymm1 & (ymm5 ^ ymm4))
1731; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[u,u,u]
1732; AVX512-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm1
1733; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[2,7,12],zero,zero,zero,xmm1[0,5,10,15,u,u,u]
1734; AVX512-FCP-NEXT:    vpor %xmm1, %xmm9, %xmm1
1735; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
1736; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[4,9,14]
1737; AVX512-FCP-NEXT:    vpor %xmm3, %xmm1, %xmm1
1738; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm4 ^ (ymm2 & (ymm5 ^ ymm4))
1739; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
1740; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u]
1741; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u]
1742; AVX512-FCP-NEXT:    vpor %xmm3, %xmm2, %xmm2
1743; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15]
1744; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
1745; AVX512-FCP-NEXT:    vmovdqa %xmm6, (%rsi)
1746; AVX512-FCP-NEXT:    vmovdqa %xmm7, (%rdx)
1747; AVX512-FCP-NEXT:    vmovdqa %xmm8, (%rcx)
1748; AVX512-FCP-NEXT:    vmovdqa %xmm1, (%r8)
1749; AVX512-FCP-NEXT:    vmovdqa %xmm0, (%r9)
1750; AVX512-FCP-NEXT:    vzeroupper
1751; AVX512-FCP-NEXT:    retq
1752;
1753; AVX512DQ-LABEL: load_i8_stride5_vf16:
1754; AVX512DQ:       # %bb.0:
1755; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535]
1756; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm4
1757; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm5
1758; AVX512DQ-NEXT:    vmovdqa %ymm1, %ymm0
1759; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm5 ^ (ymm0 & (ymm4 ^ ymm5))
1760; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm2
1761; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12,u,u,u]
1762; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[u,u,u]
1763; AVX512DQ-NEXT:    vpor %xmm2, %xmm0, %xmm0
1764; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128]
1765; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm0, %xmm2
1766; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %xmm0
1767; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1,6,11]
1768; AVX512DQ-NEXT:    vpor %xmm6, %xmm2, %xmm6
1769; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535]
1770; AVX512DQ-NEXT:    vmovdqa %ymm2, %ymm7
1771; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm5 ^ (ymm7 & (ymm4 ^ ymm5))
1772; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm8 = xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14],zero,zero,zero,xmm7[u,u,u]
1773; AVX512DQ-NEXT:    vextracti128 $1, %ymm7, %xmm7
1774; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13,u,u,u]
1775; AVX512DQ-NEXT:    vpor %xmm7, %xmm8, %xmm7
1776; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm7, %xmm7
1777; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[2,7,12]
1778; AVX512DQ-NEXT:    vpor %xmm7, %xmm8, %xmm7
1779; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm8 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535]
1780; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm4 ^ (ymm8 & (ymm5 ^ ymm4))
1781; AVX512DQ-NEXT:    vextracti128 $1, %ymm8, %xmm9
1782; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[1,6,11],zero,zero,zero,zero,xmm9[4,9,14,u,u,u]
1783; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[2,7,12],zero,zero,zero,xmm8[0,5,10,15],zero,zero,zero,xmm8[u,u,u]
1784; AVX512DQ-NEXT:    vpor %xmm9, %xmm8, %xmm8
1785; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm8, %xmm8
1786; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[3,8,13]
1787; AVX512DQ-NEXT:    vpor %xmm9, %xmm8, %xmm8
1788; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm4 ^ (ymm1 & (ymm5 ^ ymm4))
1789; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm9 = xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[u,u,u]
1790; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm1
1791; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[2,7,12],zero,zero,zero,xmm1[0,5,10,15,u,u,u]
1792; AVX512DQ-NEXT:    vpor %xmm1, %xmm9, %xmm1
1793; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
1794; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[4,9,14]
1795; AVX512DQ-NEXT:    vpor %xmm3, %xmm1, %xmm1
1796; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm4 ^ (ymm2 & (ymm5 ^ ymm4))
1797; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
1798; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u]
1799; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u]
1800; AVX512DQ-NEXT:    vpor %xmm3, %xmm2, %xmm2
1801; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15]
1802; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
1803; AVX512DQ-NEXT:    vmovdqa %xmm6, (%rsi)
1804; AVX512DQ-NEXT:    vmovdqa %xmm7, (%rdx)
1805; AVX512DQ-NEXT:    vmovdqa %xmm8, (%rcx)
1806; AVX512DQ-NEXT:    vmovdqa %xmm1, (%r8)
1807; AVX512DQ-NEXT:    vmovdqa %xmm0, (%r9)
1808; AVX512DQ-NEXT:    vzeroupper
1809; AVX512DQ-NEXT:    retq
1810;
1811; AVX512DQ-FCP-LABEL: load_i8_stride5_vf16:
1812; AVX512DQ-FCP:       # %bb.0:
1813; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535]
1814; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm4
1815; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm5
1816; AVX512DQ-FCP-NEXT:    vmovdqa %ymm1, %ymm0
1817; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm5 ^ (ymm0 & (ymm4 ^ ymm5))
1818; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm2
1819; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12,u,u,u]
1820; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[u,u,u]
1821; AVX512DQ-FCP-NEXT:    vpor %xmm2, %xmm0, %xmm0
1822; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128]
1823; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm0, %xmm2
1824; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %xmm0
1825; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1,6,11]
1826; AVX512DQ-FCP-NEXT:    vpor %xmm6, %xmm2, %xmm6
1827; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535]
1828; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, %ymm7
1829; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm5 ^ (ymm7 & (ymm4 ^ ymm5))
1830; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14],zero,zero,zero,xmm7[u,u,u]
1831; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm7
1832; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13,u,u,u]
1833; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm7
1834; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm7, %xmm7
1835; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[2,7,12]
1836; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm7
1837; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535]
1838; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm4 ^ (ymm8 & (ymm5 ^ ymm4))
1839; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm9
1840; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[1,6,11],zero,zero,zero,zero,xmm9[4,9,14,u,u,u]
1841; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[2,7,12],zero,zero,zero,xmm8[0,5,10,15],zero,zero,zero,xmm8[u,u,u]
1842; AVX512DQ-FCP-NEXT:    vpor %xmm9, %xmm8, %xmm8
1843; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm8, %xmm8
1844; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[3,8,13]
1845; AVX512DQ-FCP-NEXT:    vpor %xmm9, %xmm8, %xmm8
1846; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm4 ^ (ymm1 & (ymm5 ^ ymm4))
1847; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[u,u,u]
1848; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm1
1849; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[2,7,12],zero,zero,zero,xmm1[0,5,10,15,u,u,u]
1850; AVX512DQ-FCP-NEXT:    vpor %xmm1, %xmm9, %xmm1
1851; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
1852; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[4,9,14]
1853; AVX512DQ-FCP-NEXT:    vpor %xmm3, %xmm1, %xmm1
1854; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm4 ^ (ymm2 & (ymm5 ^ ymm4))
1855; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
1856; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u]
1857; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u]
1858; AVX512DQ-FCP-NEXT:    vpor %xmm3, %xmm2, %xmm2
1859; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15]
1860; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
1861; AVX512DQ-FCP-NEXT:    vmovdqa %xmm6, (%rsi)
1862; AVX512DQ-FCP-NEXT:    vmovdqa %xmm7, (%rdx)
1863; AVX512DQ-FCP-NEXT:    vmovdqa %xmm8, (%rcx)
1864; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, (%r8)
1865; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, (%r9)
1866; AVX512DQ-FCP-NEXT:    vzeroupper
1867; AVX512DQ-FCP-NEXT:    retq
1868;
1869; AVX512BW-LABEL: load_i8_stride5_vf16:
1870; AVX512BW:       # %bb.0:
1871; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
1872; AVX512BW-NEXT:    vmovdqa 32(%rdi), %ymm1
1873; AVX512BW-NEXT:    movw $19026, %ax # imm = 0x4A52
1874; AVX512BW-NEXT:    kmovd %eax, %k1
1875; AVX512BW-NEXT:    vpblendmw %ymm1, %ymm0, %ymm2 {%k1}
1876; AVX512BW-NEXT:    vextracti128 $1, %ymm2, %xmm3
1877; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[4,9,14],zero,zero,zero,xmm3[2,7,12,u,u,u]
1878; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13],zero,zero,zero,xmm2[u,u,u]
1879; AVX512BW-NEXT:    vpor %xmm3, %xmm2, %xmm2
1880; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128]
1881; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
1882; AVX512BW-NEXT:    vmovdqa 64(%rdi), %xmm4
1883; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[1,6,11]
1884; AVX512BW-NEXT:    vpor %xmm5, %xmm2, %xmm2
1885; AVX512BW-NEXT:    movw $21140, %ax # imm = 0x5294
1886; AVX512BW-NEXT:    kmovd %eax, %k2
1887; AVX512BW-NEXT:    vpblendmw %ymm1, %ymm0, %ymm5 {%k2}
1888; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm6 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14],zero,zero,zero,xmm5[u,u,u]
1889; AVX512BW-NEXT:    vextracti128 $1, %ymm5, %xmm5
1890; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13,u,u,u]
1891; AVX512BW-NEXT:    vpor %xmm6, %xmm5, %xmm5
1892; AVX512BW-NEXT:    vpshufb %xmm3, %xmm5, %xmm5
1893; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[2,7,12]
1894; AVX512BW-NEXT:    vpor %xmm6, %xmm5, %xmm5
1895; AVX512BW-NEXT:    movw $10570, %ax # imm = 0x294A
1896; AVX512BW-NEXT:    kmovd %eax, %k3
1897; AVX512BW-NEXT:    vpblendmw %ymm0, %ymm1, %ymm6 {%k3}
1898; AVX512BW-NEXT:    vextracti128 $1, %ymm6, %xmm7
1899; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u]
1900; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,10,15],zero,zero,zero,xmm6[u,u,u]
1901; AVX512BW-NEXT:    vpor %xmm7, %xmm6, %xmm6
1902; AVX512BW-NEXT:    vpshufb %xmm3, %xmm6, %xmm6
1903; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[3,8,13]
1904; AVX512BW-NEXT:    vpor %xmm7, %xmm6, %xmm6
1905; AVX512BW-NEXT:    vpblendmw %ymm0, %ymm1, %ymm7 {%k1}
1906; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm8 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u]
1907; AVX512BW-NEXT:    vextracti128 $1, %ymm7, %xmm7
1908; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[2,7,12],zero,zero,zero,xmm7[0,5,10,15,u,u,u]
1909; AVX512BW-NEXT:    vpor %xmm7, %xmm8, %xmm7
1910; AVX512BW-NEXT:    vpshufb %xmm3, %xmm7, %xmm3
1911; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[4,9,14]
1912; AVX512BW-NEXT:    vpor %xmm7, %xmm3, %xmm3
1913; AVX512BW-NEXT:    vmovdqu16 %ymm0, %ymm1 {%k2}
1914; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm0
1915; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11,u,u,u,u]
1916; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,9,14],zero,zero,zero,xmm1[2,7,12],zero,zero,zero,xmm1[u,u,u,u]
1917; AVX512BW-NEXT:    vpor %xmm0, %xmm1, %xmm0
1918; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15]
1919; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
1920; AVX512BW-NEXT:    vmovdqa %xmm2, (%rsi)
1921; AVX512BW-NEXT:    vmovdqa %xmm5, (%rdx)
1922; AVX512BW-NEXT:    vmovdqa %xmm6, (%rcx)
1923; AVX512BW-NEXT:    vmovdqa %xmm3, (%r8)
1924; AVX512BW-NEXT:    vmovdqa %xmm0, (%r9)
1925; AVX512BW-NEXT:    vzeroupper
1926; AVX512BW-NEXT:    retq
1927;
1928; AVX512BW-FCP-LABEL: load_i8_stride5_vf16:
1929; AVX512BW-FCP:       # %bb.0:
1930; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %ymm0
1931; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
1932; AVX512BW-FCP-NEXT:    movw $19026, %ax # imm = 0x4A52
1933; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
1934; AVX512BW-FCP-NEXT:    vpblendmw %ymm1, %ymm0, %ymm2 {%k1}
1935; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
1936; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[4,9,14],zero,zero,zero,xmm3[2,7,12,u,u,u]
1937; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13],zero,zero,zero,xmm2[u,u,u]
1938; AVX512BW-FCP-NEXT:    vpor %xmm3, %xmm2, %xmm2
1939; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128]
1940; AVX512BW-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
1941; AVX512BW-FCP-NEXT:    vmovdqa 64(%rdi), %xmm4
1942; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[1,6,11]
1943; AVX512BW-FCP-NEXT:    vpor %xmm5, %xmm2, %xmm2
1944; AVX512BW-FCP-NEXT:    movw $21140, %ax # imm = 0x5294
1945; AVX512BW-FCP-NEXT:    kmovd %eax, %k2
1946; AVX512BW-FCP-NEXT:    vpblendmw %ymm1, %ymm0, %ymm5 {%k2}
1947; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14],zero,zero,zero,xmm5[u,u,u]
1948; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm5
1949; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13,u,u,u]
1950; AVX512BW-FCP-NEXT:    vpor %xmm6, %xmm5, %xmm5
1951; AVX512BW-FCP-NEXT:    vpshufb %xmm3, %xmm5, %xmm5
1952; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[2,7,12]
1953; AVX512BW-FCP-NEXT:    vpor %xmm6, %xmm5, %xmm5
1954; AVX512BW-FCP-NEXT:    movw $10570, %ax # imm = 0x294A
1955; AVX512BW-FCP-NEXT:    kmovd %eax, %k3
1956; AVX512BW-FCP-NEXT:    vpblendmw %ymm0, %ymm1, %ymm6 {%k3}
1957; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm7
1958; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u]
1959; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,10,15],zero,zero,zero,xmm6[u,u,u]
1960; AVX512BW-FCP-NEXT:    vpor %xmm7, %xmm6, %xmm6
1961; AVX512BW-FCP-NEXT:    vpshufb %xmm3, %xmm6, %xmm6
1962; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[3,8,13]
1963; AVX512BW-FCP-NEXT:    vpor %xmm7, %xmm6, %xmm6
1964; AVX512BW-FCP-NEXT:    vpblendmw %ymm0, %ymm1, %ymm7 {%k1}
1965; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u]
1966; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm7
1967; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[2,7,12],zero,zero,zero,xmm7[0,5,10,15,u,u,u]
1968; AVX512BW-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm7
1969; AVX512BW-FCP-NEXT:    vpshufb %xmm3, %xmm7, %xmm3
1970; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[4,9,14]
1971; AVX512BW-FCP-NEXT:    vpor %xmm7, %xmm3, %xmm3
1972; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm0, %ymm1 {%k2}
1973; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm0
1974; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11,u,u,u,u]
1975; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,9,14],zero,zero,zero,xmm1[2,7,12],zero,zero,zero,xmm1[u,u,u,u]
1976; AVX512BW-FCP-NEXT:    vpor %xmm0, %xmm1, %xmm0
1977; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15]
1978; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
1979; AVX512BW-FCP-NEXT:    vmovdqa %xmm2, (%rsi)
1980; AVX512BW-FCP-NEXT:    vmovdqa %xmm5, (%rdx)
1981; AVX512BW-FCP-NEXT:    vmovdqa %xmm6, (%rcx)
1982; AVX512BW-FCP-NEXT:    vmovdqa %xmm3, (%r8)
1983; AVX512BW-FCP-NEXT:    vmovdqa %xmm0, (%r9)
1984; AVX512BW-FCP-NEXT:    vzeroupper
1985; AVX512BW-FCP-NEXT:    retq
1986;
1987; AVX512DQ-BW-LABEL: load_i8_stride5_vf16:
1988; AVX512DQ-BW:       # %bb.0:
1989; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %ymm0
1990; AVX512DQ-BW-NEXT:    vmovdqa 32(%rdi), %ymm1
1991; AVX512DQ-BW-NEXT:    movw $19026, %ax # imm = 0x4A52
1992; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
1993; AVX512DQ-BW-NEXT:    vpblendmw %ymm1, %ymm0, %ymm2 {%k1}
1994; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm2, %xmm3
1995; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[4,9,14],zero,zero,zero,xmm3[2,7,12,u,u,u]
1996; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13],zero,zero,zero,xmm2[u,u,u]
1997; AVX512DQ-BW-NEXT:    vpor %xmm3, %xmm2, %xmm2
1998; AVX512DQ-BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128]
1999; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
2000; AVX512DQ-BW-NEXT:    vmovdqa 64(%rdi), %xmm4
2001; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[1,6,11]
2002; AVX512DQ-BW-NEXT:    vpor %xmm5, %xmm2, %xmm2
2003; AVX512DQ-BW-NEXT:    movw $21140, %ax # imm = 0x5294
2004; AVX512DQ-BW-NEXT:    kmovd %eax, %k2
2005; AVX512DQ-BW-NEXT:    vpblendmw %ymm1, %ymm0, %ymm5 {%k2}
2006; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm6 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14],zero,zero,zero,xmm5[u,u,u]
2007; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm5, %xmm5
2008; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13,u,u,u]
2009; AVX512DQ-BW-NEXT:    vpor %xmm6, %xmm5, %xmm5
2010; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm5, %xmm5
2011; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[2,7,12]
2012; AVX512DQ-BW-NEXT:    vpor %xmm6, %xmm5, %xmm5
2013; AVX512DQ-BW-NEXT:    movw $10570, %ax # imm = 0x294A
2014; AVX512DQ-BW-NEXT:    kmovd %eax, %k3
2015; AVX512DQ-BW-NEXT:    vpblendmw %ymm0, %ymm1, %ymm6 {%k3}
2016; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm6, %xmm7
2017; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u]
2018; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,10,15],zero,zero,zero,xmm6[u,u,u]
2019; AVX512DQ-BW-NEXT:    vpor %xmm7, %xmm6, %xmm6
2020; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm6, %xmm6
2021; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[3,8,13]
2022; AVX512DQ-BW-NEXT:    vpor %xmm7, %xmm6, %xmm6
2023; AVX512DQ-BW-NEXT:    vpblendmw %ymm0, %ymm1, %ymm7 {%k1}
2024; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm8 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u]
2025; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm7, %xmm7
2026; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[2,7,12],zero,zero,zero,xmm7[0,5,10,15,u,u,u]
2027; AVX512DQ-BW-NEXT:    vpor %xmm7, %xmm8, %xmm7
2028; AVX512DQ-BW-NEXT:    vpshufb %xmm3, %xmm7, %xmm3
2029; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[4,9,14]
2030; AVX512DQ-BW-NEXT:    vpor %xmm7, %xmm3, %xmm3
2031; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm0, %ymm1 {%k2}
2032; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm1, %xmm0
2033; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11,u,u,u,u]
2034; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,9,14],zero,zero,zero,xmm1[2,7,12],zero,zero,zero,xmm1[u,u,u,u]
2035; AVX512DQ-BW-NEXT:    vpor %xmm0, %xmm1, %xmm0
2036; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15]
2037; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
2038; AVX512DQ-BW-NEXT:    vmovdqa %xmm2, (%rsi)
2039; AVX512DQ-BW-NEXT:    vmovdqa %xmm5, (%rdx)
2040; AVX512DQ-BW-NEXT:    vmovdqa %xmm6, (%rcx)
2041; AVX512DQ-BW-NEXT:    vmovdqa %xmm3, (%r8)
2042; AVX512DQ-BW-NEXT:    vmovdqa %xmm0, (%r9)
2043; AVX512DQ-BW-NEXT:    vzeroupper
2044; AVX512DQ-BW-NEXT:    retq
2045;
2046; AVX512DQ-BW-FCP-LABEL: load_i8_stride5_vf16:
2047; AVX512DQ-BW-FCP:       # %bb.0:
2048; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %ymm0
2049; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
2050; AVX512DQ-BW-FCP-NEXT:    movw $19026, %ax # imm = 0x4A52
2051; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
2052; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm1, %ymm0, %ymm2 {%k1}
2053; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
2054; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[4,9,14],zero,zero,zero,xmm3[2,7,12,u,u,u]
2055; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13],zero,zero,zero,xmm2[u,u,u]
2056; AVX512DQ-BW-FCP-NEXT:    vpor %xmm3, %xmm2, %xmm2
2057; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128]
2058; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
2059; AVX512DQ-BW-FCP-NEXT:    vmovdqa 64(%rdi), %xmm4
2060; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[1,6,11]
2061; AVX512DQ-BW-FCP-NEXT:    vpor %xmm5, %xmm2, %xmm2
2062; AVX512DQ-BW-FCP-NEXT:    movw $21140, %ax # imm = 0x5294
2063; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k2
2064; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm1, %ymm0, %ymm5 {%k2}
2065; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14],zero,zero,zero,xmm5[u,u,u]
2066; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm5
2067; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13,u,u,u]
2068; AVX512DQ-BW-FCP-NEXT:    vpor %xmm6, %xmm5, %xmm5
2069; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm3, %xmm5, %xmm5
2070; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[2,7,12]
2071; AVX512DQ-BW-FCP-NEXT:    vpor %xmm6, %xmm5, %xmm5
2072; AVX512DQ-BW-FCP-NEXT:    movw $10570, %ax # imm = 0x294A
2073; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k3
2074; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm0, %ymm1, %ymm6 {%k3}
2075; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm7
2076; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u]
2077; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,10,15],zero,zero,zero,xmm6[u,u,u]
2078; AVX512DQ-BW-FCP-NEXT:    vpor %xmm7, %xmm6, %xmm6
2079; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm3, %xmm6, %xmm6
2080; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[3,8,13]
2081; AVX512DQ-BW-FCP-NEXT:    vpor %xmm7, %xmm6, %xmm6
2082; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm0, %ymm1, %ymm7 {%k1}
2083; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u]
2084; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm7
2085; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[2,7,12],zero,zero,zero,xmm7[0,5,10,15,u,u,u]
2086; AVX512DQ-BW-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm7
2087; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm3, %xmm7, %xmm3
2088; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[4,9,14]
2089; AVX512DQ-BW-FCP-NEXT:    vpor %xmm7, %xmm3, %xmm3
2090; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm0, %ymm1 {%k2}
2091; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm0
2092; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11,u,u,u,u]
2093; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,9,14],zero,zero,zero,xmm1[2,7,12],zero,zero,zero,xmm1[u,u,u,u]
2094; AVX512DQ-BW-FCP-NEXT:    vpor %xmm0, %xmm1, %xmm0
2095; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15]
2096; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
2097; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm2, (%rsi)
2098; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm5, (%rdx)
2099; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm6, (%rcx)
2100; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm3, (%r8)
2101; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm0, (%r9)
2102; AVX512DQ-BW-FCP-NEXT:    vzeroupper
2103; AVX512DQ-BW-FCP-NEXT:    retq
2104  %wide.vec = load <80 x i8>, ptr %in.vec, align 64
2105  %strided.vec0 = shufflevector <80 x i8> %wide.vec, <80 x i8> poison, <16 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35, i32 40, i32 45, i32 50, i32 55, i32 60, i32 65, i32 70, i32 75>
2106  %strided.vec1 = shufflevector <80 x i8> %wide.vec, <80 x i8> poison, <16 x i32> <i32 1, i32 6, i32 11, i32 16, i32 21, i32 26, i32 31, i32 36, i32 41, i32 46, i32 51, i32 56, i32 61, i32 66, i32 71, i32 76>
2107  %strided.vec2 = shufflevector <80 x i8> %wide.vec, <80 x i8> poison, <16 x i32> <i32 2, i32 7, i32 12, i32 17, i32 22, i32 27, i32 32, i32 37, i32 42, i32 47, i32 52, i32 57, i32 62, i32 67, i32 72, i32 77>
2108  %strided.vec3 = shufflevector <80 x i8> %wide.vec, <80 x i8> poison, <16 x i32> <i32 3, i32 8, i32 13, i32 18, i32 23, i32 28, i32 33, i32 38, i32 43, i32 48, i32 53, i32 58, i32 63, i32 68, i32 73, i32 78>
2109  %strided.vec4 = shufflevector <80 x i8> %wide.vec, <80 x i8> poison, <16 x i32> <i32 4, i32 9, i32 14, i32 19, i32 24, i32 29, i32 34, i32 39, i32 44, i32 49, i32 54, i32 59, i32 64, i32 69, i32 74, i32 79>
2110  store <16 x i8> %strided.vec0, ptr %out.vec0, align 64
2111  store <16 x i8> %strided.vec1, ptr %out.vec1, align 64
2112  store <16 x i8> %strided.vec2, ptr %out.vec2, align 64
2113  store <16 x i8> %strided.vec3, ptr %out.vec3, align 64
2114  store <16 x i8> %strided.vec4, ptr %out.vec4, align 64
2115  ret void
2116}
2117
2118define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind {
2119; SSE-LABEL: load_i8_stride5_vf32:
2120; SSE:       # %bb.0:
2121; SSE-NEXT:    subq $184, %rsp
2122; SSE-NEXT:    movdqa (%rdi), %xmm9
2123; SSE-NEXT:    movdqa 16(%rdi), %xmm3
2124; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2125; SSE-NEXT:    movdqa 32(%rdi), %xmm1
2126; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2127; SSE-NEXT:    movdqa 48(%rdi), %xmm2
2128; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2129; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
2130; SSE-NEXT:    movdqa %xmm4, %xmm0
2131; SSE-NEXT:    pandn %xmm1, %xmm0
2132; SSE-NEXT:    movdqa %xmm2, %xmm1
2133; SSE-NEXT:    pand %xmm4, %xmm1
2134; SSE-NEXT:    por %xmm0, %xmm1
2135; SSE-NEXT:    pxor %xmm5, %xmm5
2136; SSE-NEXT:    movdqa %xmm1, %xmm0
2137; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
2138; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
2139; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
2140; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15]
2141; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
2142; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
2143; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
2144; SSE-NEXT:    packuswb %xmm1, %xmm0
2145; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,3]
2146; SSE-NEXT:    movdqa {{.*#+}} xmm13 = [255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255]
2147; SSE-NEXT:    movdqa %xmm13, %xmm0
2148; SSE-NEXT:    pandn %xmm1, %xmm0
2149; SSE-NEXT:    movdqa {{.*#+}} xmm15 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
2150; SSE-NEXT:    movdqa %xmm15, %xmm1
2151; SSE-NEXT:    pandn %xmm3, %xmm1
2152; SSE-NEXT:    movdqa {{.*#+}} xmm11 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
2153; SSE-NEXT:    pandn %xmm9, %xmm11
2154; SSE-NEXT:    movdqa {{.*#+}} xmm14 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255]
2155; SSE-NEXT:    movdqa %xmm14, %xmm2
2156; SSE-NEXT:    pandn %xmm9, %xmm2
2157; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2158; SSE-NEXT:    movdqa %xmm4, %xmm2
2159; SSE-NEXT:    pandn %xmm9, %xmm2
2160; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2161; SSE-NEXT:    movdqa %xmm15, %xmm2
2162; SSE-NEXT:    pandn %xmm9, %xmm2
2163; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2164; SSE-NEXT:    pand %xmm15, %xmm9
2165; SSE-NEXT:    por %xmm1, %xmm9
2166; SSE-NEXT:    movdqa %xmm9, %xmm2
2167; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
2168; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,0,0,65535,65535]
2169; SSE-NEXT:    movdqa %xmm1, %xmm6
2170; SSE-NEXT:    pandn %xmm2, %xmm6
2171; SSE-NEXT:    punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm5[8],xmm9[9],xmm5[9],xmm9[10],xmm5[10],xmm9[11],xmm5[11],xmm9[12],xmm5[12],xmm9[13],xmm5[13],xmm9[14],xmm5[14],xmm9[15],xmm5[15]
2172; SSE-NEXT:    pand %xmm1, %xmm9
2173; SSE-NEXT:    por %xmm6, %xmm9
2174; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm9[0,2,1,3,4,5,6,7]
2175; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7]
2176; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
2177; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,2,1,3,4,5,6,7]
2178; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7]
2179; SSE-NEXT:    packuswb %xmm2, %xmm2
2180; SSE-NEXT:    pand %xmm13, %xmm2
2181; SSE-NEXT:    por %xmm0, %xmm2
2182; SSE-NEXT:    movdqa 64(%rdi), %xmm6
2183; SSE-NEXT:    movdqa %xmm6, %xmm3
2184; SSE-NEXT:    pxor %xmm0, %xmm0
2185; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
2186; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2187; SSE-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
2188; SSE-NEXT:    movdqa %xmm6, %xmm0
2189; SSE-NEXT:    movdqa %xmm6, %xmm8
2190; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2191; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[0,0]
2192; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[2,3]
2193; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
2194; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
2195; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
2196; SSE-NEXT:    packuswb %xmm0, %xmm0
2197; SSE-NEXT:    movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0]
2198; SSE-NEXT:    movdqa %xmm9, %xmm6
2199; SSE-NEXT:    pandn %xmm0, %xmm6
2200; SSE-NEXT:    pand %xmm9, %xmm2
2201; SSE-NEXT:    por %xmm2, %xmm6
2202; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2203; SSE-NEXT:    movdqa 112(%rdi), %xmm10
2204; SSE-NEXT:    movdqa %xmm4, %xmm0
2205; SSE-NEXT:    pandn %xmm10, %xmm0
2206; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2207; SSE-NEXT:    movdqa 128(%rdi), %xmm7
2208; SSE-NEXT:    movdqa %xmm7, %xmm2
2209; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2210; SSE-NEXT:    pand %xmm4, %xmm2
2211; SSE-NEXT:    por %xmm0, %xmm2
2212; SSE-NEXT:    movdqa %xmm2, %xmm0
2213; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
2214; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[0,1,1,3]
2215; SSE-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,7]
2216; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
2217; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0]
2218; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7]
2219; SSE-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3]
2220; SSE-NEXT:    packuswb %xmm2, %xmm0
2221; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,3]
2222; SSE-NEXT:    movdqa %xmm13, %xmm2
2223; SSE-NEXT:    movdqa %xmm13, %xmm3
2224; SSE-NEXT:    pandn %xmm0, %xmm2
2225; SSE-NEXT:    movdqa 96(%rdi), %xmm4
2226; SSE-NEXT:    movdqa %xmm4, (%rsp) # 16-byte Spill
2227; SSE-NEXT:    movdqa %xmm15, %xmm0
2228; SSE-NEXT:    pandn %xmm4, %xmm0
2229; SSE-NEXT:    movdqa 80(%rdi), %xmm6
2230; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2231; SSE-NEXT:    pand %xmm15, %xmm6
2232; SSE-NEXT:    por %xmm0, %xmm6
2233; SSE-NEXT:    movdqa %xmm6, %xmm0
2234; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
2235; SSE-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
2236; SSE-NEXT:    pand %xmm1, %xmm6
2237; SSE-NEXT:    pandn %xmm0, %xmm1
2238; SSE-NEXT:    por %xmm6, %xmm1
2239; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,2,1,3,4,5,6,7]
2240; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7]
2241; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
2242; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
2243; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7]
2244; SSE-NEXT:    packuswb %xmm0, %xmm0
2245; SSE-NEXT:    pand %xmm13, %xmm0
2246; SSE-NEXT:    por %xmm2, %xmm0
2247; SSE-NEXT:    movdqa 144(%rdi), %xmm12
2248; SSE-NEXT:    movdqa %xmm12, %xmm2
2249; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
2250; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2251; SSE-NEXT:    punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm5[8],xmm12[9],xmm5[9],xmm12[10],xmm5[10],xmm12[11],xmm5[11],xmm12[12],xmm5[12],xmm12[13],xmm5[13],xmm12[14],xmm5[14],xmm12[15],xmm5[15]
2252; SSE-NEXT:    movdqa %xmm12, %xmm1
2253; SSE-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2254; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0]
2255; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3]
2256; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
2257; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
2258; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4]
2259; SSE-NEXT:    packuswb %xmm1, %xmm1
2260; SSE-NEXT:    movdqa %xmm9, %xmm2
2261; SSE-NEXT:    pandn %xmm1, %xmm2
2262; SSE-NEXT:    pand %xmm9, %xmm0
2263; SSE-NEXT:    por %xmm0, %xmm2
2264; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2265; SSE-NEXT:    movdqa %xmm15, %xmm0
2266; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2267; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2268; SSE-NEXT:    pand %xmm15, %xmm1
2269; SSE-NEXT:    por %xmm0, %xmm1
2270; SSE-NEXT:    movdqa %xmm1, %xmm0
2271; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15]
2272; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
2273; SSE-NEXT:    movdqa %xmm1, %xmm2
2274; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0]
2275; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3]
2276; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
2277; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,6,5,6,7]
2278; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,1]
2279; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7]
2280; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
2281; SSE-NEXT:    psllq $48, %xmm1
2282; SSE-NEXT:    packuswb %xmm0, %xmm1
2283; SSE-NEXT:    movdqa %xmm13, %xmm2
2284; SSE-NEXT:    pandn %xmm1, %xmm2
2285; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2286; SSE-NEXT:    movdqa %xmm4, %xmm1
2287; SSE-NEXT:    movdqa {{.*#+}} xmm13 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
2288; SSE-NEXT:    pand %xmm13, %xmm1
2289; SSE-NEXT:    por %xmm11, %xmm1
2290; SSE-NEXT:    movdqa %xmm1, %xmm6
2291; SSE-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
2292; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [65535,65535,0,0,65535,65535,65535,0]
2293; SSE-NEXT:    movdqa %xmm0, %xmm11
2294; SSE-NEXT:    pandn %xmm6, %xmm11
2295; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
2296; SSE-NEXT:    pand %xmm0, %xmm1
2297; SSE-NEXT:    por %xmm11, %xmm1
2298; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
2299; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
2300; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
2301; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7]
2302; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,4,5,7]
2303; SSE-NEXT:    packuswb %xmm1, %xmm1
2304; SSE-NEXT:    pand %xmm3, %xmm1
2305; SSE-NEXT:    movdqa %xmm3, %xmm11
2306; SSE-NEXT:    por %xmm2, %xmm1
2307; SSE-NEXT:    movdqa %xmm8, %xmm2
2308; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
2309; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm6[3,0]
2310; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[0,1],xmm2[0,2]
2311; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,7,6,7]
2312; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
2313; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4]
2314; SSE-NEXT:    packuswb %xmm2, %xmm2
2315; SSE-NEXT:    movdqa %xmm9, %xmm3
2316; SSE-NEXT:    pandn %xmm2, %xmm3
2317; SSE-NEXT:    pand %xmm9, %xmm1
2318; SSE-NEXT:    por %xmm1, %xmm3
2319; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2320; SSE-NEXT:    movdqa %xmm15, %xmm2
2321; SSE-NEXT:    pandn %xmm10, %xmm2
2322; SSE-NEXT:    movdqa %xmm7, %xmm1
2323; SSE-NEXT:    pand %xmm15, %xmm1
2324; SSE-NEXT:    por %xmm2, %xmm1
2325; SSE-NEXT:    movdqa %xmm1, %xmm2
2326; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
2327; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
2328; SSE-NEXT:    movdqa %xmm1, %xmm6
2329; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[1,0],xmm2[0,0]
2330; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[2,3]
2331; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[0,2,1,3]
2332; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,6,5,6,7]
2333; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,1]
2334; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7]
2335; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
2336; SSE-NEXT:    psllq $48, %xmm1
2337; SSE-NEXT:    packuswb %xmm2, %xmm1
2338; SSE-NEXT:    movdqa %xmm13, %xmm2
2339; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
2340; SSE-NEXT:    pandn %xmm7, %xmm2
2341; SSE-NEXT:    movdqa (%rsp), %xmm8 # 16-byte Reload
2342; SSE-NEXT:    movdqa %xmm8, %xmm6
2343; SSE-NEXT:    pand %xmm13, %xmm6
2344; SSE-NEXT:    por %xmm2, %xmm6
2345; SSE-NEXT:    movdqa %xmm6, %xmm2
2346; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
2347; SSE-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
2348; SSE-NEXT:    pand %xmm0, %xmm6
2349; SSE-NEXT:    pandn %xmm2, %xmm0
2350; SSE-NEXT:    por %xmm6, %xmm0
2351; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2352; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
2353; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
2354; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7]
2355; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,5,7]
2356; SSE-NEXT:    packuswb %xmm0, %xmm0
2357; SSE-NEXT:    movdqa %xmm11, %xmm2
2358; SSE-NEXT:    pand %xmm11, %xmm0
2359; SSE-NEXT:    pandn %xmm1, %xmm2
2360; SSE-NEXT:    por %xmm2, %xmm0
2361; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2362; SSE-NEXT:    shufps {{.*#+}} xmm12 = xmm12[2,0],xmm2[3,0]
2363; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm12[0,2]
2364; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,7,6,7]
2365; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
2366; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,5,4]
2367; SSE-NEXT:    packuswb %xmm1, %xmm1
2368; SSE-NEXT:    movdqa %xmm9, %xmm2
2369; SSE-NEXT:    pandn %xmm1, %xmm2
2370; SSE-NEXT:    pand %xmm9, %xmm0
2371; SSE-NEXT:    por %xmm0, %xmm2
2372; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2373; SSE-NEXT:    pand %xmm14, %xmm4
2374; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
2375; SSE-NEXT:    movdqa %xmm4, %xmm2
2376; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
2377; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,0,65535,65535,65535]
2378; SSE-NEXT:    movdqa %xmm3, %xmm6
2379; SSE-NEXT:    pandn %xmm2, %xmm6
2380; SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
2381; SSE-NEXT:    pand %xmm3, %xmm4
2382; SSE-NEXT:    por %xmm6, %xmm4
2383; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm4[2,1,2,3,4,5,6,7]
2384; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
2385; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2386; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
2387; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
2388; SSE-NEXT:    packuswb %xmm0, %xmm0
2389; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,0,0,65535,65535,65535,65535,65535]
2390; SSE-NEXT:    movdqa %xmm1, %xmm2
2391; SSE-NEXT:    movdqa %xmm1, %xmm10
2392; SSE-NEXT:    pandn %xmm0, %xmm2
2393; SSE-NEXT:    movdqa %xmm15, %xmm0
2394; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2395; SSE-NEXT:    pandn %xmm1, %xmm0
2396; SSE-NEXT:    movdqa %xmm13, %xmm6
2397; SSE-NEXT:    movdqa %xmm13, %xmm12
2398; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
2399; SSE-NEXT:    pandn %xmm11, %xmm6
2400; SSE-NEXT:    movdqa %xmm14, %xmm4
2401; SSE-NEXT:    pandn %xmm11, %xmm4
2402; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2403; SSE-NEXT:    pand %xmm15, %xmm11
2404; SSE-NEXT:    movdqa %xmm15, %xmm4
2405; SSE-NEXT:    por %xmm0, %xmm11
2406; SSE-NEXT:    movdqa %xmm11, %xmm0
2407; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15]
2408; SSE-NEXT:    punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3],xmm11[4],xmm5[4],xmm11[5],xmm5[5],xmm11[6],xmm5[6],xmm11[7],xmm5[7]
2409; SSE-NEXT:    pshufd {{.*#+}} xmm13 = xmm11[0,1,2,0]
2410; SSE-NEXT:    shufps {{.*#+}} xmm11 = xmm11[2,0],xmm0[3,0]
2411; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm11[0,2]
2412; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
2413; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
2414; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2415; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
2416; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
2417; SSE-NEXT:    pshufhw {{.*#+}} xmm11 = xmm13[0,1,2,3,4,5,6,5]
2418; SSE-NEXT:    packuswb %xmm0, %xmm11
2419; SSE-NEXT:    pand %xmm10, %xmm11
2420; SSE-NEXT:    por %xmm2, %xmm11
2421; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2422; SSE-NEXT:    # xmm0 = mem[1,1,1,1]
2423; SSE-NEXT:    pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
2424; SSE-NEXT:    # xmm2 = mem[0,2,2,3]
2425; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
2426; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7]
2427; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
2428; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
2429; SSE-NEXT:    packuswb %xmm0, %xmm0
2430; SSE-NEXT:    movdqa %xmm9, %xmm2
2431; SSE-NEXT:    pandn %xmm0, %xmm2
2432; SSE-NEXT:    pand %xmm9, %xmm11
2433; SSE-NEXT:    por %xmm11, %xmm2
2434; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2435; SSE-NEXT:    movdqa %xmm14, %xmm0
2436; SSE-NEXT:    pandn %xmm7, %xmm0
2437; SSE-NEXT:    movdqa %xmm8, %xmm15
2438; SSE-NEXT:    movdqa %xmm8, %xmm2
2439; SSE-NEXT:    pand %xmm14, %xmm2
2440; SSE-NEXT:    por %xmm0, %xmm2
2441; SSE-NEXT:    movdqa %xmm2, %xmm0
2442; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15]
2443; SSE-NEXT:    movdqa %xmm3, %xmm11
2444; SSE-NEXT:    pandn %xmm0, %xmm11
2445; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
2446; SSE-NEXT:    pand %xmm3, %xmm2
2447; SSE-NEXT:    por %xmm11, %xmm2
2448; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm2[2,1,2,3,4,5,6,7]
2449; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
2450; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2451; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
2452; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
2453; SSE-NEXT:    packuswb %xmm0, %xmm0
2454; SSE-NEXT:    movdqa %xmm10, %xmm13
2455; SSE-NEXT:    pandn %xmm0, %xmm13
2456; SSE-NEXT:    movdqa %xmm4, %xmm11
2457; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2458; SSE-NEXT:    pandn %xmm2, %xmm11
2459; SSE-NEXT:    movdqa %xmm1, %xmm5
2460; SSE-NEXT:    movdqa %xmm1, %xmm0
2461; SSE-NEXT:    movdqa %xmm12, %xmm1
2462; SSE-NEXT:    pand %xmm12, %xmm0
2463; SSE-NEXT:    movdqa %xmm2, %xmm7
2464; SSE-NEXT:    pand %xmm12, %xmm7
2465; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
2466; SSE-NEXT:    pandn %xmm8, %xmm1
2467; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2468; SSE-NEXT:    pand %xmm14, %xmm5
2469; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2470; SSE-NEXT:    pand %xmm14, %xmm2
2471; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2472; SSE-NEXT:    pandn %xmm8, %xmm14
2473; SSE-NEXT:    pand %xmm4, %xmm8
2474; SSE-NEXT:    por %xmm11, %xmm8
2475; SSE-NEXT:    movdqa %xmm8, %xmm11
2476; SSE-NEXT:    pxor %xmm1, %xmm1
2477; SSE-NEXT:    punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm1[8],xmm11[9],xmm1[9],xmm11[10],xmm1[10],xmm11[11],xmm1[11],xmm11[12],xmm1[12],xmm11[13],xmm1[13],xmm11[14],xmm1[14],xmm11[15],xmm1[15]
2478; SSE-NEXT:    punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7]
2479; SSE-NEXT:    pxor %xmm2, %xmm2
2480; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm8[0,1,2,0]
2481; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[2,0],xmm11[3,0]
2482; SSE-NEXT:    shufps {{.*#+}} xmm11 = xmm11[0,1],xmm8[0,2]
2483; SSE-NEXT:    pshuflw {{.*#+}} xmm8 = xmm11[2,1,2,3,4,5,6,7]
2484; SSE-NEXT:    pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,7]
2485; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
2486; SSE-NEXT:    pshuflw {{.*#+}} xmm8 = xmm8[0,3,2,1,4,5,6,7]
2487; SSE-NEXT:    pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,6,6,6]
2488; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5]
2489; SSE-NEXT:    packuswb %xmm8, %xmm1
2490; SSE-NEXT:    pand %xmm10, %xmm1
2491; SSE-NEXT:    por %xmm13, %xmm1
2492; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2493; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm4[1,1,1,1]
2494; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
2495; SSE-NEXT:    pshufd {{.*#+}} xmm11 = xmm5[0,2,2,3]
2496; SSE-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1]
2497; SSE-NEXT:    pshuflw {{.*#+}} xmm8 = xmm11[0,3,2,3,4,5,6,7]
2498; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0]
2499; SSE-NEXT:    pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,5]
2500; SSE-NEXT:    packuswb %xmm8, %xmm11
2501; SSE-NEXT:    movdqa %xmm9, %xmm12
2502; SSE-NEXT:    pandn %xmm11, %xmm12
2503; SSE-NEXT:    pand %xmm9, %xmm1
2504; SSE-NEXT:    por %xmm1, %xmm12
2505; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2506; SSE-NEXT:    movdqa {{.*#+}} xmm13 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
2507; SSE-NEXT:    pand %xmm13, %xmm1
2508; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2509; SSE-NEXT:    movdqa %xmm1, %xmm11
2510; SSE-NEXT:    punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm2[8],xmm11[9],xmm2[9],xmm11[10],xmm2[10],xmm11[11],xmm2[11],xmm11[12],xmm2[12],xmm11[13],xmm2[13],xmm11[14],xmm2[14],xmm11[15],xmm2[15]
2511; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
2512; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1],xmm11[2,0]
2513; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5]
2514; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0]
2515; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,0,1,2,4,5,6,7]
2516; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,4,6,7]
2517; SSE-NEXT:    packuswb %xmm1, %xmm1
2518; SSE-NEXT:    movdqa %xmm10, %xmm11
2519; SSE-NEXT:    pandn %xmm1, %xmm11
2520; SSE-NEXT:    por %xmm6, %xmm0
2521; SSE-NEXT:    movdqa %xmm0, %xmm1
2522; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
2523; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
2524; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,0]
2525; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
2526; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
2527; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,0,1,2,4,5,6,7]
2528; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
2529; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
2530; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6]
2531; SSE-NEXT:    packuswb %xmm0, %xmm1
2532; SSE-NEXT:    pand %xmm10, %xmm1
2533; SSE-NEXT:    por %xmm11, %xmm1
2534; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
2535; SSE-NEXT:    movaps %xmm10, %xmm0
2536; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
2537; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[3,0]
2538; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[0,2]
2539; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,6,6,7]
2540; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
2541; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,7,5]
2542; SSE-NEXT:    packuswb %xmm0, %xmm6
2543; SSE-NEXT:    movdqa %xmm9, %xmm8
2544; SSE-NEXT:    pandn %xmm6, %xmm8
2545; SSE-NEXT:    pand %xmm9, %xmm1
2546; SSE-NEXT:    por %xmm1, %xmm8
2547; SSE-NEXT:    movdqa %xmm13, %xmm0
2548; SSE-NEXT:    pand %xmm13, %xmm15
2549; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2550; SSE-NEXT:    por %xmm15, %xmm0
2551; SSE-NEXT:    movdqa %xmm0, %xmm1
2552; SSE-NEXT:    pxor %xmm6, %xmm6
2553; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15]
2554; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
2555; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,0]
2556; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2557; SSE-NEXT:    por %xmm7, %xmm2
2558; SSE-NEXT:    movdqa %xmm2, %xmm1
2559; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
2560; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15]
2561; SSE-NEXT:    pxor %xmm13, %xmm13
2562; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,0]
2563; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
2564; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3]
2565; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7]
2566; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
2567; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
2568; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6]
2569; SSE-NEXT:    packuswb %xmm2, %xmm1
2570; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [0,0,0,65535,65535,65535,65535,65535]
2571; SSE-NEXT:    pand %xmm2, %xmm1
2572; SSE-NEXT:    pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,5,6,5]
2573; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[3,1,2,0]
2574; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,0,1,2,4,5,6,7]
2575; SSE-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,4,6,7]
2576; SSE-NEXT:    packuswb %xmm6, %xmm6
2577; SSE-NEXT:    pandn %xmm6, %xmm2
2578; SSE-NEXT:    por %xmm2, %xmm1
2579; SSE-NEXT:    movdqa %xmm4, %xmm2
2580; SSE-NEXT:    movdqa %xmm4, %xmm15
2581; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm5[3,0]
2582; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[0,2]
2583; SSE-NEXT:    pand %xmm9, %xmm1
2584; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,6,6,7]
2585; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0]
2586; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,5]
2587; SSE-NEXT:    packuswb %xmm2, %xmm2
2588; SSE-NEXT:    pandn %xmm2, %xmm9
2589; SSE-NEXT:    por %xmm1, %xmm9
2590; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2591; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2592; SSE-NEXT:    movdqa %xmm0, %xmm1
2593; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm13[8],xmm1[9],xmm13[9],xmm1[10],xmm13[10],xmm1[11],xmm13[11],xmm1[12],xmm13[12],xmm1[13],xmm13[13],xmm1[14],xmm13[14],xmm1[15],xmm13[15]
2594; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7]
2595; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
2596; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2]
2597; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
2598; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[3,1,2,3,4,5,6,7]
2599; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
2600; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2601; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[2,1,3,0,4,5,6,7]
2602; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
2603; SSE-NEXT:    packuswb %xmm1, %xmm2
2604; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,0,0,65535,65535]
2605; SSE-NEXT:    movdqa %xmm4, %xmm6
2606; SSE-NEXT:    pandn %xmm2, %xmm6
2607; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2608; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
2609; SSE-NEXT:    pand %xmm5, %xmm2
2610; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
2611; SSE-NEXT:    movdqa %xmm2, %xmm0
2612; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7]
2613; SSE-NEXT:    movdqa %xmm3, %xmm11
2614; SSE-NEXT:    pandn %xmm0, %xmm11
2615; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm13[8],xmm2[9],xmm13[9],xmm2[10],xmm13[10],xmm2[11],xmm13[11],xmm2[12],xmm13[12],xmm2[13],xmm13[13],xmm2[14],xmm13[14],xmm2[15],xmm13[15]
2616; SSE-NEXT:    pand %xmm3, %xmm2
2617; SSE-NEXT:    por %xmm11, %xmm2
2618; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,1,0,3,4,5,6,7]
2619; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,4]
2620; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
2621; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,0,4,5,6,7]
2622; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
2623; SSE-NEXT:    packuswb %xmm2, %xmm2
2624; SSE-NEXT:    pand %xmm4, %xmm2
2625; SSE-NEXT:    por %xmm6, %xmm2
2626; SSE-NEXT:    pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
2627; SSE-NEXT:    # xmm6 = mem[3,1,2,3]
2628; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3]
2629; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[0,1,2,1,4,5,6,7]
2630; SSE-NEXT:    pshuflw {{.*#+}} xmm10 = xmm10[0,1,0,3,4,5,6,7]
2631; SSE-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1]
2632; SSE-NEXT:    packuswb %xmm1, %xmm10
2633; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm10[2,1]
2634; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
2635; SSE-NEXT:    movdqa %xmm14, %xmm1
2636; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm13[8],xmm1[9],xmm13[9],xmm1[10],xmm13[10],xmm1[11],xmm13[11],xmm1[12],xmm13[12],xmm1[13],xmm13[13],xmm1[14],xmm13[14],xmm1[15],xmm13[15]
2637; SSE-NEXT:    punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
2638; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm14[0,1,1,3]
2639; SSE-NEXT:    shufps {{.*#+}} xmm14 = xmm14[0,3],xmm1[1,2]
2640; SSE-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
2641; SSE-NEXT:    pand %xmm5, %xmm0
2642; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
2643; SSE-NEXT:    por %xmm0, %xmm5
2644; SSE-NEXT:    movdqa %xmm5, %xmm1
2645; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3],xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7]
2646; SSE-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm13[8],xmm5[9],xmm13[9],xmm5[10],xmm13[10],xmm5[11],xmm13[11],xmm5[12],xmm13[12],xmm5[13],xmm13[13],xmm5[14],xmm13[14],xmm5[15],xmm13[15]
2647; SSE-NEXT:    pand %xmm3, %xmm5
2648; SSE-NEXT:    pandn %xmm1, %xmm3
2649; SSE-NEXT:    por %xmm5, %xmm3
2650; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm3[3,1,0,3,4,5,6,7]
2651; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4]
2652; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
2653; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7]
2654; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
2655; SSE-NEXT:    packuswb %xmm1, %xmm1
2656; SSE-NEXT:    pand %xmm4, %xmm1
2657; SSE-NEXT:    movdqa %xmm4, %xmm7
2658; SSE-NEXT:    shufps {{.*#+}} xmm14 = xmm14[0,2,3,1]
2659; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm14[3,1,2,3,4,5,6,7]
2660; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
2661; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
2662; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[2,1,3,0,4,5,6,7]
2663; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,4,5,4,7]
2664; SSE-NEXT:    packuswb %xmm3, %xmm4
2665; SSE-NEXT:    pandn %xmm4, %xmm7
2666; SSE-NEXT:    por %xmm7, %xmm1
2667; SSE-NEXT:    pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
2668; SSE-NEXT:    # xmm4 = mem[3,1,2,3]
2669; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm15[0,2,2,3]
2670; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7]
2671; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7]
2672; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
2673; SSE-NEXT:    packuswb %xmm3, %xmm5
2674; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,1]
2675; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2676; SSE-NEXT:    movaps %xmm0, 16(%rsi)
2677; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2678; SSE-NEXT:    movaps %xmm0, (%rsi)
2679; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2680; SSE-NEXT:    movaps %xmm0, 16(%rdx)
2681; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2682; SSE-NEXT:    movaps %xmm0, (%rdx)
2683; SSE-NEXT:    movdqa %xmm12, 16(%rcx)
2684; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2685; SSE-NEXT:    movaps %xmm0, (%rcx)
2686; SSE-NEXT:    movdqa %xmm9, 16(%r8)
2687; SSE-NEXT:    movdqa %xmm8, (%r8)
2688; SSE-NEXT:    movaps %xmm1, 16(%r9)
2689; SSE-NEXT:    movaps %xmm2, (%r9)
2690; SSE-NEXT:    addq $184, %rsp
2691; SSE-NEXT:    retq
2692;
2693; AVX-LABEL: load_i8_stride5_vf32:
2694; AVX:       # %bb.0:
2695; AVX-NEXT:    vmovdqa 144(%rdi), %xmm0
2696; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[1,6,11]
2697; AVX-NEXT:    vmovdqa 128(%rdi), %xmm1
2698; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
2699; AVX-NEXT:    vpor %xmm2, %xmm3, %xmm5
2700; AVX-NEXT:    vmovdqa 112(%rdi), %xmm2
2701; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u],zero,zero,zero,xmm2[3,8,13,u,u,u,u,u,u]
2702; AVX-NEXT:    vmovdqa 96(%rdi), %xmm3
2703; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,4,9,14],zero,zero,zero,xmm3[u,u,u,u,u,u]
2704; AVX-NEXT:    vpor %xmm4, %xmm6, %xmm4
2705; AVX-NEXT:    vpxor %xmm6, %xmm6, %xmm6
2706; AVX-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm4[2,3,4,5,6,7]
2707; AVX-NEXT:    vmovdqa 80(%rdi), %xmm4
2708; AVX-NEXT:    vpshufb {{.*#+}} xmm7 = xmm4[0,5,10,15],zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u]
2709; AVX-NEXT:    vpor %xmm7, %xmm6, %xmm6
2710; AVX-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm5[5,6,7]
2711; AVX-NEXT:    vmovdqa (%rdi), %xmm7
2712; AVX-NEXT:    vmovdqa 16(%rdi), %xmm9
2713; AVX-NEXT:    vmovdqa 32(%rdi), %xmm8
2714; AVX-NEXT:    vmovdqa 48(%rdi), %xmm10
2715; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm9[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u]
2716; AVX-NEXT:    vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u]
2717; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm11[2],xmm5[2],xmm11[3],xmm5[3]
2718; AVX-NEXT:    vpshufb {{.*#+}} xmm11 = xmm10[u,u,u,u,u,u,u],zero,zero,zero,xmm10[2,7,12,u,u,u]
2719; AVX-NEXT:    vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,3,8,13],zero,zero,zero,xmm8[u,u,u]
2720; AVX-NEXT:    vpor %xmm11, %xmm12, %xmm11
2721; AVX-NEXT:    vmovq {{.*#+}} xmm13 = [255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
2722; AVX-NEXT:    vpblendvb %xmm13, %xmm5, %xmm11, %xmm5
2723; AVX-NEXT:    vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255]
2724; AVX-NEXT:    vandps %ymm5, %ymm12, %ymm11
2725; AVX-NEXT:    vmovdqa 64(%rdi), %xmm5
2726; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11]
2727; AVX-NEXT:    vandnps %ymm14, %ymm12, %ymm14
2728; AVX-NEXT:    vorps %ymm14, %ymm11, %ymm11
2729; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm11, %ymm6
2730; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2731; AVX-NEXT:    vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[2,7,12]
2732; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = xmm1[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
2733; AVX-NEXT:    vpor %xmm11, %xmm14, %xmm11
2734; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u]
2735; AVX-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm9[0,5,10,15,u,u,u,u,u,u,u,u,u]
2736; AVX-NEXT:    vpor %xmm14, %xmm15, %xmm14
2737; AVX-NEXT:    vpshufb {{.*#+}} xmm15 = xmm10[u,u,u,u,u,u,u],zero,zero,zero,xmm10[3,8,13,u,u,u]
2738; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm8[u,u,u,u,u,u,u,4,9,14],zero,zero,zero,xmm8[u,u,u]
2739; AVX-NEXT:    vpor %xmm6, %xmm15, %xmm6
2740; AVX-NEXT:    vpblendvb %xmm13, %xmm14, %xmm6, %xmm6
2741; AVX-NEXT:    vpshufb {{.*#+}} xmm13 = xmm2[u,u,u],zero,zero,zero,zero,xmm2[4,9,14,u,u,u,u,u,u]
2742; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,0,5,10,15],zero,zero,zero,xmm3[u,u,u,u,u,u]
2743; AVX-NEXT:    vpor %xmm13, %xmm14, %xmm14
2744; AVX-NEXT:    vmovdqa {{.*#+}} xmm13 = [128,128,128,3,4,5,6,7,8,9,u,u,u,u,u,u]
2745; AVX-NEXT:    vpshufb %xmm13, %xmm14, %xmm14
2746; AVX-NEXT:    vpshufb {{.*#+}} xmm15 = xmm4[1,6,11],zero,zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u]
2747; AVX-NEXT:    vpor %xmm15, %xmm14, %xmm14
2748; AVX-NEXT:    vpblendw {{.*#+}} xmm11 = xmm14[0,1,2,3,4],xmm11[5,6,7]
2749; AVX-NEXT:    vandps %ymm6, %ymm12, %ymm6
2750; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12]
2751; AVX-NEXT:    vandnps %ymm14, %ymm12, %ymm14
2752; AVX-NEXT:    vorps %ymm6, %ymm14, %ymm6
2753; AVX-NEXT:    vinsertf128 $1, %xmm11, %ymm6, %ymm6
2754; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2755; AVX-NEXT:    vmovdqa {{.*#+}} xmm6 = [128,128,128,1,6,11,128,128,128,128,4,9,14,u,u,u]
2756; AVX-NEXT:    vpshufb %xmm6, %xmm9, %xmm14
2757; AVX-NEXT:    vpshufb %xmm6, %xmm10, %xmm6
2758; AVX-NEXT:    vpblendw {{.*#+}} xmm6 = xmm14[0,1,2],xmm6[3,4,5,6,7]
2759; AVX-NEXT:    vmovdqa {{.*#+}} xmm14 = [2,7,12,128,128,128,0,5,10,15,128,128,128,u,u,u]
2760; AVX-NEXT:    vpshufb %xmm14, %xmm7, %xmm15
2761; AVX-NEXT:    vpshufb %xmm14, %xmm8, %xmm14
2762; AVX-NEXT:    vpblendw {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3,4,5,6,7]
2763; AVX-NEXT:    vpor %xmm6, %xmm14, %xmm6
2764; AVX-NEXT:    vandps %ymm6, %ymm12, %ymm6
2765; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13]
2766; AVX-NEXT:    vandnps %ymm14, %ymm12, %ymm12
2767; AVX-NEXT:    vorps %ymm6, %ymm12, %ymm6
2768; AVX-NEXT:    vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[3,8,13]
2769; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = xmm1[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
2770; AVX-NEXT:    vpor %xmm12, %xmm14, %xmm12
2771; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,1,6,11],zero,zero,zero,zero,xmm3[u,u,u,u,u,u]
2772; AVX-NEXT:    vpshufb {{.*#+}} xmm15 = xmm2[u,u,u],zero,zero,zero,xmm2[0,5,10,15,u,u,u,u,u,u]
2773; AVX-NEXT:    vpor %xmm14, %xmm15, %xmm14
2774; AVX-NEXT:    vpshufb %xmm13, %xmm14, %xmm13
2775; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = xmm4[2,7,12],zero,zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u]
2776; AVX-NEXT:    vpor %xmm14, %xmm13, %xmm13
2777; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm13[0,1,2,3,4],xmm12[5,6,7]
2778; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm6, %ymm12
2779; AVX-NEXT:    vmovdqa {{.*#+}} xmm6 = [128,128,128,2,7,12,1,6,11,128,128,128,128,u,u,u]
2780; AVX-NEXT:    vpshufb %xmm6, %xmm9, %xmm13
2781; AVX-NEXT:    vpshufb %xmm6, %xmm8, %xmm6
2782; AVX-NEXT:    vpblendw {{.*#+}} xmm6 = xmm13[0,1,2],xmm6[3,4,5,6,7]
2783; AVX-NEXT:    vmovdqa {{.*#+}} xmm13 = [3,8,13,128,128,128,128,128,128,0,5,10,15,u,u,u]
2784; AVX-NEXT:    vpshufb %xmm13, %xmm7, %xmm14
2785; AVX-NEXT:    vpshufb %xmm13, %xmm10, %xmm13
2786; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3,4,5,6,7]
2787; AVX-NEXT:    vpor %xmm6, %xmm13, %xmm6
2788; AVX-NEXT:    vpshufb {{.*#+}} xmm13 = xmm2[u,u,u],zero,zero,zero,xmm2[1,6,11,u,u,u,u,u,u,u]
2789; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,2,7,12],zero,zero,zero,xmm3[u,u,u,u,u,u,u]
2790; AVX-NEXT:    vpor %xmm13, %xmm14, %xmm14
2791; AVX-NEXT:    vmovdqa {{.*#+}} xmm13 = [128,128,128,3,4,5,6,7,8,u,u,u,u,u,u,u]
2792; AVX-NEXT:    vpshufb %xmm13, %xmm14, %xmm14
2793; AVX-NEXT:    vpshufb {{.*#+}} xmm15 = xmm4[3,8,13],zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u,u]
2794; AVX-NEXT:    vpor %xmm15, %xmm14, %xmm14
2795; AVX-NEXT:    vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14]
2796; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm15, %ymm14
2797; AVX-NEXT:    vmovaps {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255]
2798; AVX-NEXT:    vandps %ymm6, %ymm15, %ymm6
2799; AVX-NEXT:    vandnps %ymm14, %ymm15, %ymm14
2800; AVX-NEXT:    vorps %ymm6, %ymm14, %ymm6
2801; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm0[4,9,14]
2802; AVX-NEXT:    vpshufb {{.*#+}} xmm15 = xmm1[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
2803; AVX-NEXT:    vpor %xmm14, %xmm15, %xmm14
2804; AVX-NEXT:    vextractf128 $1, %ymm6, %xmm15
2805; AVX-NEXT:    vpmovsxwq {{.*#+}} xmm11 = [18446744073709551615,255]
2806; AVX-NEXT:    vpblendvb %xmm11, %xmm15, %xmm14, %xmm14
2807; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm6, %ymm6
2808; AVX-NEXT:    vmovdqa {{.*#+}} xmm14 = [128,128,128,3,8,13,128,128,128,1,6,11,u,u,u,u]
2809; AVX-NEXT:    vpshufb %xmm14, %xmm10, %xmm10
2810; AVX-NEXT:    vpshufb %xmm14, %xmm9, %xmm9
2811; AVX-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4,5],xmm9[6,7]
2812; AVX-NEXT:    vmovdqa {{.*#+}} xmm10 = [4,9,14,128,128,128,2,7,12,128,128,128,u,u,u,u]
2813; AVX-NEXT:    vpshufb %xmm10, %xmm8, %xmm8
2814; AVX-NEXT:    vpshufb %xmm10, %xmm7, %xmm7
2815; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4,5],xmm7[6,7]
2816; AVX-NEXT:    vpor %xmm7, %xmm9, %xmm7
2817; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,xmm2[2,7,12,u,u,u,u,u,u,u]
2818; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,3,8,13],zero,zero,zero,xmm3[u,u,u,u,u,u,u]
2819; AVX-NEXT:    vpor %xmm2, %xmm3, %xmm2
2820; AVX-NEXT:    vpshufb %xmm13, %xmm2, %xmm2
2821; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm4[4,9,14],zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u,u]
2822; AVX-NEXT:    vpor %xmm3, %xmm2, %xmm2
2823; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,1,6,11],zero,zero,zero,zero
2824; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[0,5,10,15]
2825; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
2826; AVX-NEXT:    vpblendvb %xmm11, %xmm2, %xmm0, %xmm0
2827; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15]
2828; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
2829; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3,4,5,6,7]
2830; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2831; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2832; AVX-NEXT:    vmovaps %ymm1, (%rsi)
2833; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2834; AVX-NEXT:    vmovaps %ymm1, (%rdx)
2835; AVX-NEXT:    vmovaps %ymm12, (%rcx)
2836; AVX-NEXT:    vmovaps %ymm6, (%r8)
2837; AVX-NEXT:    vmovaps %ymm0, (%r9)
2838; AVX-NEXT:    vzeroupper
2839; AVX-NEXT:    retq
2840;
2841; AVX2-LABEL: load_i8_stride5_vf32:
2842; AVX2:       # %bb.0:
2843; AVX2-NEXT:    vmovdqa (%rdi), %ymm3
2844; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm4
2845; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm0
2846; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm1
2847; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
2848; AVX2-NEXT:    vpblendvb %ymm2, %ymm3, %ymm4, %ymm5
2849; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm6
2850; AVX2-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u]
2851; AVX2-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u]
2852; AVX2-NEXT:    vpor %xmm6, %xmm5, %xmm5
2853; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535]
2854; AVX2-NEXT:    vpblendvb %ymm9, %ymm0, %ymm1, %ymm6
2855; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
2856; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255]
2857; AVX2-NEXT:    # ymm8 = mem[0,1,0,1]
2858; AVX2-NEXT:    vpblendvb %ymm8, %ymm6, %ymm7, %ymm6
2859; AVX2-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u]
2860; AVX2-NEXT:    vpmovsxwd {{.*#+}} ymm8 = [4294967295,4294967295,4294967295,255,0,0,0,0]
2861; AVX2-NEXT:    vmovdqa %xmm8, %xmm7
2862; AVX2-NEXT:    vpblendvb %ymm7, %ymm5, %ymm6, %ymm6
2863; AVX2-NEXT:    vpblendvb %ymm9, %ymm3, %ymm4, %ymm5
2864; AVX2-NEXT:    vpshufb {{.*#+}} xmm9 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14],zero,zero,zero,xmm5[u,u,u]
2865; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm5
2866; AVX2-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13,u,u,u]
2867; AVX2-NEXT:    vpor %xmm5, %xmm9, %xmm5
2868; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm9 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
2869; AVX2-NEXT:    vpblendvb %ymm9, %ymm1, %ymm0, %ymm10
2870; AVX2-NEXT:    vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1]
2871; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0]
2872; AVX2-NEXT:    # ymm12 = mem[0,1,0,1]
2873; AVX2-NEXT:    vpblendvb %ymm12, %ymm10, %ymm11, %ymm10
2874; AVX2-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u]
2875; AVX2-NEXT:    vpblendvb %ymm7, %ymm5, %ymm10, %ymm5
2876; AVX2-NEXT:    vpblendvb %ymm9, %ymm4, %ymm3, %ymm9
2877; AVX2-NEXT:    vextracti128 $1, %ymm9, %xmm10
2878; AVX2-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[4,9,14,u,u,u]
2879; AVX2-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[2,7,12],zero,zero,zero,xmm9[0,5,10,15],zero,zero,zero,xmm9[u,u,u]
2880; AVX2-NEXT:    vpor %xmm10, %xmm9, %xmm9
2881; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm10 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535]
2882; AVX2-NEXT:    vpblendvb %ymm10, %ymm1, %ymm0, %ymm11
2883; AVX2-NEXT:    vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
2884; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255]
2885; AVX2-NEXT:    # ymm13 = mem[0,1,0,1]
2886; AVX2-NEXT:    vpblendvb %ymm13, %ymm11, %ymm12, %ymm11
2887; AVX2-NEXT:    vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u]
2888; AVX2-NEXT:    vpblendvb %ymm7, %ymm9, %ymm11, %ymm7
2889; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm9 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0]
2890; AVX2-NEXT:    vpblendvb %ymm9, %ymm1, %ymm0, %ymm11
2891; AVX2-NEXT:    vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
2892; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0]
2893; AVX2-NEXT:    # ymm13 = mem[0,1,0,1]
2894; AVX2-NEXT:    vpblendvb %ymm13, %ymm11, %ymm12, %ymm11
2895; AVX2-NEXT:    vpblendvb %ymm10, %ymm4, %ymm3, %ymm10
2896; AVX2-NEXT:    vpshufb {{.*#+}} xmm12 = xmm10[3,8,13],zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u]
2897; AVX2-NEXT:    vextracti128 $1, %ymm10, %xmm10
2898; AVX2-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[2,7,12],zero,zero,zero,xmm10[0,5,10,15,u,u,u]
2899; AVX2-NEXT:    vpor %xmm12, %xmm10, %xmm10
2900; AVX2-NEXT:    vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
2901; AVX2-NEXT:    vpblendvb %ymm8, %ymm10, %ymm11, %ymm10
2902; AVX2-NEXT:    vmovdqa 144(%rdi), %xmm8
2903; AVX2-NEXT:    vpshufb {{.*#+}} xmm11 = xmm8[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[1,6,11]
2904; AVX2-NEXT:    vpblendvb %ymm9, %ymm4, %ymm3, %ymm9
2905; AVX2-NEXT:    vmovdqa 128(%rdi), %xmm3
2906; AVX2-NEXT:    vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
2907; AVX2-NEXT:    vpor %xmm4, %xmm11, %xmm4
2908; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
2909; AVX2-NEXT:    vpblendw {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5,6,7],ymm6[8,9,10,11,12],ymm4[13,14,15]
2910; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7]
2911; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
2912; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
2913; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255]
2914; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
2915; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
2916; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm8[4,9,14]
2917; AVX2-NEXT:    vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
2918; AVX2-NEXT:    vpor %xmm1, %xmm2, %xmm1
2919; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
2920; AVX2-NEXT:    vpmovsxwq {{.*#+}} ymm2 = [18446744073709551615,18446744073709551615,18446744073709551615,255]
2921; AVX2-NEXT:    vpblendvb %ymm2, %ymm10, %ymm1, %ymm1
2922; AVX2-NEXT:    vextracti128 $1, %ymm9, %xmm6
2923; AVX2-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13],zero,zero,zero,xmm6[1,6,11,u,u,u,u]
2924; AVX2-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12],zero,zero,zero,xmm9[u,u,u,u]
2925; AVX2-NEXT:    vpor %xmm6, %xmm9, %xmm6
2926; AVX2-NEXT:    vmovdqa 128(%rdi), %ymm9
2927; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
2928; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7]
2929; AVX2-NEXT:    vpshufb {{.*#+}} ymm6 = ymm9[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
2930; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm9 = [0,5,0,5,0,5,0,5]
2931; AVX2-NEXT:    vpermd %ymm6, %ymm9, %ymm6
2932; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm6, %ymm0
2933; AVX2-NEXT:    vpshufb {{.*#+}} xmm2 = xmm8[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[2,7,12]
2934; AVX2-NEXT:    vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
2935; AVX2-NEXT:    vpor %xmm2, %xmm6, %xmm2
2936; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
2937; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7],ymm5[8,9,10,11,12],ymm2[13,14,15]
2938; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
2939; AVX2-NEXT:    vpshufb {{.*#+}} xmm5 = xmm8[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[3,8,13]
2940; AVX2-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
2941; AVX2-NEXT:    vpor %xmm5, %xmm3, %xmm3
2942; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
2943; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5,6,7],ymm7[8,9,10,11,12],ymm3[13,14,15]
2944; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7]
2945; AVX2-NEXT:    vmovdqa %ymm4, (%rsi)
2946; AVX2-NEXT:    vmovdqa %ymm2, (%rdx)
2947; AVX2-NEXT:    vmovdqa %ymm3, (%rcx)
2948; AVX2-NEXT:    vmovdqa %ymm1, (%r8)
2949; AVX2-NEXT:    vmovdqa %ymm0, (%r9)
2950; AVX2-NEXT:    vzeroupper
2951; AVX2-NEXT:    retq
2952;
2953; AVX2-FP-LABEL: load_i8_stride5_vf32:
2954; AVX2-FP:       # %bb.0:
2955; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm3
2956; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm4
2957; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %ymm0
2958; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm1
2959; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
2960; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm3, %ymm4, %ymm5
2961; AVX2-FP-NEXT:    vextracti128 $1, %ymm5, %xmm6
2962; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u]
2963; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u]
2964; AVX2-FP-NEXT:    vpor %xmm6, %xmm5, %xmm5
2965; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535]
2966; AVX2-FP-NEXT:    vpblendvb %ymm9, %ymm0, %ymm1, %ymm6
2967; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
2968; AVX2-FP-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255]
2969; AVX2-FP-NEXT:    # ymm8 = mem[0,1,0,1]
2970; AVX2-FP-NEXT:    vpblendvb %ymm8, %ymm6, %ymm7, %ymm6
2971; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u]
2972; AVX2-FP-NEXT:    vpmovsxwd {{.*#+}} ymm8 = [4294967295,4294967295,4294967295,255,0,0,0,0]
2973; AVX2-FP-NEXT:    vmovdqa %xmm8, %xmm7
2974; AVX2-FP-NEXT:    vpblendvb %ymm7, %ymm5, %ymm6, %ymm6
2975; AVX2-FP-NEXT:    vpblendvb %ymm9, %ymm3, %ymm4, %ymm5
2976; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14],zero,zero,zero,xmm5[u,u,u]
2977; AVX2-FP-NEXT:    vextracti128 $1, %ymm5, %xmm5
2978; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13,u,u,u]
2979; AVX2-FP-NEXT:    vpor %xmm5, %xmm9, %xmm5
2980; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm9 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
2981; AVX2-FP-NEXT:    vpblendvb %ymm9, %ymm1, %ymm0, %ymm10
2982; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1]
2983; AVX2-FP-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0]
2984; AVX2-FP-NEXT:    # ymm12 = mem[0,1,0,1]
2985; AVX2-FP-NEXT:    vpblendvb %ymm12, %ymm10, %ymm11, %ymm10
2986; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u]
2987; AVX2-FP-NEXT:    vpblendvb %ymm7, %ymm5, %ymm10, %ymm5
2988; AVX2-FP-NEXT:    vpblendvb %ymm9, %ymm4, %ymm3, %ymm9
2989; AVX2-FP-NEXT:    vextracti128 $1, %ymm9, %xmm10
2990; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[4,9,14,u,u,u]
2991; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[2,7,12],zero,zero,zero,xmm9[0,5,10,15],zero,zero,zero,xmm9[u,u,u]
2992; AVX2-FP-NEXT:    vpor %xmm10, %xmm9, %xmm9
2993; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm10 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535]
2994; AVX2-FP-NEXT:    vpblendvb %ymm10, %ymm1, %ymm0, %ymm11
2995; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
2996; AVX2-FP-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255]
2997; AVX2-FP-NEXT:    # ymm13 = mem[0,1,0,1]
2998; AVX2-FP-NEXT:    vpblendvb %ymm13, %ymm11, %ymm12, %ymm11
2999; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u]
3000; AVX2-FP-NEXT:    vpblendvb %ymm7, %ymm9, %ymm11, %ymm7
3001; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm9 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0]
3002; AVX2-FP-NEXT:    vpblendvb %ymm9, %ymm1, %ymm0, %ymm11
3003; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
3004; AVX2-FP-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0]
3005; AVX2-FP-NEXT:    # ymm13 = mem[0,1,0,1]
3006; AVX2-FP-NEXT:    vpblendvb %ymm13, %ymm11, %ymm12, %ymm11
3007; AVX2-FP-NEXT:    vpblendvb %ymm10, %ymm4, %ymm3, %ymm10
3008; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm10[3,8,13],zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u]
3009; AVX2-FP-NEXT:    vextracti128 $1, %ymm10, %xmm10
3010; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[2,7,12],zero,zero,zero,xmm10[0,5,10,15,u,u,u]
3011; AVX2-FP-NEXT:    vpor %xmm12, %xmm10, %xmm10
3012; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
3013; AVX2-FP-NEXT:    vpblendvb %ymm8, %ymm10, %ymm11, %ymm10
3014; AVX2-FP-NEXT:    vmovdqa 144(%rdi), %xmm8
3015; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm8[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[1,6,11]
3016; AVX2-FP-NEXT:    vpblendvb %ymm9, %ymm4, %ymm3, %ymm9
3017; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %xmm3
3018; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
3019; AVX2-FP-NEXT:    vpor %xmm4, %xmm11, %xmm4
3020; AVX2-FP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
3021; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5,6,7],ymm6[8,9,10,11,12],ymm4[13,14,15]
3022; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7]
3023; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
3024; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
3025; AVX2-FP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255]
3026; AVX2-FP-NEXT:    # ymm2 = mem[0,1,0,1]
3027; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
3028; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm8[4,9,14]
3029; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
3030; AVX2-FP-NEXT:    vpor %xmm1, %xmm2, %xmm1
3031; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
3032; AVX2-FP-NEXT:    vpmovsxwq {{.*#+}} ymm2 = [18446744073709551615,18446744073709551615,18446744073709551615,255]
3033; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm10, %ymm1, %ymm1
3034; AVX2-FP-NEXT:    vextracti128 $1, %ymm9, %xmm6
3035; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13],zero,zero,zero,xmm6[1,6,11,u,u,u,u]
3036; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12],zero,zero,zero,xmm9[u,u,u,u]
3037; AVX2-FP-NEXT:    vpor %xmm6, %xmm9, %xmm6
3038; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %ymm9
3039; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
3040; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7]
3041; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm9[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
3042; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} ymm9 = [0,5,0,5,0,5,0,5]
3043; AVX2-FP-NEXT:    vpermd %ymm6, %ymm9, %ymm6
3044; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm6, %ymm0
3045; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm8[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[2,7,12]
3046; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
3047; AVX2-FP-NEXT:    vpor %xmm2, %xmm6, %xmm2
3048; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
3049; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7],ymm5[8,9,10,11,12],ymm2[13,14,15]
3050; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
3051; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm8[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[3,8,13]
3052; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
3053; AVX2-FP-NEXT:    vpor %xmm5, %xmm3, %xmm3
3054; AVX2-FP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
3055; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5,6,7],ymm7[8,9,10,11,12],ymm3[13,14,15]
3056; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7]
3057; AVX2-FP-NEXT:    vmovdqa %ymm4, (%rsi)
3058; AVX2-FP-NEXT:    vmovdqa %ymm2, (%rdx)
3059; AVX2-FP-NEXT:    vmovdqa %ymm3, (%rcx)
3060; AVX2-FP-NEXT:    vmovdqa %ymm1, (%r8)
3061; AVX2-FP-NEXT:    vmovdqa %ymm0, (%r9)
3062; AVX2-FP-NEXT:    vzeroupper
3063; AVX2-FP-NEXT:    retq
3064;
3065; AVX2-FCP-LABEL: load_i8_stride5_vf32:
3066; AVX2-FCP:       # %bb.0:
3067; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm3
3068; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm4
3069; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %ymm0
3070; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm1
3071; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
3072; AVX2-FCP-NEXT:    vpblendvb %ymm2, %ymm3, %ymm4, %ymm5
3073; AVX2-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm6
3074; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u]
3075; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u]
3076; AVX2-FCP-NEXT:    vpor %xmm6, %xmm5, %xmm5
3077; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535]
3078; AVX2-FCP-NEXT:    vpblendvb %ymm9, %ymm0, %ymm1, %ymm6
3079; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
3080; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255]
3081; AVX2-FCP-NEXT:    # ymm8 = mem[0,1,0,1]
3082; AVX2-FCP-NEXT:    vpblendvb %ymm8, %ymm6, %ymm7, %ymm6
3083; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u]
3084; AVX2-FCP-NEXT:    vpmovsxwd {{.*#+}} ymm8 = [4294967295,4294967295,4294967295,255,0,0,0,0]
3085; AVX2-FCP-NEXT:    vmovdqa %xmm8, %xmm7
3086; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm5, %ymm6, %ymm6
3087; AVX2-FCP-NEXT:    vpblendvb %ymm9, %ymm3, %ymm4, %ymm5
3088; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14],zero,zero,zero,xmm5[u,u,u]
3089; AVX2-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm5
3090; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13,u,u,u]
3091; AVX2-FCP-NEXT:    vpor %xmm5, %xmm9, %xmm5
3092; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm9 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
3093; AVX2-FCP-NEXT:    vpblendvb %ymm9, %ymm1, %ymm0, %ymm10
3094; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1]
3095; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0]
3096; AVX2-FCP-NEXT:    # ymm12 = mem[0,1,0,1]
3097; AVX2-FCP-NEXT:    vpblendvb %ymm12, %ymm10, %ymm11, %ymm10
3098; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u]
3099; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm5, %ymm10, %ymm5
3100; AVX2-FCP-NEXT:    vpblendvb %ymm9, %ymm4, %ymm3, %ymm9
3101; AVX2-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm10
3102; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[4,9,14,u,u,u]
3103; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[2,7,12],zero,zero,zero,xmm9[0,5,10,15],zero,zero,zero,xmm9[u,u,u]
3104; AVX2-FCP-NEXT:    vpor %xmm10, %xmm9, %xmm9
3105; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm10 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535]
3106; AVX2-FCP-NEXT:    vpblendvb %ymm10, %ymm1, %ymm0, %ymm11
3107; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
3108; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255]
3109; AVX2-FCP-NEXT:    # ymm13 = mem[0,1,0,1]
3110; AVX2-FCP-NEXT:    vpblendvb %ymm13, %ymm11, %ymm12, %ymm11
3111; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u]
3112; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm9, %ymm11, %ymm7
3113; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm9 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0]
3114; AVX2-FCP-NEXT:    vpblendvb %ymm9, %ymm1, %ymm0, %ymm11
3115; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
3116; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0]
3117; AVX2-FCP-NEXT:    # ymm13 = mem[0,1,0,1]
3118; AVX2-FCP-NEXT:    vpblendvb %ymm13, %ymm11, %ymm12, %ymm11
3119; AVX2-FCP-NEXT:    vpblendvb %ymm10, %ymm4, %ymm3, %ymm10
3120; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm10[3,8,13],zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u]
3121; AVX2-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm10
3122; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[2,7,12],zero,zero,zero,xmm10[0,5,10,15,u,u,u]
3123; AVX2-FCP-NEXT:    vpor %xmm12, %xmm10, %xmm10
3124; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
3125; AVX2-FCP-NEXT:    vpblendvb %ymm8, %ymm10, %ymm11, %ymm10
3126; AVX2-FCP-NEXT:    vmovdqa 144(%rdi), %xmm8
3127; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm8[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[1,6,11]
3128; AVX2-FCP-NEXT:    vpblendvb %ymm9, %ymm4, %ymm3, %ymm9
3129; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %xmm3
3130; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
3131; AVX2-FCP-NEXT:    vpor %xmm4, %xmm11, %xmm4
3132; AVX2-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
3133; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5,6,7],ymm6[8,9,10,11,12],ymm4[13,14,15]
3134; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7]
3135; AVX2-FCP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
3136; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
3137; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255]
3138; AVX2-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
3139; AVX2-FCP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
3140; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm8[4,9,14]
3141; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
3142; AVX2-FCP-NEXT:    vpor %xmm1, %xmm2, %xmm1
3143; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
3144; AVX2-FCP-NEXT:    vpmovsxwq {{.*#+}} ymm2 = [18446744073709551615,18446744073709551615,18446744073709551615,255]
3145; AVX2-FCP-NEXT:    vpblendvb %ymm2, %ymm10, %ymm1, %ymm1
3146; AVX2-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm6
3147; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13],zero,zero,zero,xmm6[1,6,11,u,u,u,u]
3148; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12],zero,zero,zero,xmm9[u,u,u,u]
3149; AVX2-FCP-NEXT:    vpor %xmm6, %xmm9, %xmm6
3150; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %ymm9
3151; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
3152; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7]
3153; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm9[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
3154; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm9 = [0,5,0,5,0,5,0,5]
3155; AVX2-FCP-NEXT:    vpermd %ymm6, %ymm9, %ymm6
3156; AVX2-FCP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm6, %ymm0
3157; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm8[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[2,7,12]
3158; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
3159; AVX2-FCP-NEXT:    vpor %xmm2, %xmm6, %xmm2
3160; AVX2-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
3161; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7],ymm5[8,9,10,11,12],ymm2[13,14,15]
3162; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
3163; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm8[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[3,8,13]
3164; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
3165; AVX2-FCP-NEXT:    vpor %xmm5, %xmm3, %xmm3
3166; AVX2-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
3167; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5,6,7],ymm7[8,9,10,11,12],ymm3[13,14,15]
3168; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7]
3169; AVX2-FCP-NEXT:    vmovdqa %ymm4, (%rsi)
3170; AVX2-FCP-NEXT:    vmovdqa %ymm2, (%rdx)
3171; AVX2-FCP-NEXT:    vmovdqa %ymm3, (%rcx)
3172; AVX2-FCP-NEXT:    vmovdqa %ymm1, (%r8)
3173; AVX2-FCP-NEXT:    vmovdqa %ymm0, (%r9)
3174; AVX2-FCP-NEXT:    vzeroupper
3175; AVX2-FCP-NEXT:    retq
3176;
3177; AVX512-LABEL: load_i8_stride5_vf32:
3178; AVX512:       # %bb.0:
3179; AVX512-NEXT:    vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535]
3180; AVX512-NEXT:    vmovdqa (%rdi), %ymm3
3181; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm5
3182; AVX512-NEXT:    vmovdqa 64(%rdi), %ymm0
3183; AVX512-NEXT:    vmovdqa 96(%rdi), %ymm1
3184; AVX512-NEXT:    vmovdqa %ymm2, %ymm4
3185; AVX512-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1))
3186; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1]
3187; AVX512-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm4))
3188; AVX512-NEXT:    vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128]
3189; AVX512-NEXT:    vpshufb %ymm8, %ymm6, %ymm6
3190; AVX512-NEXT:    vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535]
3191; AVX512-NEXT:    vmovdqa %ymm4, %ymm7
3192; AVX512-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm5 ^ (ymm7 & (ymm3 ^ ymm5))
3193; AVX512-NEXT:    vextracti128 $1, %ymm7, %xmm9
3194; AVX512-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12,u,u,u]
3195; AVX512-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u]
3196; AVX512-NEXT:    vpor %xmm7, %xmm9, %xmm9
3197; AVX512-NEXT:    vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255]
3198; AVX512-NEXT:    vpternlogq {{.*#+}} ymm9 = (ymm9 & ymm11) | ymm6
3199; AVX512-NEXT:    vmovdqa 144(%rdi), %xmm7
3200; AVX512-NEXT:    vpshufb %xmm8, %xmm7, %xmm6
3201; AVX512-NEXT:    vmovdqa 128(%rdi), %xmm8
3202; AVX512-NEXT:    vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
3203; AVX512-NEXT:    vpor %xmm6, %xmm10, %xmm6
3204; AVX512-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
3205; AVX512-NEXT:    vpblendw {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5,6,7],ymm9[8,9,10,11,12],ymm6[13,14,15]
3206; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
3207; AVX512-NEXT:    vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535]
3208; AVX512-NEXT:    vmovdqa %ymm10, %ymm9
3209; AVX512-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm0 ^ (ymm9 & (ymm1 ^ ymm0))
3210; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm9[2,3,0,1]
3211; AVX512-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm12 ^ (mem & (ymm12 ^ ymm9))
3212; AVX512-NEXT:    vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128]
3213; AVX512-NEXT:    vpshufb %ymm9, %ymm12, %ymm12
3214; AVX512-NEXT:    vmovdqa %ymm2, %ymm13
3215; AVX512-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm5 ^ (ymm13 & (ymm3 ^ ymm5))
3216; AVX512-NEXT:    vpshufb {{.*#+}} xmm14 = xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14],zero,zero,zero,xmm13[u,u,u]
3217; AVX512-NEXT:    vextracti128 $1, %ymm13, %xmm13
3218; AVX512-NEXT:    vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[0,5,10,15],zero,zero,zero,xmm13[3,8,13,u,u,u]
3219; AVX512-NEXT:    vpor %xmm14, %xmm13, %xmm13
3220; AVX512-NEXT:    vpternlogq {{.*#+}} ymm13 = (ymm13 & ymm11) | ymm12
3221; AVX512-NEXT:    vpshufb %xmm9, %xmm7, %xmm9
3222; AVX512-NEXT:    vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
3223; AVX512-NEXT:    vpor %xmm9, %xmm12, %xmm9
3224; AVX512-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
3225; AVX512-NEXT:    vpblendw {{.*#+}} ymm9 = ymm13[0,1,2,3,4],ymm9[5,6,7],ymm13[8,9,10,11,12],ymm9[13,14,15]
3226; AVX512-NEXT:    vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7]
3227; AVX512-NEXT:    vmovdqa %ymm4, %ymm12
3228; AVX512-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm0 ^ (ymm12 & (ymm1 ^ ymm0))
3229; AVX512-NEXT:    vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1]
3230; AVX512-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm13 ^ (mem & (ymm13 ^ ymm12))
3231; AVX512-NEXT:    vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128]
3232; AVX512-NEXT:    vpshufb %ymm12, %ymm13, %ymm13
3233; AVX512-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm3 ^ (ymm10 & (ymm5 ^ ymm3))
3234; AVX512-NEXT:    vextracti128 $1, %ymm10, %xmm14
3235; AVX512-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[1,6,11],zero,zero,zero,zero,xmm14[4,9,14,u,u,u]
3236; AVX512-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[2,7,12],zero,zero,zero,xmm10[0,5,10,15],zero,zero,zero,xmm10[u,u,u]
3237; AVX512-NEXT:    vpor %xmm14, %xmm10, %xmm10
3238; AVX512-NEXT:    vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm11) | ymm13
3239; AVX512-NEXT:    vpshufb %xmm12, %xmm7, %xmm11
3240; AVX512-NEXT:    vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
3241; AVX512-NEXT:    vpor %xmm11, %xmm12, %xmm11
3242; AVX512-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
3243; AVX512-NEXT:    vpblendw {{.*#+}} ymm11 = ymm10[0,1,2,3,4],ymm11[5,6,7],ymm10[8,9,10,11,12],ymm11[13,14,15]
3244; AVX512-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
3245; AVX512-NEXT:    vmovdqa %ymm2, %ymm11
3246; AVX512-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm0 ^ (ymm11 & (ymm1 ^ ymm0))
3247; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
3248; AVX512-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm12 ^ (mem & (ymm12 ^ ymm11))
3249; AVX512-NEXT:    vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
3250; AVX512-NEXT:    vpshufb %ymm11, %ymm12, %ymm12
3251; AVX512-NEXT:    vmovdqa %ymm4, %ymm13
3252; AVX512-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm3 ^ (ymm13 & (ymm5 ^ ymm3))
3253; AVX512-NEXT:    vpshufb {{.*#+}} xmm14 = xmm13[3,8,13],zero,zero,zero,xmm13[1,6,11],zero,zero,zero,zero,xmm13[u,u,u]
3254; AVX512-NEXT:    vextracti128 $1, %ymm13, %xmm13
3255; AVX512-NEXT:    vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[2,7,12],zero,zero,zero,xmm13[0,5,10,15,u,u,u]
3256; AVX512-NEXT:    vpor %xmm14, %xmm13, %xmm13
3257; AVX512-NEXT:    vpternlogq {{.*#+}} ymm13 = (ymm13 & mem) | ymm12
3258; AVX512-NEXT:    vpshufb %xmm11, %xmm7, %xmm7
3259; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
3260; AVX512-NEXT:    vpor %xmm7, %xmm8, %xmm7
3261; AVX512-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
3262; AVX512-NEXT:    vpmovsxwq {{.*#+}} ymm8 = [18446744073709551615,18446744073709551615,18446744073709551615,255]
3263; AVX512-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm8 & (ymm7 ^ ymm13))
3264; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm3 ^ (ymm2 & (ymm5 ^ ymm3))
3265; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
3266; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u]
3267; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u]
3268; AVX512-NEXT:    vpor %xmm3, %xmm2, %xmm2
3269; AVX512-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1))
3270; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm4[2,3,0,1]
3271; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm4))
3272; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
3273; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
3274; AVX512-NEXT:    vmovdqa 128(%rdi), %ymm1
3275; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
3276; AVX512-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [0,5,0,5,0,5,0,5]
3277; AVX512-NEXT:    vpermd %ymm1, %ymm2, %ymm1
3278; AVX512-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (ymm8 & (ymm1 ^ ymm0))
3279; AVX512-NEXT:    vmovdqa %ymm6, (%rsi)
3280; AVX512-NEXT:    vmovdqa %ymm9, (%rdx)
3281; AVX512-NEXT:    vmovdqa %ymm10, (%rcx)
3282; AVX512-NEXT:    vmovdqa %ymm7, (%r8)
3283; AVX512-NEXT:    vmovdqa %ymm1, (%r9)
3284; AVX512-NEXT:    vzeroupper
3285; AVX512-NEXT:    retq
3286;
3287; AVX512-FCP-LABEL: load_i8_stride5_vf32:
3288; AVX512-FCP:       # %bb.0:
3289; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535]
3290; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm3
3291; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm5
3292; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm0
3293; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %ymm1
3294; AVX512-FCP-NEXT:    vmovdqa %ymm2, %ymm4
3295; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1))
3296; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1]
3297; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm4))
3298; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128]
3299; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm6, %ymm6
3300; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535]
3301; AVX512-FCP-NEXT:    vmovdqa %ymm4, %ymm7
3302; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm5 ^ (ymm7 & (ymm3 ^ ymm5))
3303; AVX512-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm9
3304; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12,u,u,u]
3305; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u]
3306; AVX512-FCP-NEXT:    vpor %xmm7, %xmm9, %xmm9
3307; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255]
3308; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = (ymm9 & ymm11) | ymm6
3309; AVX512-FCP-NEXT:    vmovdqa 144(%rdi), %xmm7
3310; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm7, %xmm6
3311; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %xmm8
3312; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
3313; AVX512-FCP-NEXT:    vpor %xmm6, %xmm10, %xmm6
3314; AVX512-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
3315; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5,6,7],ymm9[8,9,10,11,12],ymm6[13,14,15]
3316; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
3317; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535]
3318; AVX512-FCP-NEXT:    vmovdqa %ymm10, %ymm9
3319; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm0 ^ (ymm9 & (ymm1 ^ ymm0))
3320; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm9[2,3,0,1]
3321; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm12 ^ (mem & (ymm12 ^ ymm9))
3322; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128]
3323; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm12, %ymm12
3324; AVX512-FCP-NEXT:    vmovdqa %ymm2, %ymm13
3325; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm5 ^ (ymm13 & (ymm3 ^ ymm5))
3326; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14],zero,zero,zero,xmm13[u,u,u]
3327; AVX512-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm13
3328; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[0,5,10,15],zero,zero,zero,xmm13[3,8,13,u,u,u]
3329; AVX512-FCP-NEXT:    vpor %xmm14, %xmm13, %xmm13
3330; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = (ymm13 & ymm11) | ymm12
3331; AVX512-FCP-NEXT:    vpshufb %xmm9, %xmm7, %xmm9
3332; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
3333; AVX512-FCP-NEXT:    vpor %xmm9, %xmm12, %xmm9
3334; AVX512-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
3335; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm13[0,1,2,3,4],ymm9[5,6,7],ymm13[8,9,10,11,12],ymm9[13,14,15]
3336; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7]
3337; AVX512-FCP-NEXT:    vmovdqa %ymm4, %ymm12
3338; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm0 ^ (ymm12 & (ymm1 ^ ymm0))
3339; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1]
3340; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm13 ^ (mem & (ymm13 ^ ymm12))
3341; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128]
3342; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm13, %ymm13
3343; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm3 ^ (ymm10 & (ymm5 ^ ymm3))
3344; AVX512-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm14
3345; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[1,6,11],zero,zero,zero,zero,xmm14[4,9,14,u,u,u]
3346; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[2,7,12],zero,zero,zero,xmm10[0,5,10,15],zero,zero,zero,xmm10[u,u,u]
3347; AVX512-FCP-NEXT:    vpor %xmm14, %xmm10, %xmm10
3348; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm11) | ymm13
3349; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm7, %xmm11
3350; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
3351; AVX512-FCP-NEXT:    vpor %xmm11, %xmm12, %xmm11
3352; AVX512-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
3353; AVX512-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm10[0,1,2,3,4],ymm11[5,6,7],ymm10[8,9,10,11,12],ymm11[13,14,15]
3354; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
3355; AVX512-FCP-NEXT:    vmovdqa %ymm2, %ymm11
3356; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm0 ^ (ymm11 & (ymm1 ^ ymm0))
3357; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
3358; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm12 ^ (mem & (ymm12 ^ ymm11))
3359; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
3360; AVX512-FCP-NEXT:    vpshufb %ymm11, %ymm12, %ymm12
3361; AVX512-FCP-NEXT:    vmovdqa %ymm4, %ymm13
3362; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm3 ^ (ymm13 & (ymm5 ^ ymm3))
3363; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm13[3,8,13],zero,zero,zero,xmm13[1,6,11],zero,zero,zero,zero,xmm13[u,u,u]
3364; AVX512-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm13
3365; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[2,7,12],zero,zero,zero,xmm13[0,5,10,15,u,u,u]
3366; AVX512-FCP-NEXT:    vpor %xmm14, %xmm13, %xmm13
3367; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = (ymm13 & mem) | ymm12
3368; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm7, %xmm7
3369; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
3370; AVX512-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm7
3371; AVX512-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
3372; AVX512-FCP-NEXT:    vpmovsxwq {{.*#+}} ymm8 = [18446744073709551615,18446744073709551615,18446744073709551615,255]
3373; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm8 & (ymm7 ^ ymm13))
3374; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm3 ^ (ymm2 & (ymm5 ^ ymm3))
3375; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
3376; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u]
3377; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u]
3378; AVX512-FCP-NEXT:    vpor %xmm3, %xmm2, %xmm2
3379; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1))
3380; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm4[2,3,0,1]
3381; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm4))
3382; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
3383; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
3384; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %ymm1
3385; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
3386; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [0,5,0,5,0,5,0,5]
3387; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm2, %ymm1
3388; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (ymm8 & (ymm1 ^ ymm0))
3389; AVX512-FCP-NEXT:    vmovdqa %ymm6, (%rsi)
3390; AVX512-FCP-NEXT:    vmovdqa %ymm9, (%rdx)
3391; AVX512-FCP-NEXT:    vmovdqa %ymm10, (%rcx)
3392; AVX512-FCP-NEXT:    vmovdqa %ymm7, (%r8)
3393; AVX512-FCP-NEXT:    vmovdqa %ymm1, (%r9)
3394; AVX512-FCP-NEXT:    vzeroupper
3395; AVX512-FCP-NEXT:    retq
3396;
3397; AVX512DQ-LABEL: load_i8_stride5_vf32:
3398; AVX512DQ:       # %bb.0:
3399; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535]
3400; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm3
3401; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm5
3402; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %ymm0
3403; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %ymm1
3404; AVX512DQ-NEXT:    vmovdqa %ymm2, %ymm4
3405; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1))
3406; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1]
3407; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm4))
3408; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128]
3409; AVX512DQ-NEXT:    vpshufb %ymm8, %ymm6, %ymm6
3410; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535]
3411; AVX512DQ-NEXT:    vmovdqa %ymm4, %ymm7
3412; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm5 ^ (ymm7 & (ymm3 ^ ymm5))
3413; AVX512DQ-NEXT:    vextracti128 $1, %ymm7, %xmm9
3414; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12,u,u,u]
3415; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u]
3416; AVX512DQ-NEXT:    vpor %xmm7, %xmm9, %xmm9
3417; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255]
3418; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm9 = (ymm9 & ymm11) | ymm6
3419; AVX512DQ-NEXT:    vmovdqa 144(%rdi), %xmm7
3420; AVX512DQ-NEXT:    vpshufb %xmm8, %xmm7, %xmm6
3421; AVX512DQ-NEXT:    vmovdqa 128(%rdi), %xmm8
3422; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
3423; AVX512DQ-NEXT:    vpor %xmm6, %xmm10, %xmm6
3424; AVX512DQ-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
3425; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5,6,7],ymm9[8,9,10,11,12],ymm6[13,14,15]
3426; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
3427; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535]
3428; AVX512DQ-NEXT:    vmovdqa %ymm10, %ymm9
3429; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm0 ^ (ymm9 & (ymm1 ^ ymm0))
3430; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm9[2,3,0,1]
3431; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm12 ^ (mem & (ymm12 ^ ymm9))
3432; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128]
3433; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm12, %ymm12
3434; AVX512DQ-NEXT:    vmovdqa %ymm2, %ymm13
3435; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm5 ^ (ymm13 & (ymm3 ^ ymm5))
3436; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm14 = xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14],zero,zero,zero,xmm13[u,u,u]
3437; AVX512DQ-NEXT:    vextracti128 $1, %ymm13, %xmm13
3438; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[0,5,10,15],zero,zero,zero,xmm13[3,8,13,u,u,u]
3439; AVX512DQ-NEXT:    vpor %xmm14, %xmm13, %xmm13
3440; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm13 = (ymm13 & ymm11) | ymm12
3441; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm7, %xmm9
3442; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
3443; AVX512DQ-NEXT:    vpor %xmm9, %xmm12, %xmm9
3444; AVX512DQ-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
3445; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm9 = ymm13[0,1,2,3,4],ymm9[5,6,7],ymm13[8,9,10,11,12],ymm9[13,14,15]
3446; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7]
3447; AVX512DQ-NEXT:    vmovdqa %ymm4, %ymm12
3448; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm0 ^ (ymm12 & (ymm1 ^ ymm0))
3449; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1]
3450; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm13 ^ (mem & (ymm13 ^ ymm12))
3451; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128]
3452; AVX512DQ-NEXT:    vpshufb %ymm12, %ymm13, %ymm13
3453; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm3 ^ (ymm10 & (ymm5 ^ ymm3))
3454; AVX512DQ-NEXT:    vextracti128 $1, %ymm10, %xmm14
3455; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[1,6,11],zero,zero,zero,zero,xmm14[4,9,14,u,u,u]
3456; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[2,7,12],zero,zero,zero,xmm10[0,5,10,15],zero,zero,zero,xmm10[u,u,u]
3457; AVX512DQ-NEXT:    vpor %xmm14, %xmm10, %xmm10
3458; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm11) | ymm13
3459; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm7, %xmm11
3460; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
3461; AVX512DQ-NEXT:    vpor %xmm11, %xmm12, %xmm11
3462; AVX512DQ-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
3463; AVX512DQ-NEXT:    vpblendw {{.*#+}} ymm11 = ymm10[0,1,2,3,4],ymm11[5,6,7],ymm10[8,9,10,11,12],ymm11[13,14,15]
3464; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
3465; AVX512DQ-NEXT:    vmovdqa %ymm2, %ymm11
3466; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm0 ^ (ymm11 & (ymm1 ^ ymm0))
3467; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
3468; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm12 ^ (mem & (ymm12 ^ ymm11))
3469; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
3470; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm12, %ymm12
3471; AVX512DQ-NEXT:    vmovdqa %ymm4, %ymm13
3472; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm3 ^ (ymm13 & (ymm5 ^ ymm3))
3473; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm14 = xmm13[3,8,13],zero,zero,zero,xmm13[1,6,11],zero,zero,zero,zero,xmm13[u,u,u]
3474; AVX512DQ-NEXT:    vextracti128 $1, %ymm13, %xmm13
3475; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[2,7,12],zero,zero,zero,xmm13[0,5,10,15,u,u,u]
3476; AVX512DQ-NEXT:    vpor %xmm14, %xmm13, %xmm13
3477; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm13 = (ymm13 & mem) | ymm12
3478; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm7, %xmm7
3479; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
3480; AVX512DQ-NEXT:    vpor %xmm7, %xmm8, %xmm7
3481; AVX512DQ-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
3482; AVX512DQ-NEXT:    vpmovsxwq {{.*#+}} ymm8 = [18446744073709551615,18446744073709551615,18446744073709551615,255]
3483; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm8 & (ymm7 ^ ymm13))
3484; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm3 ^ (ymm2 & (ymm5 ^ ymm3))
3485; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
3486; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u]
3487; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u]
3488; AVX512DQ-NEXT:    vpor %xmm3, %xmm2, %xmm2
3489; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1))
3490; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm4[2,3,0,1]
3491; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm4))
3492; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
3493; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
3494; AVX512DQ-NEXT:    vmovdqa 128(%rdi), %ymm1
3495; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
3496; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [0,5,0,5,0,5,0,5]
3497; AVX512DQ-NEXT:    vpermd %ymm1, %ymm2, %ymm1
3498; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (ymm8 & (ymm1 ^ ymm0))
3499; AVX512DQ-NEXT:    vmovdqa %ymm6, (%rsi)
3500; AVX512DQ-NEXT:    vmovdqa %ymm9, (%rdx)
3501; AVX512DQ-NEXT:    vmovdqa %ymm10, (%rcx)
3502; AVX512DQ-NEXT:    vmovdqa %ymm7, (%r8)
3503; AVX512DQ-NEXT:    vmovdqa %ymm1, (%r9)
3504; AVX512DQ-NEXT:    vzeroupper
3505; AVX512DQ-NEXT:    retq
3506;
3507; AVX512DQ-FCP-LABEL: load_i8_stride5_vf32:
3508; AVX512DQ-FCP:       # %bb.0:
3509; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535]
3510; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm3
3511; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm5
3512; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm0
3513; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %ymm1
3514; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, %ymm4
3515; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1))
3516; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1]
3517; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm4))
3518; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128]
3519; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm6, %ymm6
3520; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535]
3521; AVX512DQ-FCP-NEXT:    vmovdqa %ymm4, %ymm7
3522; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm5 ^ (ymm7 & (ymm3 ^ ymm5))
3523; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm9
3524; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12,u,u,u]
3525; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u]
3526; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm9, %xmm9
3527; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255]
3528; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = (ymm9 & ymm11) | ymm6
3529; AVX512DQ-FCP-NEXT:    vmovdqa 144(%rdi), %xmm7
3530; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm7, %xmm6
3531; AVX512DQ-FCP-NEXT:    vmovdqa 128(%rdi), %xmm8
3532; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
3533; AVX512DQ-FCP-NEXT:    vpor %xmm6, %xmm10, %xmm6
3534; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
3535; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5,6,7],ymm9[8,9,10,11,12],ymm6[13,14,15]
3536; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
3537; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535]
3538; AVX512DQ-FCP-NEXT:    vmovdqa %ymm10, %ymm9
3539; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm0 ^ (ymm9 & (ymm1 ^ ymm0))
3540; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm9[2,3,0,1]
3541; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm12 ^ (mem & (ymm12 ^ ymm9))
3542; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128]
3543; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm12, %ymm12
3544; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, %ymm13
3545; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm5 ^ (ymm13 & (ymm3 ^ ymm5))
3546; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14],zero,zero,zero,xmm13[u,u,u]
3547; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm13
3548; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[0,5,10,15],zero,zero,zero,xmm13[3,8,13,u,u,u]
3549; AVX512DQ-FCP-NEXT:    vpor %xmm14, %xmm13, %xmm13
3550; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = (ymm13 & ymm11) | ymm12
3551; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm7, %xmm9
3552; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
3553; AVX512DQ-FCP-NEXT:    vpor %xmm9, %xmm12, %xmm9
3554; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
3555; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm9 = ymm13[0,1,2,3,4],ymm9[5,6,7],ymm13[8,9,10,11,12],ymm9[13,14,15]
3556; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7]
3557; AVX512DQ-FCP-NEXT:    vmovdqa %ymm4, %ymm12
3558; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm0 ^ (ymm12 & (ymm1 ^ ymm0))
3559; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1]
3560; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm13 ^ (mem & (ymm13 ^ ymm12))
3561; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128]
3562; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm13, %ymm13
3563; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm3 ^ (ymm10 & (ymm5 ^ ymm3))
3564; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm14
3565; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[1,6,11],zero,zero,zero,zero,xmm14[4,9,14,u,u,u]
3566; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[2,7,12],zero,zero,zero,xmm10[0,5,10,15],zero,zero,zero,xmm10[u,u,u]
3567; AVX512DQ-FCP-NEXT:    vpor %xmm14, %xmm10, %xmm10
3568; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm11) | ymm13
3569; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm7, %xmm11
3570; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
3571; AVX512DQ-FCP-NEXT:    vpor %xmm11, %xmm12, %xmm11
3572; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
3573; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} ymm11 = ymm10[0,1,2,3,4],ymm11[5,6,7],ymm10[8,9,10,11,12],ymm11[13,14,15]
3574; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
3575; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, %ymm11
3576; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm0 ^ (ymm11 & (ymm1 ^ ymm0))
3577; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
3578; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm12 = ymm12 ^ (mem & (ymm12 ^ ymm11))
3579; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
3580; AVX512DQ-FCP-NEXT:    vpshufb %ymm11, %ymm12, %ymm12
3581; AVX512DQ-FCP-NEXT:    vmovdqa %ymm4, %ymm13
3582; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = ymm3 ^ (ymm13 & (ymm5 ^ ymm3))
3583; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm13[3,8,13],zero,zero,zero,xmm13[1,6,11],zero,zero,zero,zero,xmm13[u,u,u]
3584; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm13
3585; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[2,7,12],zero,zero,zero,xmm13[0,5,10,15,u,u,u]
3586; AVX512DQ-FCP-NEXT:    vpor %xmm14, %xmm13, %xmm13
3587; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm13 = (ymm13 & mem) | ymm12
3588; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm7, %xmm7
3589; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
3590; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm7
3591; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
3592; AVX512DQ-FCP-NEXT:    vpmovsxwq {{.*#+}} ymm8 = [18446744073709551615,18446744073709551615,18446744073709551615,255]
3593; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm8 & (ymm7 ^ ymm13))
3594; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm3 ^ (ymm2 & (ymm5 ^ ymm3))
3595; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
3596; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u]
3597; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u]
3598; AVX512DQ-FCP-NEXT:    vpor %xmm3, %xmm2, %xmm2
3599; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1))
3600; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm4[2,3,0,1]
3601; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm4))
3602; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
3603; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
3604; AVX512DQ-FCP-NEXT:    vmovdqa 128(%rdi), %ymm1
3605; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
3606; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [0,5,0,5,0,5,0,5]
3607; AVX512DQ-FCP-NEXT:    vpermd %ymm1, %ymm2, %ymm1
3608; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (ymm8 & (ymm1 ^ ymm0))
3609; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, (%rsi)
3610; AVX512DQ-FCP-NEXT:    vmovdqa %ymm9, (%rdx)
3611; AVX512DQ-FCP-NEXT:    vmovdqa %ymm10, (%rcx)
3612; AVX512DQ-FCP-NEXT:    vmovdqa %ymm7, (%r8)
3613; AVX512DQ-FCP-NEXT:    vmovdqa %ymm1, (%r9)
3614; AVX512DQ-FCP-NEXT:    vzeroupper
3615; AVX512DQ-FCP-NEXT:    retq
3616;
3617; AVX512BW-LABEL: load_i8_stride5_vf32:
3618; AVX512BW:       # %bb.0:
3619; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm3
3620; AVX512BW-NEXT:    vmovdqa 32(%rdi), %ymm2
3621; AVX512BW-NEXT:    vmovdqa 64(%rdi), %ymm0
3622; AVX512BW-NEXT:    vmovdqa 96(%rdi), %ymm1
3623; AVX512BW-NEXT:    movw $21140, %ax # imm = 0x5294
3624; AVX512BW-NEXT:    kmovd %eax, %k1
3625; AVX512BW-NEXT:    vpblendmw %ymm1, %ymm0, %ymm4 {%k1}
3626; AVX512BW-NEXT:    vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
3627; AVX512BW-NEXT:    movl $1108344832, %eax # imm = 0x42100000
3628; AVX512BW-NEXT:    kmovd %eax, %k2
3629; AVX512BW-NEXT:    vmovdqu8 %ymm5, %ymm4 {%k2}
3630; AVX512BW-NEXT:    movw $19026, %ax # imm = 0x4A52
3631; AVX512BW-NEXT:    kmovd %eax, %k2
3632; AVX512BW-NEXT:    vpblendmw %ymm2, %ymm3, %ymm5 {%k2}
3633; AVX512BW-NEXT:    vextracti128 $1, %ymm5, %xmm6
3634; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u]
3635; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u]
3636; AVX512BW-NEXT:    vpor %xmm6, %xmm5, %xmm5
3637; AVX512BW-NEXT:    movl $67100672, %eax # imm = 0x3FFE000
3638; AVX512BW-NEXT:    kmovd %eax, %k3
3639; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm5 {%k3} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u]
3640; AVX512BW-NEXT:    vmovdqa 144(%rdi), %xmm6
3641; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[1,6,11]
3642; AVX512BW-NEXT:    vmovdqa 128(%rdi), %xmm7
3643; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
3644; AVX512BW-NEXT:    vpor %xmm4, %xmm8, %xmm4
3645; AVX512BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
3646; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7],ymm5[8,9,10,11,12],ymm4[13,14,15]
3647; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
3648; AVX512BW-NEXT:    movw $10570, %ax # imm = 0x294A
3649; AVX512BW-NEXT:    kmovd %eax, %k4
3650; AVX512BW-NEXT:    vpblendmw %ymm0, %ymm1, %ymm5 {%k4}
3651; AVX512BW-NEXT:    vpermq {{.*#+}} ymm8 = ymm5[2,3,0,1]
3652; AVX512BW-NEXT:    movl $-2078212096, %eax # imm = 0x84210000
3653; AVX512BW-NEXT:    kmovd %eax, %k5
3654; AVX512BW-NEXT:    vmovdqu8 %ymm8, %ymm5 {%k5}
3655; AVX512BW-NEXT:    vpblendmw %ymm2, %ymm3, %ymm8 {%k1}
3656; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm9 = xmm8[1,6,11],zero,zero,zero,zero,xmm8[4,9,14],zero,zero,zero,xmm8[u,u,u]
3657; AVX512BW-NEXT:    vextracti128 $1, %ymm8, %xmm8
3658; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13,u,u,u]
3659; AVX512BW-NEXT:    vpor %xmm9, %xmm8, %xmm8
3660; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm8 {%k3} = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u]
3661; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[2,7,12]
3662; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm9 = xmm7[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
3663; AVX512BW-NEXT:    vpor %xmm5, %xmm9, %xmm5
3664; AVX512BW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
3665; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7],ymm8[8,9,10,11,12],ymm5[13,14,15]
3666; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7]
3667; AVX512BW-NEXT:    vpblendmw %ymm0, %ymm1, %ymm8 {%k2}
3668; AVX512BW-NEXT:    vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1]
3669; AVX512BW-NEXT:    movl $138543104, %eax # imm = 0x8420000
3670; AVX512BW-NEXT:    kmovd %eax, %k5
3671; AVX512BW-NEXT:    vmovdqu8 %ymm9, %ymm8 {%k5}
3672; AVX512BW-NEXT:    vpblendmw %ymm3, %ymm2, %ymm9 {%k4}
3673; AVX512BW-NEXT:    vextracti128 $1, %ymm9, %xmm10
3674; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[4,9,14,u,u,u]
3675; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[2,7,12],zero,zero,zero,xmm9[0,5,10,15],zero,zero,zero,xmm9[u,u,u]
3676; AVX512BW-NEXT:    vpor %xmm10, %xmm9, %xmm9
3677; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm9 {%k3} = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u]
3678; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm8 = xmm6[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[3,8,13]
3679; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm10 = xmm7[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
3680; AVX512BW-NEXT:    vpor %xmm8, %xmm10, %xmm8
3681; AVX512BW-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
3682; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7],ymm9[8,9,10,11,12],ymm8[13,14,15]
3683; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
3684; AVX512BW-NEXT:    vpblendmw %ymm0, %ymm1, %ymm9 {%k1}
3685; AVX512BW-NEXT:    vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1]
3686; AVX512BW-NEXT:    movl $277086208, %eax # imm = 0x10840000
3687; AVX512BW-NEXT:    kmovd %eax, %k3
3688; AVX512BW-NEXT:    vmovdqu8 %ymm10, %ymm9 {%k3}
3689; AVX512BW-NEXT:    vpblendmw %ymm3, %ymm2, %ymm10 {%k2}
3690; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm11 = xmm10[3,8,13],zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u]
3691; AVX512BW-NEXT:    vextracti128 $1, %ymm10, %xmm10
3692; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[2,7,12],zero,zero,zero,xmm10[0,5,10,15,u,u,u]
3693; AVX512BW-NEXT:    vpor %xmm11, %xmm10, %xmm10
3694; AVX512BW-NEXT:    movl $33546240, %eax # imm = 0x1FFE000
3695; AVX512BW-NEXT:    kmovd %eax, %k3
3696; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm10 {%k3} = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
3697; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm6[4,9,14]
3698; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
3699; AVX512BW-NEXT:    vpor %xmm6, %xmm7, %xmm6
3700; AVX512BW-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
3701; AVX512BW-NEXT:    movl $-33554432, %eax # imm = 0xFE000000
3702; AVX512BW-NEXT:    kmovd %eax, %k3
3703; AVX512BW-NEXT:    vmovdqu8 %ymm6, %ymm10 {%k3}
3704; AVX512BW-NEXT:    vmovdqu16 %ymm3, %ymm2 {%k1}
3705; AVX512BW-NEXT:    vextracti128 $1, %ymm2, %xmm3
3706; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u]
3707; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u]
3708; AVX512BW-NEXT:    vpor %xmm3, %xmm2, %xmm2
3709; AVX512BW-NEXT:    vmovdqu16 %ymm1, %ymm0 {%k2}
3710; AVX512BW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
3711; AVX512BW-NEXT:    movl $554172416, %eax # imm = 0x21080000
3712; AVX512BW-NEXT:    kmovd %eax, %k1
3713; AVX512BW-NEXT:    vmovdqu8 %ymm1, %ymm0 {%k1}
3714; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
3715; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
3716; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm1
3717; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
3718; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [0,5,0,5,0,5,0,5]
3719; AVX512BW-NEXT:    vpermd %ymm1, %ymm2, %ymm1
3720; AVX512BW-NEXT:    vmovdqu8 %ymm1, %ymm0 {%k3}
3721; AVX512BW-NEXT:    vmovdqa %ymm4, (%rsi)
3722; AVX512BW-NEXT:    vmovdqa %ymm5, (%rdx)
3723; AVX512BW-NEXT:    vmovdqa %ymm8, (%rcx)
3724; AVX512BW-NEXT:    vmovdqa %ymm10, (%r8)
3725; AVX512BW-NEXT:    vmovdqa %ymm0, (%r9)
3726; AVX512BW-NEXT:    vzeroupper
3727; AVX512BW-NEXT:    retq
3728;
3729; AVX512BW-FCP-LABEL: load_i8_stride5_vf32:
3730; AVX512BW-FCP:       # %bb.0:
3731; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %ymm3
3732; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
3733; AVX512BW-FCP-NEXT:    vmovdqa 64(%rdi), %ymm0
3734; AVX512BW-FCP-NEXT:    vmovdqa 96(%rdi), %ymm1
3735; AVX512BW-FCP-NEXT:    movw $21140, %ax # imm = 0x5294
3736; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
3737; AVX512BW-FCP-NEXT:    vpblendmw %ymm1, %ymm0, %ymm4 {%k1}
3738; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
3739; AVX512BW-FCP-NEXT:    movl $1108344832, %eax # imm = 0x42100000
3740; AVX512BW-FCP-NEXT:    kmovd %eax, %k2
3741; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm5, %ymm4 {%k2}
3742; AVX512BW-FCP-NEXT:    movw $19026, %ax # imm = 0x4A52
3743; AVX512BW-FCP-NEXT:    kmovd %eax, %k2
3744; AVX512BW-FCP-NEXT:    vpblendmw %ymm2, %ymm3, %ymm5 {%k2}
3745; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm6
3746; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u]
3747; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u]
3748; AVX512BW-FCP-NEXT:    vpor %xmm6, %xmm5, %xmm5
3749; AVX512BW-FCP-NEXT:    movl $67100672, %eax # imm = 0x3FFE000
3750; AVX512BW-FCP-NEXT:    kmovd %eax, %k3
3751; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm5 {%k3} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u]
3752; AVX512BW-FCP-NEXT:    vmovdqa 144(%rdi), %xmm6
3753; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[1,6,11]
3754; AVX512BW-FCP-NEXT:    vmovdqa 128(%rdi), %xmm7
3755; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
3756; AVX512BW-FCP-NEXT:    vpor %xmm4, %xmm8, %xmm4
3757; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
3758; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7],ymm5[8,9,10,11,12],ymm4[13,14,15]
3759; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
3760; AVX512BW-FCP-NEXT:    movw $10570, %ax # imm = 0x294A
3761; AVX512BW-FCP-NEXT:    kmovd %eax, %k4
3762; AVX512BW-FCP-NEXT:    vpblendmw %ymm0, %ymm1, %ymm5 {%k4}
3763; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm5[2,3,0,1]
3764; AVX512BW-FCP-NEXT:    movl $-2078212096, %eax # imm = 0x84210000
3765; AVX512BW-FCP-NEXT:    kmovd %eax, %k5
3766; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm8, %ymm5 {%k5}
3767; AVX512BW-FCP-NEXT:    vpblendmw %ymm2, %ymm3, %ymm8 {%k1}
3768; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm8[1,6,11],zero,zero,zero,zero,xmm8[4,9,14],zero,zero,zero,xmm8[u,u,u]
3769; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm8
3770; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13,u,u,u]
3771; AVX512BW-FCP-NEXT:    vpor %xmm9, %xmm8, %xmm8
3772; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm8 {%k3} = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u]
3773; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[2,7,12]
3774; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm7[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
3775; AVX512BW-FCP-NEXT:    vpor %xmm5, %xmm9, %xmm5
3776; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
3777; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7],ymm8[8,9,10,11,12],ymm5[13,14,15]
3778; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7]
3779; AVX512BW-FCP-NEXT:    vpblendmw %ymm0, %ymm1, %ymm8 {%k2}
3780; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1]
3781; AVX512BW-FCP-NEXT:    movl $138543104, %eax # imm = 0x8420000
3782; AVX512BW-FCP-NEXT:    kmovd %eax, %k5
3783; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm9, %ymm8 {%k5}
3784; AVX512BW-FCP-NEXT:    vpblendmw %ymm3, %ymm2, %ymm9 {%k4}
3785; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm10
3786; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[4,9,14,u,u,u]
3787; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[2,7,12],zero,zero,zero,xmm9[0,5,10,15],zero,zero,zero,xmm9[u,u,u]
3788; AVX512BW-FCP-NEXT:    vpor %xmm10, %xmm9, %xmm9
3789; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm9 {%k3} = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u]
3790; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm6[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[3,8,13]
3791; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm7[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
3792; AVX512BW-FCP-NEXT:    vpor %xmm8, %xmm10, %xmm8
3793; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
3794; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7],ymm9[8,9,10,11,12],ymm8[13,14,15]
3795; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
3796; AVX512BW-FCP-NEXT:    vpblendmw %ymm0, %ymm1, %ymm9 {%k1}
3797; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1]
3798; AVX512BW-FCP-NEXT:    movl $277086208, %eax # imm = 0x10840000
3799; AVX512BW-FCP-NEXT:    kmovd %eax, %k3
3800; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm10, %ymm9 {%k3}
3801; AVX512BW-FCP-NEXT:    vpblendmw %ymm3, %ymm2, %ymm10 {%k2}
3802; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm10[3,8,13],zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u]
3803; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm10
3804; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[2,7,12],zero,zero,zero,xmm10[0,5,10,15,u,u,u]
3805; AVX512BW-FCP-NEXT:    vpor %xmm11, %xmm10, %xmm10
3806; AVX512BW-FCP-NEXT:    movl $33546240, %eax # imm = 0x1FFE000
3807; AVX512BW-FCP-NEXT:    kmovd %eax, %k3
3808; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm10 {%k3} = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
3809; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm6[4,9,14]
3810; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
3811; AVX512BW-FCP-NEXT:    vpor %xmm6, %xmm7, %xmm6
3812; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
3813; AVX512BW-FCP-NEXT:    movl $-33554432, %eax # imm = 0xFE000000
3814; AVX512BW-FCP-NEXT:    kmovd %eax, %k3
3815; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm6, %ymm10 {%k3}
3816; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm3, %ymm2 {%k1}
3817; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
3818; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u]
3819; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u]
3820; AVX512BW-FCP-NEXT:    vpor %xmm3, %xmm2, %xmm2
3821; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm1, %ymm0 {%k2}
3822; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
3823; AVX512BW-FCP-NEXT:    movl $554172416, %eax # imm = 0x21080000
3824; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
3825; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm1, %ymm0 {%k1}
3826; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
3827; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
3828; AVX512BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm1
3829; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
3830; AVX512BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [0,5,0,5,0,5,0,5]
3831; AVX512BW-FCP-NEXT:    vpermd %ymm1, %ymm2, %ymm1
3832; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm1, %ymm0 {%k3}
3833; AVX512BW-FCP-NEXT:    vmovdqa %ymm4, (%rsi)
3834; AVX512BW-FCP-NEXT:    vmovdqa %ymm5, (%rdx)
3835; AVX512BW-FCP-NEXT:    vmovdqa %ymm8, (%rcx)
3836; AVX512BW-FCP-NEXT:    vmovdqa %ymm10, (%r8)
3837; AVX512BW-FCP-NEXT:    vmovdqa %ymm0, (%r9)
3838; AVX512BW-FCP-NEXT:    vzeroupper
3839; AVX512BW-FCP-NEXT:    retq
3840;
3841; AVX512DQ-BW-LABEL: load_i8_stride5_vf32:
3842; AVX512DQ-BW:       # %bb.0:
3843; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %ymm3
3844; AVX512DQ-BW-NEXT:    vmovdqa 32(%rdi), %ymm2
3845; AVX512DQ-BW-NEXT:    vmovdqa 64(%rdi), %ymm0
3846; AVX512DQ-BW-NEXT:    vmovdqa 96(%rdi), %ymm1
3847; AVX512DQ-BW-NEXT:    movw $21140, %ax # imm = 0x5294
3848; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
3849; AVX512DQ-BW-NEXT:    vpblendmw %ymm1, %ymm0, %ymm4 {%k1}
3850; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
3851; AVX512DQ-BW-NEXT:    movl $1108344832, %eax # imm = 0x42100000
3852; AVX512DQ-BW-NEXT:    kmovd %eax, %k2
3853; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm5, %ymm4 {%k2}
3854; AVX512DQ-BW-NEXT:    movw $19026, %ax # imm = 0x4A52
3855; AVX512DQ-BW-NEXT:    kmovd %eax, %k2
3856; AVX512DQ-BW-NEXT:    vpblendmw %ymm2, %ymm3, %ymm5 {%k2}
3857; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm5, %xmm6
3858; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u]
3859; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u]
3860; AVX512DQ-BW-NEXT:    vpor %xmm6, %xmm5, %xmm5
3861; AVX512DQ-BW-NEXT:    movl $67100672, %eax # imm = 0x3FFE000
3862; AVX512DQ-BW-NEXT:    kmovd %eax, %k3
3863; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm5 {%k3} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u]
3864; AVX512DQ-BW-NEXT:    vmovdqa 144(%rdi), %xmm6
3865; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[1,6,11]
3866; AVX512DQ-BW-NEXT:    vmovdqa 128(%rdi), %xmm7
3867; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
3868; AVX512DQ-BW-NEXT:    vpor %xmm4, %xmm8, %xmm4
3869; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
3870; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7],ymm5[8,9,10,11,12],ymm4[13,14,15]
3871; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
3872; AVX512DQ-BW-NEXT:    movw $10570, %ax # imm = 0x294A
3873; AVX512DQ-BW-NEXT:    kmovd %eax, %k4
3874; AVX512DQ-BW-NEXT:    vpblendmw %ymm0, %ymm1, %ymm5 {%k4}
3875; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm8 = ymm5[2,3,0,1]
3876; AVX512DQ-BW-NEXT:    movl $-2078212096, %eax # imm = 0x84210000
3877; AVX512DQ-BW-NEXT:    kmovd %eax, %k5
3878; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm8, %ymm5 {%k5}
3879; AVX512DQ-BW-NEXT:    vpblendmw %ymm2, %ymm3, %ymm8 {%k1}
3880; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm9 = xmm8[1,6,11],zero,zero,zero,zero,xmm8[4,9,14],zero,zero,zero,xmm8[u,u,u]
3881; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm8, %xmm8
3882; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13,u,u,u]
3883; AVX512DQ-BW-NEXT:    vpor %xmm9, %xmm8, %xmm8
3884; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm8 {%k3} = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u]
3885; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[2,7,12]
3886; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm9 = xmm7[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
3887; AVX512DQ-BW-NEXT:    vpor %xmm5, %xmm9, %xmm5
3888; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
3889; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7],ymm8[8,9,10,11,12],ymm5[13,14,15]
3890; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7]
3891; AVX512DQ-BW-NEXT:    vpblendmw %ymm0, %ymm1, %ymm8 {%k2}
3892; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1]
3893; AVX512DQ-BW-NEXT:    movl $138543104, %eax # imm = 0x8420000
3894; AVX512DQ-BW-NEXT:    kmovd %eax, %k5
3895; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm9, %ymm8 {%k5}
3896; AVX512DQ-BW-NEXT:    vpblendmw %ymm3, %ymm2, %ymm9 {%k4}
3897; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm9, %xmm10
3898; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[4,9,14,u,u,u]
3899; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[2,7,12],zero,zero,zero,xmm9[0,5,10,15],zero,zero,zero,xmm9[u,u,u]
3900; AVX512DQ-BW-NEXT:    vpor %xmm10, %xmm9, %xmm9
3901; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm9 {%k3} = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u]
3902; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm8 = xmm6[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[3,8,13]
3903; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm10 = xmm7[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
3904; AVX512DQ-BW-NEXT:    vpor %xmm8, %xmm10, %xmm8
3905; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
3906; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7],ymm9[8,9,10,11,12],ymm8[13,14,15]
3907; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
3908; AVX512DQ-BW-NEXT:    vpblendmw %ymm0, %ymm1, %ymm9 {%k1}
3909; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1]
3910; AVX512DQ-BW-NEXT:    movl $277086208, %eax # imm = 0x10840000
3911; AVX512DQ-BW-NEXT:    kmovd %eax, %k3
3912; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm10, %ymm9 {%k3}
3913; AVX512DQ-BW-NEXT:    vpblendmw %ymm3, %ymm2, %ymm10 {%k2}
3914; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm11 = xmm10[3,8,13],zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u]
3915; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm10, %xmm10
3916; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[2,7,12],zero,zero,zero,xmm10[0,5,10,15,u,u,u]
3917; AVX512DQ-BW-NEXT:    vpor %xmm11, %xmm10, %xmm10
3918; AVX512DQ-BW-NEXT:    movl $33546240, %eax # imm = 0x1FFE000
3919; AVX512DQ-BW-NEXT:    kmovd %eax, %k3
3920; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm10 {%k3} = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
3921; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm6[4,9,14]
3922; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
3923; AVX512DQ-BW-NEXT:    vpor %xmm6, %xmm7, %xmm6
3924; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
3925; AVX512DQ-BW-NEXT:    movl $-33554432, %eax # imm = 0xFE000000
3926; AVX512DQ-BW-NEXT:    kmovd %eax, %k3
3927; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm6, %ymm10 {%k3}
3928; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm3, %ymm2 {%k1}
3929; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm2, %xmm3
3930; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u]
3931; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u]
3932; AVX512DQ-BW-NEXT:    vpor %xmm3, %xmm2, %xmm2
3933; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm1, %ymm0 {%k2}
3934; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
3935; AVX512DQ-BW-NEXT:    movl $554172416, %eax # imm = 0x21080000
3936; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
3937; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm1, %ymm0 {%k1}
3938; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
3939; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
3940; AVX512DQ-BW-NEXT:    vmovdqa 128(%rdi), %ymm1
3941; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
3942; AVX512DQ-BW-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [0,5,0,5,0,5,0,5]
3943; AVX512DQ-BW-NEXT:    vpermd %ymm1, %ymm2, %ymm1
3944; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm1, %ymm0 {%k3}
3945; AVX512DQ-BW-NEXT:    vmovdqa %ymm4, (%rsi)
3946; AVX512DQ-BW-NEXT:    vmovdqa %ymm5, (%rdx)
3947; AVX512DQ-BW-NEXT:    vmovdqa %ymm8, (%rcx)
3948; AVX512DQ-BW-NEXT:    vmovdqa %ymm10, (%r8)
3949; AVX512DQ-BW-NEXT:    vmovdqa %ymm0, (%r9)
3950; AVX512DQ-BW-NEXT:    vzeroupper
3951; AVX512DQ-BW-NEXT:    retq
3952;
3953; AVX512DQ-BW-FCP-LABEL: load_i8_stride5_vf32:
3954; AVX512DQ-BW-FCP:       # %bb.0:
3955; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %ymm3
3956; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
3957; AVX512DQ-BW-FCP-NEXT:    vmovdqa 64(%rdi), %ymm0
3958; AVX512DQ-BW-FCP-NEXT:    vmovdqa 96(%rdi), %ymm1
3959; AVX512DQ-BW-FCP-NEXT:    movw $21140, %ax # imm = 0x5294
3960; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
3961; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm1, %ymm0, %ymm4 {%k1}
3962; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
3963; AVX512DQ-BW-FCP-NEXT:    movl $1108344832, %eax # imm = 0x42100000
3964; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k2
3965; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm5, %ymm4 {%k2}
3966; AVX512DQ-BW-FCP-NEXT:    movw $19026, %ax # imm = 0x4A52
3967; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k2
3968; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm2, %ymm3, %ymm5 {%k2}
3969; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm6
3970; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u]
3971; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u]
3972; AVX512DQ-BW-FCP-NEXT:    vpor %xmm6, %xmm5, %xmm5
3973; AVX512DQ-BW-FCP-NEXT:    movl $67100672, %eax # imm = 0x3FFE000
3974; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k3
3975; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm5 {%k3} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u]
3976; AVX512DQ-BW-FCP-NEXT:    vmovdqa 144(%rdi), %xmm6
3977; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[1,6,11]
3978; AVX512DQ-BW-FCP-NEXT:    vmovdqa 128(%rdi), %xmm7
3979; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
3980; AVX512DQ-BW-FCP-NEXT:    vpor %xmm4, %xmm8, %xmm4
3981; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
3982; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7],ymm5[8,9,10,11,12],ymm4[13,14,15]
3983; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
3984; AVX512DQ-BW-FCP-NEXT:    movw $10570, %ax # imm = 0x294A
3985; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k4
3986; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm0, %ymm1, %ymm5 {%k4}
3987; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm5[2,3,0,1]
3988; AVX512DQ-BW-FCP-NEXT:    movl $-2078212096, %eax # imm = 0x84210000
3989; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k5
3990; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm8, %ymm5 {%k5}
3991; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm2, %ymm3, %ymm8 {%k1}
3992; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm8[1,6,11],zero,zero,zero,zero,xmm8[4,9,14],zero,zero,zero,xmm8[u,u,u]
3993; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm8
3994; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13,u,u,u]
3995; AVX512DQ-BW-FCP-NEXT:    vpor %xmm9, %xmm8, %xmm8
3996; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm8 {%k3} = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u]
3997; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[2,7,12]
3998; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm7[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
3999; AVX512DQ-BW-FCP-NEXT:    vpor %xmm5, %xmm9, %xmm5
4000; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
4001; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7],ymm8[8,9,10,11,12],ymm5[13,14,15]
4002; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7]
4003; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm0, %ymm1, %ymm8 {%k2}
4004; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1]
4005; AVX512DQ-BW-FCP-NEXT:    movl $138543104, %eax # imm = 0x8420000
4006; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k5
4007; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm9, %ymm8 {%k5}
4008; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm3, %ymm2, %ymm9 {%k4}
4009; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm10
4010; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[4,9,14,u,u,u]
4011; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[2,7,12],zero,zero,zero,xmm9[0,5,10,15],zero,zero,zero,xmm9[u,u,u]
4012; AVX512DQ-BW-FCP-NEXT:    vpor %xmm10, %xmm9, %xmm9
4013; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm9 {%k3} = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u]
4014; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm6[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[3,8,13]
4015; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm7[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
4016; AVX512DQ-BW-FCP-NEXT:    vpor %xmm8, %xmm10, %xmm8
4017; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
4018; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7],ymm9[8,9,10,11,12],ymm8[13,14,15]
4019; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
4020; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm0, %ymm1, %ymm9 {%k1}
4021; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1]
4022; AVX512DQ-BW-FCP-NEXT:    movl $277086208, %eax # imm = 0x10840000
4023; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k3
4024; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm10, %ymm9 {%k3}
4025; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm3, %ymm2, %ymm10 {%k2}
4026; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm10[3,8,13],zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u]
4027; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm10
4028; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[2,7,12],zero,zero,zero,xmm10[0,5,10,15,u,u,u]
4029; AVX512DQ-BW-FCP-NEXT:    vpor %xmm11, %xmm10, %xmm10
4030; AVX512DQ-BW-FCP-NEXT:    movl $33546240, %eax # imm = 0x1FFE000
4031; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k3
4032; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm10 {%k3} = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
4033; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm6[4,9,14]
4034; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
4035; AVX512DQ-BW-FCP-NEXT:    vpor %xmm6, %xmm7, %xmm6
4036; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
4037; AVX512DQ-BW-FCP-NEXT:    movl $-33554432, %eax # imm = 0xFE000000
4038; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k3
4039; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm6, %ymm10 {%k3}
4040; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm3, %ymm2 {%k1}
4041; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
4042; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u]
4043; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u]
4044; AVX512DQ-BW-FCP-NEXT:    vpor %xmm3, %xmm2, %xmm2
4045; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm1, %ymm0 {%k2}
4046; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
4047; AVX512DQ-BW-FCP-NEXT:    movl $554172416, %eax # imm = 0x21080000
4048; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
4049; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm1, %ymm0 {%k1}
4050; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
4051; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
4052; AVX512DQ-BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm1
4053; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
4054; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [0,5,0,5,0,5,0,5]
4055; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm1, %ymm2, %ymm1
4056; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm1, %ymm0 {%k3}
4057; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm4, (%rsi)
4058; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm5, (%rdx)
4059; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm8, (%rcx)
4060; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm10, (%r8)
4061; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm0, (%r9)
4062; AVX512DQ-BW-FCP-NEXT:    vzeroupper
4063; AVX512DQ-BW-FCP-NEXT:    retq
4064  %wide.vec = load <160 x i8>, ptr %in.vec, align 64
4065  %strided.vec0 = shufflevector <160 x i8> %wide.vec, <160 x i8> poison, <32 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35, i32 40, i32 45, i32 50, i32 55, i32 60, i32 65, i32 70, i32 75, i32 80, i32 85, i32 90, i32 95, i32 100, i32 105, i32 110, i32 115, i32 120, i32 125, i32 130, i32 135, i32 140, i32 145, i32 150, i32 155>
4066  %strided.vec1 = shufflevector <160 x i8> %wide.vec, <160 x i8> poison, <32 x i32> <i32 1, i32 6, i32 11, i32 16, i32 21, i32 26, i32 31, i32 36, i32 41, i32 46, i32 51, i32 56, i32 61, i32 66, i32 71, i32 76, i32 81, i32 86, i32 91, i32 96, i32 101, i32 106, i32 111, i32 116, i32 121, i32 126, i32 131, i32 136, i32 141, i32 146, i32 151, i32 156>
4067  %strided.vec2 = shufflevector <160 x i8> %wide.vec, <160 x i8> poison, <32 x i32> <i32 2, i32 7, i32 12, i32 17, i32 22, i32 27, i32 32, i32 37, i32 42, i32 47, i32 52, i32 57, i32 62, i32 67, i32 72, i32 77, i32 82, i32 87, i32 92, i32 97, i32 102, i32 107, i32 112, i32 117, i32 122, i32 127, i32 132, i32 137, i32 142, i32 147, i32 152, i32 157>
4068  %strided.vec3 = shufflevector <160 x i8> %wide.vec, <160 x i8> poison, <32 x i32> <i32 3, i32 8, i32 13, i32 18, i32 23, i32 28, i32 33, i32 38, i32 43, i32 48, i32 53, i32 58, i32 63, i32 68, i32 73, i32 78, i32 83, i32 88, i32 93, i32 98, i32 103, i32 108, i32 113, i32 118, i32 123, i32 128, i32 133, i32 138, i32 143, i32 148, i32 153, i32 158>
4069  %strided.vec4 = shufflevector <160 x i8> %wide.vec, <160 x i8> poison, <32 x i32> <i32 4, i32 9, i32 14, i32 19, i32 24, i32 29, i32 34, i32 39, i32 44, i32 49, i32 54, i32 59, i32 64, i32 69, i32 74, i32 79, i32 84, i32 89, i32 94, i32 99, i32 104, i32 109, i32 114, i32 119, i32 124, i32 129, i32 134, i32 139, i32 144, i32 149, i32 154, i32 159>
4070  store <32 x i8> %strided.vec0, ptr %out.vec0, align 64
4071  store <32 x i8> %strided.vec1, ptr %out.vec1, align 64
4072  store <32 x i8> %strided.vec2, ptr %out.vec2, align 64
4073  store <32 x i8> %strided.vec3, ptr %out.vec3, align 64
4074  store <32 x i8> %strided.vec4, ptr %out.vec4, align 64
4075  ret void
4076}
4077
4078define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind {
4079; SSE-LABEL: load_i8_stride5_vf64:
4080; SSE:       # %bb.0:
4081; SSE-NEXT:    subq $552, %rsp # imm = 0x228
4082; SSE-NEXT:    movdqa 160(%rdi), %xmm9
4083; SSE-NEXT:    movdqa 176(%rdi), %xmm3
4084; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4085; SSE-NEXT:    movdqa 208(%rdi), %xmm4
4086; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4087; SSE-NEXT:    movdqa 192(%rdi), %xmm1
4088; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4089; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
4090; SSE-NEXT:    movdqa %xmm2, %xmm0
4091; SSE-NEXT:    pandn %xmm1, %xmm0
4092; SSE-NEXT:    movdqa %xmm4, %xmm1
4093; SSE-NEXT:    pand %xmm2, %xmm1
4094; SSE-NEXT:    movdqa %xmm2, %xmm14
4095; SSE-NEXT:    por %xmm0, %xmm1
4096; SSE-NEXT:    pxor %xmm12, %xmm12
4097; SSE-NEXT:    movdqa %xmm1, %xmm0
4098; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
4099; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
4100; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
4101; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm12[8],xmm1[9],xmm12[9],xmm1[10],xmm12[10],xmm1[11],xmm12[11],xmm1[12],xmm12[12],xmm1[13],xmm12[13],xmm1[14],xmm12[14],xmm1[15],xmm12[15]
4102; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
4103; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
4104; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
4105; SSE-NEXT:    packuswb %xmm1, %xmm0
4106; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,3]
4107; SSE-NEXT:    movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255]
4108; SSE-NEXT:    movdqa %xmm11, %xmm1
4109; SSE-NEXT:    pandn %xmm0, %xmm1
4110; SSE-NEXT:    movdqa {{.*#+}} xmm10 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
4111; SSE-NEXT:    movdqa %xmm10, %xmm0
4112; SSE-NEXT:    pandn %xmm3, %xmm0
4113; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
4114; SSE-NEXT:    movdqa %xmm2, %xmm3
4115; SSE-NEXT:    movdqa %xmm2, %xmm4
4116; SSE-NEXT:    pandn %xmm9, %xmm3
4117; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4118; SSE-NEXT:    movdqa {{.*#+}} xmm7 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255]
4119; SSE-NEXT:    movdqa %xmm7, %xmm3
4120; SSE-NEXT:    pandn %xmm9, %xmm3
4121; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4122; SSE-NEXT:    movdqa %xmm14, %xmm2
4123; SSE-NEXT:    pandn %xmm9, %xmm2
4124; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4125; SSE-NEXT:    movdqa %xmm10, %xmm2
4126; SSE-NEXT:    pandn %xmm9, %xmm2
4127; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4128; SSE-NEXT:    pand %xmm10, %xmm9
4129; SSE-NEXT:    por %xmm0, %xmm9
4130; SSE-NEXT:    movdqa %xmm9, %xmm0
4131; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
4132; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [0,65535,65535,65535,0,0,65535,65535]
4133; SSE-NEXT:    movdqa %xmm8, %xmm2
4134; SSE-NEXT:    pandn %xmm0, %xmm2
4135; SSE-NEXT:    punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15]
4136; SSE-NEXT:    pand %xmm8, %xmm9
4137; SSE-NEXT:    por %xmm2, %xmm9
4138; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm9[0,2,1,3,4,5,6,7]
4139; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7]
4140; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
4141; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
4142; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7]
4143; SSE-NEXT:    packuswb %xmm0, %xmm0
4144; SSE-NEXT:    pand %xmm11, %xmm0
4145; SSE-NEXT:    por %xmm1, %xmm0
4146; SSE-NEXT:    movdqa 224(%rdi), %xmm3
4147; SSE-NEXT:    movdqa %xmm3, %xmm2
4148; SSE-NEXT:    pxor %xmm1, %xmm1
4149; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
4150; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4151; SSE-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
4152; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4153; SSE-NEXT:    pxor %xmm9, %xmm9
4154; SSE-NEXT:    movdqa %xmm3, %xmm1
4155; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0]
4156; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3]
4157; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
4158; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
4159; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4]
4160; SSE-NEXT:    packuswb %xmm1, %xmm1
4161; SSE-NEXT:    movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0]
4162; SSE-NEXT:    movdqa %xmm6, %xmm2
4163; SSE-NEXT:    pandn %xmm1, %xmm2
4164; SSE-NEXT:    pand %xmm6, %xmm0
4165; SSE-NEXT:    por %xmm0, %xmm2
4166; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4167; SSE-NEXT:    movdqa 32(%rdi), %xmm1
4168; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4169; SSE-NEXT:    movdqa %xmm14, %xmm0
4170; SSE-NEXT:    pandn %xmm1, %xmm0
4171; SSE-NEXT:    movdqa 48(%rdi), %xmm15
4172; SSE-NEXT:    movdqa %xmm15, %xmm1
4173; SSE-NEXT:    pand %xmm14, %xmm1
4174; SSE-NEXT:    por %xmm0, %xmm1
4175; SSE-NEXT:    movdqa %xmm1, %xmm0
4176; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
4177; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
4178; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
4179; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15]
4180; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
4181; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
4182; SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
4183; SSE-NEXT:    packuswb %xmm1, %xmm0
4184; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,3]
4185; SSE-NEXT:    movdqa %xmm11, %xmm1
4186; SSE-NEXT:    pandn %xmm0, %xmm1
4187; SSE-NEXT:    movdqa 16(%rdi), %xmm0
4188; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4189; SSE-NEXT:    movdqa %xmm10, %xmm2
4190; SSE-NEXT:    pandn %xmm0, %xmm2
4191; SSE-NEXT:    movdqa (%rdi), %xmm3
4192; SSE-NEXT:    movdqa %xmm4, %xmm0
4193; SSE-NEXT:    pandn %xmm3, %xmm4
4194; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4195; SSE-NEXT:    movdqa %xmm7, %xmm4
4196; SSE-NEXT:    pandn %xmm3, %xmm4
4197; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4198; SSE-NEXT:    movdqa %xmm14, %xmm4
4199; SSE-NEXT:    pandn %xmm3, %xmm4
4200; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4201; SSE-NEXT:    movdqa %xmm10, %xmm4
4202; SSE-NEXT:    pandn %xmm3, %xmm4
4203; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4204; SSE-NEXT:    pand %xmm10, %xmm3
4205; SSE-NEXT:    por %xmm2, %xmm3
4206; SSE-NEXT:    movdqa %xmm3, %xmm2
4207; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
4208; SSE-NEXT:    movdqa %xmm8, %xmm4
4209; SSE-NEXT:    pandn %xmm2, %xmm4
4210; SSE-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15]
4211; SSE-NEXT:    pand %xmm8, %xmm3
4212; SSE-NEXT:    por %xmm4, %xmm3
4213; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm3[0,2,1,3,4,5,6,7]
4214; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7]
4215; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
4216; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,2,1,3,4,5,6,7]
4217; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7]
4218; SSE-NEXT:    packuswb %xmm2, %xmm2
4219; SSE-NEXT:    pand %xmm11, %xmm2
4220; SSE-NEXT:    por %xmm1, %xmm2
4221; SSE-NEXT:    movdqa 64(%rdi), %xmm1
4222; SSE-NEXT:    movdqa %xmm1, %xmm3
4223; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
4224; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4225; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15]
4226; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4227; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm3[0,0]
4228; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[2,3]
4229; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
4230; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
4231; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4]
4232; SSE-NEXT:    packuswb %xmm1, %xmm1
4233; SSE-NEXT:    movdqa %xmm6, %xmm3
4234; SSE-NEXT:    pandn %xmm1, %xmm3
4235; SSE-NEXT:    pand %xmm6, %xmm2
4236; SSE-NEXT:    por %xmm2, %xmm3
4237; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4238; SSE-NEXT:    movdqa 272(%rdi), %xmm2
4239; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4240; SSE-NEXT:    movdqa %xmm14, %xmm1
4241; SSE-NEXT:    pandn %xmm2, %xmm1
4242; SSE-NEXT:    movdqa 288(%rdi), %xmm13
4243; SSE-NEXT:    movdqa %xmm13, %xmm2
4244; SSE-NEXT:    pand %xmm14, %xmm2
4245; SSE-NEXT:    por %xmm1, %xmm2
4246; SSE-NEXT:    movdqa %xmm2, %xmm1
4247; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
4248; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,1,1,3]
4249; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7]
4250; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15]
4251; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0]
4252; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7]
4253; SSE-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
4254; SSE-NEXT:    packuswb %xmm2, %xmm1
4255; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,3]
4256; SSE-NEXT:    movdqa %xmm11, %xmm2
4257; SSE-NEXT:    pandn %xmm1, %xmm2
4258; SSE-NEXT:    movdqa 256(%rdi), %xmm1
4259; SSE-NEXT:    movdqa %xmm1, (%rsp) # 16-byte Spill
4260; SSE-NEXT:    movdqa %xmm10, %xmm4
4261; SSE-NEXT:    pandn %xmm1, %xmm4
4262; SSE-NEXT:    movdqa 240(%rdi), %xmm3
4263; SSE-NEXT:    movdqa %xmm0, %xmm1
4264; SSE-NEXT:    pandn %xmm3, %xmm1
4265; SSE-NEXT:    pandn %xmm3, %xmm7
4266; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4267; SSE-NEXT:    movdqa %xmm14, %xmm7
4268; SSE-NEXT:    pandn %xmm3, %xmm7
4269; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4270; SSE-NEXT:    movdqa %xmm10, %xmm7
4271; SSE-NEXT:    pandn %xmm3, %xmm7
4272; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4273; SSE-NEXT:    pand %xmm10, %xmm3
4274; SSE-NEXT:    por %xmm4, %xmm3
4275; SSE-NEXT:    movdqa %xmm3, %xmm4
4276; SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
4277; SSE-NEXT:    movdqa %xmm8, %xmm7
4278; SSE-NEXT:    pandn %xmm4, %xmm7
4279; SSE-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15]
4280; SSE-NEXT:    pand %xmm8, %xmm3
4281; SSE-NEXT:    por %xmm7, %xmm3
4282; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,2,1,3,4,5,6,7]
4283; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,7]
4284; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,3,2,1]
4285; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,2,1,3,4,5,6,7]
4286; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,7]
4287; SSE-NEXT:    packuswb %xmm3, %xmm3
4288; SSE-NEXT:    pand %xmm11, %xmm3
4289; SSE-NEXT:    por %xmm2, %xmm3
4290; SSE-NEXT:    movdqa 304(%rdi), %xmm2
4291; SSE-NEXT:    movdqa %xmm2, %xmm4
4292; SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
4293; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4294; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15]
4295; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4296; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,0],xmm4[0,0]
4297; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[2,3]
4298; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
4299; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
4300; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,4]
4301; SSE-NEXT:    packuswb %xmm2, %xmm2
4302; SSE-NEXT:    movdqa %xmm6, %xmm4
4303; SSE-NEXT:    pandn %xmm2, %xmm4
4304; SSE-NEXT:    pand %xmm6, %xmm3
4305; SSE-NEXT:    por %xmm3, %xmm4
4306; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4307; SSE-NEXT:    movdqa 112(%rdi), %xmm3
4308; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4309; SSE-NEXT:    movdqa %xmm14, %xmm2
4310; SSE-NEXT:    pandn %xmm3, %xmm2
4311; SSE-NEXT:    movdqa 128(%rdi), %xmm3
4312; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4313; SSE-NEXT:    pand %xmm14, %xmm3
4314; SSE-NEXT:    por %xmm2, %xmm3
4315; SSE-NEXT:    movdqa %xmm3, %xmm2
4316; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
4317; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,1,1,3]
4318; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7]
4319; SSE-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15]
4320; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0]
4321; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7]
4322; SSE-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
4323; SSE-NEXT:    packuswb %xmm3, %xmm2
4324; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,2,3]
4325; SSE-NEXT:    movdqa %xmm11, %xmm3
4326; SSE-NEXT:    pandn %xmm2, %xmm3
4327; SSE-NEXT:    movdqa 96(%rdi), %xmm4
4328; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4329; SSE-NEXT:    movdqa %xmm10, %xmm2
4330; SSE-NEXT:    pandn %xmm4, %xmm2
4331; SSE-NEXT:    movdqa 80(%rdi), %xmm4
4332; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4333; SSE-NEXT:    pand %xmm10, %xmm4
4334; SSE-NEXT:    por %xmm2, %xmm4
4335; SSE-NEXT:    movdqa %xmm4, %xmm2
4336; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
4337; SSE-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
4338; SSE-NEXT:    pand %xmm8, %xmm4
4339; SSE-NEXT:    pandn %xmm2, %xmm8
4340; SSE-NEXT:    por %xmm4, %xmm8
4341; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm8[0,2,1,3,4,5,6,7]
4342; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7]
4343; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
4344; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,2,1,3,4,5,6,7]
4345; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7]
4346; SSE-NEXT:    packuswb %xmm2, %xmm2
4347; SSE-NEXT:    pand %xmm11, %xmm2
4348; SSE-NEXT:    por %xmm3, %xmm2
4349; SSE-NEXT:    movdqa 144(%rdi), %xmm12
4350; SSE-NEXT:    movdqa %xmm12, %xmm4
4351; SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
4352; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4353; SSE-NEXT:    punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15]
4354; SSE-NEXT:    movdqa %xmm12, %xmm3
4355; SSE-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4356; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm4[0,0]
4357; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[2,3]
4358; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
4359; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
4360; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,4]
4361; SSE-NEXT:    packuswb %xmm3, %xmm3
4362; SSE-NEXT:    movdqa %xmm6, %xmm14
4363; SSE-NEXT:    movdqa %xmm6, %xmm4
4364; SSE-NEXT:    pandn %xmm3, %xmm4
4365; SSE-NEXT:    pand %xmm6, %xmm2
4366; SSE-NEXT:    por %xmm2, %xmm4
4367; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4368; SSE-NEXT:    movdqa %xmm10, %xmm2
4369; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4370; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4371; SSE-NEXT:    pand %xmm10, %xmm3
4372; SSE-NEXT:    por %xmm2, %xmm3
4373; SSE-NEXT:    movdqa %xmm3, %xmm2
4374; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15]
4375; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
4376; SSE-NEXT:    movdqa %xmm3, %xmm4
4377; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[0,0]
4378; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[2,3]
4379; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2,1,3]
4380; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,6,5,6,7]
4381; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,1]
4382; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7]
4383; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
4384; SSE-NEXT:    psllq $48, %xmm3
4385; SSE-NEXT:    packuswb %xmm2, %xmm3
4386; SSE-NEXT:    movdqa %xmm11, %xmm4
4387; SSE-NEXT:    pandn %xmm3, %xmm4
4388; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4389; SSE-NEXT:    movdqa %xmm6, %xmm3
4390; SSE-NEXT:    pand %xmm0, %xmm3
4391; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
4392; SSE-NEXT:    movdqa %xmm3, %xmm7
4393; SSE-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15]
4394; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,0,0,65535,65535,65535,0]
4395; SSE-NEXT:    movdqa %xmm2, %xmm8
4396; SSE-NEXT:    pandn %xmm7, %xmm8
4397; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
4398; SSE-NEXT:    pand %xmm2, %xmm3
4399; SSE-NEXT:    por %xmm8, %xmm3
4400; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3]
4401; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5]
4402; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,3,2,1]
4403; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[1,2,3,0,4,5,6,7]
4404; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,4,5,7]
4405; SSE-NEXT:    packuswb %xmm3, %xmm3
4406; SSE-NEXT:    pand %xmm11, %xmm3
4407; SSE-NEXT:    por %xmm4, %xmm3
4408; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4409; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
4410; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[2,0],xmm7[3,0]
4411; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,1],xmm4[0,2]
4412; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,7,6,7]
4413; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
4414; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,4]
4415; SSE-NEXT:    packuswb %xmm4, %xmm4
4416; SSE-NEXT:    movdqa %xmm14, %xmm7
4417; SSE-NEXT:    pandn %xmm4, %xmm7
4418; SSE-NEXT:    pand %xmm14, %xmm3
4419; SSE-NEXT:    por %xmm3, %xmm7
4420; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4421; SSE-NEXT:    movdqa %xmm10, %xmm3
4422; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
4423; SSE-NEXT:    movdqa %xmm15, %xmm4
4424; SSE-NEXT:    pand %xmm10, %xmm4
4425; SSE-NEXT:    movdqa %xmm10, %xmm5
4426; SSE-NEXT:    por %xmm3, %xmm4
4427; SSE-NEXT:    movdqa %xmm4, %xmm3
4428; SSE-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15]
4429; SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
4430; SSE-NEXT:    movdqa %xmm4, %xmm7
4431; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[1,0],xmm3[0,0]
4432; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[2,0],xmm3[2,3]
4433; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,2,1,3]
4434; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,6,5,6,7]
4435; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,1]
4436; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[1,2,3,0,4,5,6,7]
4437; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7]
4438; SSE-NEXT:    psllq $48, %xmm4
4439; SSE-NEXT:    packuswb %xmm3, %xmm4
4440; SSE-NEXT:    movdqa %xmm11, %xmm3
4441; SSE-NEXT:    pandn %xmm4, %xmm3
4442; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
4443; SSE-NEXT:    movdqa %xmm10, %xmm4
4444; SSE-NEXT:    movdqa %xmm0, %xmm8
4445; SSE-NEXT:    pand %xmm0, %xmm4
4446; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4447; SSE-NEXT:    movdqa %xmm4, %xmm0
4448; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15]
4449; SSE-NEXT:    movdqa %xmm2, %xmm7
4450; SSE-NEXT:    pandn %xmm0, %xmm7
4451; SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
4452; SSE-NEXT:    pand %xmm2, %xmm4
4453; SSE-NEXT:    por %xmm7, %xmm4
4454; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,2,1,3]
4455; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
4456; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
4457; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7]
4458; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,5,7]
4459; SSE-NEXT:    packuswb %xmm0, %xmm0
4460; SSE-NEXT:    pand %xmm11, %xmm0
4461; SSE-NEXT:    por %xmm3, %xmm0
4462; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4463; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4464; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[3,0]
4465; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[0,2]
4466; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,7,6,7]
4467; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
4468; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4]
4469; SSE-NEXT:    packuswb %xmm3, %xmm3
4470; SSE-NEXT:    movdqa %xmm14, %xmm4
4471; SSE-NEXT:    pandn %xmm3, %xmm4
4472; SSE-NEXT:    pand %xmm14, %xmm0
4473; SSE-NEXT:    por %xmm0, %xmm4
4474; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4475; SSE-NEXT:    movdqa %xmm5, %xmm0
4476; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4477; SSE-NEXT:    movdqa %xmm13, %xmm3
4478; SSE-NEXT:    pand %xmm5, %xmm3
4479; SSE-NEXT:    por %xmm0, %xmm3
4480; SSE-NEXT:    movdqa %xmm3, %xmm0
4481; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15]
4482; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
4483; SSE-NEXT:    movdqa %xmm3, %xmm4
4484; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,0],xmm0[0,0]
4485; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,3]
4486; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2,1,3]
4487; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,6,5,6,7]
4488; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,1]
4489; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7]
4490; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
4491; SSE-NEXT:    psllq $48, %xmm3
4492; SSE-NEXT:    packuswb %xmm0, %xmm3
4493; SSE-NEXT:    movdqa %xmm11, %xmm0
4494; SSE-NEXT:    pandn %xmm3, %xmm0
4495; SSE-NEXT:    movdqa (%rsp), %xmm3 # 16-byte Reload
4496; SSE-NEXT:    pand %xmm8, %xmm3
4497; SSE-NEXT:    movdqa %xmm8, %xmm7
4498; SSE-NEXT:    por %xmm1, %xmm3
4499; SSE-NEXT:    movdqa %xmm3, %xmm1
4500; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15]
4501; SSE-NEXT:    movdqa %xmm2, %xmm4
4502; SSE-NEXT:    pandn %xmm1, %xmm4
4503; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
4504; SSE-NEXT:    pand %xmm2, %xmm3
4505; SSE-NEXT:    por %xmm4, %xmm3
4506; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[0,2,1,3]
4507; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
4508; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
4509; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7]
4510; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,4,5,7]
4511; SSE-NEXT:    packuswb %xmm1, %xmm1
4512; SSE-NEXT:    pand %xmm11, %xmm1
4513; SSE-NEXT:    por %xmm0, %xmm1
4514; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4515; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4516; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[3,0]
4517; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[0,2]
4518; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,7,6,7]
4519; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
4520; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
4521; SSE-NEXT:    packuswb %xmm0, %xmm0
4522; SSE-NEXT:    movdqa %xmm14, %xmm3
4523; SSE-NEXT:    pandn %xmm0, %xmm3
4524; SSE-NEXT:    pand %xmm14, %xmm1
4525; SSE-NEXT:    por %xmm1, %xmm3
4526; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4527; SSE-NEXT:    movdqa %xmm5, %xmm1
4528; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4529; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4530; SSE-NEXT:    pand %xmm5, %xmm0
4531; SSE-NEXT:    movdqa %xmm5, %xmm8
4532; SSE-NEXT:    por %xmm1, %xmm0
4533; SSE-NEXT:    movdqa %xmm0, %xmm1
4534; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15]
4535; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
4536; SSE-NEXT:    movdqa %xmm0, %xmm3
4537; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm1[0,0]
4538; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3]
4539; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2,1,3]
4540; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,5,6,7]
4541; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,1]
4542; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7]
4543; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
4544; SSE-NEXT:    psllq $48, %xmm0
4545; SSE-NEXT:    packuswb %xmm1, %xmm0
4546; SSE-NEXT:    movdqa %xmm7, %xmm1
4547; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4548; SSE-NEXT:    pandn %xmm5, %xmm1
4549; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4550; SSE-NEXT:    pand %xmm7, %xmm3
4551; SSE-NEXT:    por %xmm1, %xmm3
4552; SSE-NEXT:    movdqa %xmm3, %xmm1
4553; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15]
4554; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
4555; SSE-NEXT:    pand %xmm2, %xmm3
4556; SSE-NEXT:    pandn %xmm1, %xmm2
4557; SSE-NEXT:    por %xmm3, %xmm2
4558; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,1,3]
4559; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
4560; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
4561; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7]
4562; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,4,5,7]
4563; SSE-NEXT:    packuswb %xmm1, %xmm1
4564; SSE-NEXT:    pand %xmm11, %xmm1
4565; SSE-NEXT:    pandn %xmm0, %xmm11
4566; SSE-NEXT:    por %xmm11, %xmm1
4567; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4568; SSE-NEXT:    shufps {{.*#+}} xmm12 = xmm12[2,0],xmm2[3,0]
4569; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm12[0,2]
4570; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,7,6,7]
4571; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
4572; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
4573; SSE-NEXT:    packuswb %xmm0, %xmm0
4574; SSE-NEXT:    movdqa %xmm14, %xmm2
4575; SSE-NEXT:    pandn %xmm0, %xmm2
4576; SSE-NEXT:    pand %xmm14, %xmm1
4577; SSE-NEXT:    por %xmm1, %xmm2
4578; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4579; SSE-NEXT:    movdqa %xmm6, %xmm1
4580; SSE-NEXT:    movdqa {{.*#+}} xmm11 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255]
4581; SSE-NEXT:    pand %xmm11, %xmm1
4582; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4583; SSE-NEXT:    movdqa %xmm1, %xmm2
4584; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15]
4585; SSE-NEXT:    movdqa {{.*#+}} xmm6 = [65535,65535,65535,0,0,65535,65535,65535]
4586; SSE-NEXT:    movdqa %xmm6, %xmm3
4587; SSE-NEXT:    pandn %xmm2, %xmm3
4588; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
4589; SSE-NEXT:    pand %xmm6, %xmm1
4590; SSE-NEXT:    por %xmm3, %xmm1
4591; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
4592; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
4593; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
4594; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
4595; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
4596; SSE-NEXT:    packuswb %xmm1, %xmm1
4597; SSE-NEXT:    movdqa {{.*#+}} xmm12 = [0,0,0,65535,65535,65535,65535,65535]
4598; SSE-NEXT:    movdqa %xmm12, %xmm2
4599; SSE-NEXT:    pandn %xmm1, %xmm2
4600; SSE-NEXT:    movdqa %xmm8, %xmm1
4601; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4602; SSE-NEXT:    movdqa %xmm7, %xmm0
4603; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4604; SSE-NEXT:    pandn %xmm4, %xmm0
4605; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4606; SSE-NEXT:    movdqa %xmm11, %xmm3
4607; SSE-NEXT:    pandn %xmm4, %xmm3
4608; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4609; SSE-NEXT:    pand %xmm8, %xmm4
4610; SSE-NEXT:    por %xmm1, %xmm4
4611; SSE-NEXT:    movdqa %xmm4, %xmm1
4612; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15]
4613; SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
4614; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[0,1,2,0]
4615; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[3,0]
4616; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0,2]
4617; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
4618; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
4619; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
4620; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
4621; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6]
4622; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,5,6,5]
4623; SSE-NEXT:    packuswb %xmm1, %xmm4
4624; SSE-NEXT:    pand %xmm12, %xmm4
4625; SSE-NEXT:    por %xmm2, %xmm4
4626; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4627; SSE-NEXT:    # xmm1 = mem[1,1,1,1]
4628; SSE-NEXT:    pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4629; SSE-NEXT:    # xmm2 = mem[0,2,2,3]
4630; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4631; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,3,2,3,4,5,6,7]
4632; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
4633; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
4634; SSE-NEXT:    packuswb %xmm1, %xmm1
4635; SSE-NEXT:    movdqa %xmm14, %xmm3
4636; SSE-NEXT:    pandn %xmm1, %xmm3
4637; SSE-NEXT:    pand %xmm14, %xmm4
4638; SSE-NEXT:    por %xmm4, %xmm3
4639; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4640; SSE-NEXT:    pand %xmm11, %xmm10
4641; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
4642; SSE-NEXT:    movdqa %xmm10, %xmm2
4643; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15]
4644; SSE-NEXT:    movdqa %xmm6, %xmm4
4645; SSE-NEXT:    pandn %xmm2, %xmm4
4646; SSE-NEXT:    punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
4647; SSE-NEXT:    pand %xmm6, %xmm10
4648; SSE-NEXT:    por %xmm4, %xmm10
4649; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm10[2,1,2,3,4,5,6,7]
4650; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
4651; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
4652; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
4653; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
4654; SSE-NEXT:    packuswb %xmm1, %xmm1
4655; SSE-NEXT:    movdqa %xmm12, %xmm2
4656; SSE-NEXT:    pandn %xmm1, %xmm2
4657; SSE-NEXT:    movdqa %xmm8, %xmm1
4658; SSE-NEXT:    pandn %xmm15, %xmm1
4659; SSE-NEXT:    movdqa {{.*#+}} xmm10 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
4660; SSE-NEXT:    movdqa %xmm10, %xmm0
4661; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4662; SSE-NEXT:    pandn %xmm4, %xmm0
4663; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4664; SSE-NEXT:    movdqa %xmm11, %xmm3
4665; SSE-NEXT:    pandn %xmm4, %xmm3
4666; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4667; SSE-NEXT:    pand %xmm8, %xmm4
4668; SSE-NEXT:    por %xmm1, %xmm4
4669; SSE-NEXT:    movdqa %xmm4, %xmm1
4670; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15]
4671; SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
4672; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[0,1,2,0]
4673; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[3,0]
4674; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0,2]
4675; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
4676; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
4677; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
4678; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
4679; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6]
4680; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,5,6,5]
4681; SSE-NEXT:    packuswb %xmm1, %xmm4
4682; SSE-NEXT:    pand %xmm12, %xmm4
4683; SSE-NEXT:    por %xmm2, %xmm4
4684; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4685; SSE-NEXT:    # xmm1 = mem[1,1,1,1]
4686; SSE-NEXT:    pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4687; SSE-NEXT:    # xmm2 = mem[0,2,2,3]
4688; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4689; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,3,2,3,4,5,6,7]
4690; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
4691; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
4692; SSE-NEXT:    packuswb %xmm1, %xmm1
4693; SSE-NEXT:    movdqa %xmm14, %xmm2
4694; SSE-NEXT:    pandn %xmm1, %xmm2
4695; SSE-NEXT:    pand %xmm14, %xmm4
4696; SSE-NEXT:    por %xmm4, %xmm2
4697; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4698; SSE-NEXT:    movdqa (%rsp), %xmm1 # 16-byte Reload
4699; SSE-NEXT:    pand %xmm11, %xmm1
4700; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4701; SSE-NEXT:    movdqa %xmm1, %xmm2
4702; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15]
4703; SSE-NEXT:    movdqa %xmm6, %xmm4
4704; SSE-NEXT:    pandn %xmm2, %xmm4
4705; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
4706; SSE-NEXT:    pand %xmm6, %xmm1
4707; SSE-NEXT:    por %xmm4, %xmm1
4708; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
4709; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
4710; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
4711; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
4712; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
4713; SSE-NEXT:    packuswb %xmm1, %xmm1
4714; SSE-NEXT:    movdqa %xmm12, %xmm2
4715; SSE-NEXT:    pandn %xmm1, %xmm2
4716; SSE-NEXT:    movdqa %xmm8, %xmm4
4717; SSE-NEXT:    pandn %xmm13, %xmm4
4718; SSE-NEXT:    movdqa %xmm10, %xmm0
4719; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
4720; SSE-NEXT:    pandn %xmm7, %xmm0
4721; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4722; SSE-NEXT:    movdqa %xmm11, %xmm1
4723; SSE-NEXT:    pandn %xmm7, %xmm1
4724; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4725; SSE-NEXT:    pand %xmm8, %xmm7
4726; SSE-NEXT:    movdqa %xmm8, %xmm10
4727; SSE-NEXT:    por %xmm4, %xmm7
4728; SSE-NEXT:    movdqa %xmm7, %xmm4
4729; SSE-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
4730; SSE-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7]
4731; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[0,1,2,0]
4732; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[2,0],xmm4[3,0]
4733; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm7[0,2]
4734; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7]
4735; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7]
4736; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
4737; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7]
4738; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,6,6,6]
4739; SSE-NEXT:    pshufhw {{.*#+}} xmm7 = xmm8[0,1,2,3,4,5,6,5]
4740; SSE-NEXT:    packuswb %xmm4, %xmm7
4741; SSE-NEXT:    pand %xmm12, %xmm7
4742; SSE-NEXT:    por %xmm2, %xmm7
4743; SSE-NEXT:    pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4744; SSE-NEXT:    # xmm2 = mem[1,1,1,1]
4745; SSE-NEXT:    pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4746; SSE-NEXT:    # xmm4 = mem[0,2,2,3]
4747; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
4748; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm4[0,3,2,3,4,5,6,7]
4749; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0]
4750; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5]
4751; SSE-NEXT:    packuswb %xmm2, %xmm2
4752; SSE-NEXT:    movdqa %xmm14, %xmm1
4753; SSE-NEXT:    pandn %xmm2, %xmm1
4754; SSE-NEXT:    pand %xmm14, %xmm7
4755; SSE-NEXT:    por %xmm7, %xmm1
4756; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4757; SSE-NEXT:    movdqa %xmm11, %xmm8
4758; SSE-NEXT:    movdqa %xmm11, %xmm2
4759; SSE-NEXT:    pandn %xmm5, %xmm2
4760; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4761; SSE-NEXT:    pand %xmm11, %xmm4
4762; SSE-NEXT:    por %xmm2, %xmm4
4763; SSE-NEXT:    movdqa %xmm4, %xmm2
4764; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15]
4765; SSE-NEXT:    movdqa %xmm6, %xmm7
4766; SSE-NEXT:    pandn %xmm2, %xmm7
4767; SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
4768; SSE-NEXT:    pand %xmm6, %xmm4
4769; SSE-NEXT:    por %xmm7, %xmm4
4770; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm4[2,1,2,3,4,5,6,7]
4771; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
4772; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3]
4773; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7]
4774; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,5,6,7]
4775; SSE-NEXT:    packuswb %xmm4, %xmm4
4776; SSE-NEXT:    movdqa %xmm12, %xmm3
4777; SSE-NEXT:    pandn %xmm4, %xmm3
4778; SSE-NEXT:    movdqa %xmm10, %xmm7
4779; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4780; SSE-NEXT:    pandn %xmm5, %xmm7
4781; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4782; SSE-NEXT:    movdqa %xmm0, %xmm14
4783; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
4784; SSE-NEXT:    pand %xmm1, %xmm14
4785; SSE-NEXT:    movdqa %xmm15, %xmm11
4786; SSE-NEXT:    pand %xmm1, %xmm11
4787; SSE-NEXT:    movdqa %xmm13, %xmm4
4788; SSE-NEXT:    pand %xmm1, %xmm4
4789; SSE-NEXT:    movdqa %xmm5, %xmm2
4790; SSE-NEXT:    pand %xmm1, %xmm2
4791; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4792; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4793; SSE-NEXT:    pandn %xmm2, %xmm1
4794; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4795; SSE-NEXT:    pand %xmm8, %xmm0
4796; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4797; SSE-NEXT:    pand %xmm8, %xmm15
4798; SSE-NEXT:    pand %xmm8, %xmm13
4799; SSE-NEXT:    pand %xmm8, %xmm5
4800; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4801; SSE-NEXT:    movdqa %xmm2, %xmm0
4802; SSE-NEXT:    pandn %xmm2, %xmm8
4803; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4804; SSE-NEXT:    pand %xmm10, %xmm0
4805; SSE-NEXT:    por %xmm7, %xmm0
4806; SSE-NEXT:    movdqa %xmm0, %xmm7
4807; SSE-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15]
4808; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
4809; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,2,0]
4810; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm7[3,0]
4811; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,1],xmm0[0,2]
4812; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm7[2,1,2,3,4,5,6,7]
4813; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
4814; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
4815; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
4816; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
4817; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5]
4818; SSE-NEXT:    packuswb %xmm0, %xmm1
4819; SSE-NEXT:    pand %xmm12, %xmm1
4820; SSE-NEXT:    por %xmm3, %xmm1
4821; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4822; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1]
4823; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
4824; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm8[0,2,2,3]
4825; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
4826; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7]
4827; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
4828; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
4829; SSE-NEXT:    packuswb %xmm0, %xmm0
4830; SSE-NEXT:    movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0]
4831; SSE-NEXT:    movdqa %xmm10, %xmm2
4832; SSE-NEXT:    pandn %xmm0, %xmm2
4833; SSE-NEXT:    pand %xmm10, %xmm1
4834; SSE-NEXT:    por %xmm1, %xmm2
4835; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4836; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4837; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
4838; SSE-NEXT:    pand %xmm3, %xmm0
4839; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4840; SSE-NEXT:    movdqa %xmm0, %xmm1
4841; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15]
4842; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
4843; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,0]
4844; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5]
4845; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
4846; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,0,1,2,4,5,6,7]
4847; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,6,7]
4848; SSE-NEXT:    packuswb %xmm0, %xmm0
4849; SSE-NEXT:    movdqa %xmm12, %xmm1
4850; SSE-NEXT:    pandn %xmm0, %xmm1
4851; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
4852; SSE-NEXT:    movdqa %xmm14, %xmm0
4853; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
4854; SSE-NEXT:    punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm9[8],xmm14[9],xmm9[9],xmm14[10],xmm9[10],xmm14[11],xmm9[11],xmm14[12],xmm9[12],xmm14[13],xmm9[13],xmm14[14],xmm9[14],xmm14[15],xmm9[15]
4855; SSE-NEXT:    shufps {{.*#+}} xmm14 = xmm14[3,1],xmm0[2,0]
4856; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm14[0,1,2,3,6,5,6,7]
4857; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3]
4858; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7]
4859; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
4860; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
4861; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6]
4862; SSE-NEXT:    packuswb %xmm2, %xmm0
4863; SSE-NEXT:    pand %xmm12, %xmm0
4864; SSE-NEXT:    por %xmm1, %xmm0
4865; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4866; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4867; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0]
4868; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2]
4869; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,6,6,7]
4870; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
4871; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,5]
4872; SSE-NEXT:    packuswb %xmm1, %xmm1
4873; SSE-NEXT:    movdqa %xmm10, %xmm9
4874; SSE-NEXT:    movdqa %xmm10, %xmm14
4875; SSE-NEXT:    pandn %xmm1, %xmm14
4876; SSE-NEXT:    pand %xmm10, %xmm0
4877; SSE-NEXT:    por %xmm0, %xmm14
4878; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4879; SSE-NEXT:    pand %xmm3, %xmm0
4880; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4881; SSE-NEXT:    movdqa %xmm0, %xmm1
4882; SSE-NEXT:    pxor %xmm2, %xmm2
4883; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
4884; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
4885; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,0]
4886; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5]
4887; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
4888; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,0,1,2,4,5,6,7]
4889; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,6,7]
4890; SSE-NEXT:    packuswb %xmm0, %xmm0
4891; SSE-NEXT:    movdqa %xmm12, %xmm1
4892; SSE-NEXT:    pandn %xmm0, %xmm1
4893; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
4894; SSE-NEXT:    movdqa %xmm11, %xmm0
4895; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
4896; SSE-NEXT:    punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm2[8],xmm11[9],xmm2[9],xmm11[10],xmm2[10],xmm11[11],xmm2[11],xmm11[12],xmm2[12],xmm11[13],xmm2[13],xmm11[14],xmm2[14],xmm11[15],xmm2[15]
4897; SSE-NEXT:    pxor %xmm10, %xmm10
4898; SSE-NEXT:    shufps {{.*#+}} xmm11 = xmm11[3,1],xmm0[2,0]
4899; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,6,5,6,7]
4900; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3]
4901; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7]
4902; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
4903; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
4904; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6]
4905; SSE-NEXT:    packuswb %xmm2, %xmm0
4906; SSE-NEXT:    pand %xmm12, %xmm0
4907; SSE-NEXT:    por %xmm1, %xmm0
4908; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4909; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4910; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0]
4911; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2]
4912; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,6,6,7]
4913; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
4914; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,5]
4915; SSE-NEXT:    packuswb %xmm1, %xmm1
4916; SSE-NEXT:    movdqa %xmm9, %xmm11
4917; SSE-NEXT:    pandn %xmm1, %xmm11
4918; SSE-NEXT:    pand %xmm9, %xmm0
4919; SSE-NEXT:    por %xmm0, %xmm11
4920; SSE-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
4921; SSE-NEXT:    pand %xmm3, %xmm0
4922; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4923; SSE-NEXT:    movdqa %xmm0, %xmm1
4924; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15]
4925; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
4926; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,0]
4927; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5]
4928; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
4929; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,0,1,2,4,5,6,7]
4930; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,6,7]
4931; SSE-NEXT:    packuswb %xmm0, %xmm0
4932; SSE-NEXT:    movdqa %xmm12, %xmm1
4933; SSE-NEXT:    pandn %xmm0, %xmm1
4934; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4935; SSE-NEXT:    movdqa %xmm4, %xmm0
4936; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
4937; SSE-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15]
4938; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,1],xmm0[2,0]
4939; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,6,5,6,7]
4940; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3]
4941; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7]
4942; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
4943; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
4944; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6]
4945; SSE-NEXT:    packuswb %xmm2, %xmm0
4946; SSE-NEXT:    pand %xmm12, %xmm0
4947; SSE-NEXT:    por %xmm1, %xmm0
4948; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4949; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4950; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0]
4951; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2]
4952; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,6,6,7]
4953; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
4954; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,5]
4955; SSE-NEXT:    packuswb %xmm1, %xmm2
4956; SSE-NEXT:    movdqa %xmm9, %xmm10
4957; SSE-NEXT:    pandn %xmm2, %xmm10
4958; SSE-NEXT:    pand %xmm9, %xmm0
4959; SSE-NEXT:    por %xmm0, %xmm10
4960; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4961; SSE-NEXT:    movdqa %xmm3, %xmm2
4962; SSE-NEXT:    pand %xmm3, %xmm0
4963; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4964; SSE-NEXT:    por %xmm0, %xmm2
4965; SSE-NEXT:    movdqa %xmm2, %xmm0
4966; SSE-NEXT:    pxor %xmm1, %xmm1
4967; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
4968; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
4969; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[2,0]
4970; SSE-NEXT:    movaps %xmm2, %xmm4
4971; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4972; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4973; SSE-NEXT:    movdqa %xmm2, %xmm0
4974; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
4975; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
4976; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[2,0]
4977; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
4978; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3]
4979; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7]
4980; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
4981; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
4982; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6]
4983; SSE-NEXT:    packuswb %xmm2, %xmm0
4984; SSE-NEXT:    pand %xmm12, %xmm0
4985; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,5,6,5]
4986; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0]
4987; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7]
4988; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,4,6,7]
4989; SSE-NEXT:    packuswb %xmm2, %xmm2
4990; SSE-NEXT:    pandn %xmm2, %xmm12
4991; SSE-NEXT:    por %xmm12, %xmm0
4992; SSE-NEXT:    movdqa %xmm8, %xmm3
4993; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,0],xmm8[3,0]
4994; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm5[0,2]
4995; SSE-NEXT:    pand %xmm9, %xmm0
4996; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,6,6,7]
4997; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0]
4998; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,5]
4999; SSE-NEXT:    packuswb %xmm2, %xmm2
5000; SSE-NEXT:    pandn %xmm2, %xmm9
5001; SSE-NEXT:    por %xmm0, %xmm9
5002; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5003; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
5004; SSE-NEXT:    movdqa %xmm3, %xmm0
5005; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
5006; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
5007; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,1,1,3]
5008; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,3],xmm0[1,2]
5009; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2,3,1]
5010; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm3[3,1,2,3,4,5,6,7]
5011; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
5012; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
5013; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,0,4,5,6,7]
5014; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
5015; SSE-NEXT:    packuswb %xmm0, %xmm2
5016; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,0,0,65535,65535]
5017; SSE-NEXT:    movdqa %xmm4, %xmm3
5018; SSE-NEXT:    pandn %xmm2, %xmm3
5019; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
5020; SSE-NEXT:    movdqa {{.*#+}} xmm12 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
5021; SSE-NEXT:    pand %xmm12, %xmm8
5022; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
5023; SSE-NEXT:    movdqa %xmm8, %xmm2
5024; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
5025; SSE-NEXT:    movdqa %xmm6, %xmm7
5026; SSE-NEXT:    pandn %xmm2, %xmm7
5027; SSE-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm1[8],xmm8[9],xmm1[9],xmm8[10],xmm1[10],xmm8[11],xmm1[11],xmm8[12],xmm1[12],xmm8[13],xmm1[13],xmm8[14],xmm1[14],xmm8[15],xmm1[15]
5028; SSE-NEXT:    pand %xmm6, %xmm8
5029; SSE-NEXT:    por %xmm7, %xmm8
5030; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm8[3,1,0,3,4,5,6,7]
5031; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,4]
5032; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
5033; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,0,4,5,6,7]
5034; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
5035; SSE-NEXT:    packuswb %xmm2, %xmm2
5036; SSE-NEXT:    pand %xmm4, %xmm2
5037; SSE-NEXT:    por %xmm3, %xmm2
5038; SSE-NEXT:    pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
5039; SSE-NEXT:    # xmm3 = mem[3,1,2,3]
5040; SSE-NEXT:    pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
5041; SSE-NEXT:    # xmm7 = mem[0,2,2,3]
5042; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7]
5043; SSE-NEXT:    pshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7]
5044; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
5045; SSE-NEXT:    packuswb %xmm0, %xmm7
5046; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,1]
5047; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
5048; SSE-NEXT:    movdqa %xmm15, %xmm0
5049; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
5050; SSE-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3],xmm15[4],xmm1[4],xmm15[5],xmm1[5],xmm15[6],xmm1[6],xmm15[7],xmm1[7]
5051; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm15[0,1,1,3]
5052; SSE-NEXT:    shufps {{.*#+}} xmm15 = xmm15[0,3],xmm0[1,2]
5053; SSE-NEXT:    shufps {{.*#+}} xmm15 = xmm15[0,2,3,1]
5054; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm15[3,1,2,3,4,5,6,7]
5055; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
5056; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
5057; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,0,4,5,6,7]
5058; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
5059; SSE-NEXT:    packuswb %xmm0, %xmm3
5060; SSE-NEXT:    movdqa %xmm4, %xmm7
5061; SSE-NEXT:    pandn %xmm3, %xmm7
5062; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
5063; SSE-NEXT:    pand %xmm12, %xmm15
5064; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
5065; SSE-NEXT:    movdqa %xmm15, %xmm3
5066; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
5067; SSE-NEXT:    movdqa %xmm6, %xmm8
5068; SSE-NEXT:    pandn %xmm3, %xmm8
5069; SSE-NEXT:    punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm1[8],xmm15[9],xmm1[9],xmm15[10],xmm1[10],xmm15[11],xmm1[11],xmm15[12],xmm1[12],xmm15[13],xmm1[13],xmm15[14],xmm1[14],xmm15[15],xmm1[15]
5070; SSE-NEXT:    pand %xmm6, %xmm15
5071; SSE-NEXT:    por %xmm8, %xmm15
5072; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm15[3,1,0,3,4,5,6,7]
5073; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,4]
5074; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,3,2,1]
5075; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,0,4,5,6,7]
5076; SSE-NEXT:    pshufhw {{.*#+}} xmm8 = xmm3[0,1,2,3,6,5,6,7]
5077; SSE-NEXT:    packuswb %xmm8, %xmm8
5078; SSE-NEXT:    pand %xmm4, %xmm8
5079; SSE-NEXT:    por %xmm7, %xmm8
5080; SSE-NEXT:    pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
5081; SSE-NEXT:    # xmm3 = mem[3,1,2,3]
5082; SSE-NEXT:    pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
5083; SSE-NEXT:    # xmm7 = mem[0,2,2,3]
5084; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7]
5085; SSE-NEXT:    pshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7]
5086; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
5087; SSE-NEXT:    packuswb %xmm0, %xmm7
5088; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[0,1],xmm7[2,1]
5089; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
5090; SSE-NEXT:    movdqa %xmm13, %xmm0
5091; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
5092; SSE-NEXT:    punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3],xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7]
5093; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm13[0,1,1,3]
5094; SSE-NEXT:    shufps {{.*#+}} xmm13 = xmm13[0,3],xmm0[1,2]
5095; SSE-NEXT:    shufps {{.*#+}} xmm13 = xmm13[0,2,3,1]
5096; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm13[3,1,2,3,4,5,6,7]
5097; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
5098; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
5099; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,0,4,5,6,7]
5100; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
5101; SSE-NEXT:    packuswb %xmm0, %xmm3
5102; SSE-NEXT:    movdqa %xmm4, %xmm7
5103; SSE-NEXT:    pandn %xmm3, %xmm7
5104; SSE-NEXT:    movdqa (%rsp), %xmm13 # 16-byte Reload
5105; SSE-NEXT:    pand %xmm12, %xmm13
5106; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
5107; SSE-NEXT:    movdqa %xmm13, %xmm3
5108; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
5109; SSE-NEXT:    movdqa %xmm6, %xmm5
5110; SSE-NEXT:    pandn %xmm3, %xmm5
5111; SSE-NEXT:    punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm1[8],xmm13[9],xmm1[9],xmm13[10],xmm1[10],xmm13[11],xmm1[11],xmm13[12],xmm1[12],xmm13[13],xmm1[13],xmm13[14],xmm1[14],xmm13[15],xmm1[15]
5112; SSE-NEXT:    pand %xmm6, %xmm13
5113; SSE-NEXT:    por %xmm5, %xmm13
5114; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm13[3,1,0,3,4,5,6,7]
5115; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,4]
5116; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,3,2,1]
5117; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,0,4,5,6,7]
5118; SSE-NEXT:    pshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,6,5,6,7]
5119; SSE-NEXT:    packuswb %xmm5, %xmm5
5120; SSE-NEXT:    pand %xmm4, %xmm5
5121; SSE-NEXT:    por %xmm7, %xmm5
5122; SSE-NEXT:    pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
5123; SSE-NEXT:    # xmm3 = mem[3,1,2,3]
5124; SSE-NEXT:    pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
5125; SSE-NEXT:    # xmm7 = mem[0,2,2,3]
5126; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7]
5127; SSE-NEXT:    pshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7]
5128; SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
5129; SSE-NEXT:    packuswb %xmm0, %xmm7
5130; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,1]
5131; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
5132; SSE-NEXT:    por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
5133; SSE-NEXT:    movdqa %xmm7, %xmm0
5134; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
5135; SSE-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3],xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7]
5136; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm7[0,1,1,3]
5137; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,3],xmm0[1,2]
5138; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5139; SSE-NEXT:    pand %xmm12, %xmm0
5140; SSE-NEXT:    pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
5141; SSE-NEXT:    por %xmm0, %xmm12
5142; SSE-NEXT:    movdqa %xmm12, %xmm0
5143; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
5144; SSE-NEXT:    punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm1[8],xmm12[9],xmm1[9],xmm12[10],xmm1[10],xmm12[11],xmm1[11],xmm12[12],xmm1[12],xmm12[13],xmm1[13],xmm12[14],xmm1[14],xmm12[15],xmm1[15]
5145; SSE-NEXT:    pand %xmm6, %xmm12
5146; SSE-NEXT:    pandn %xmm0, %xmm6
5147; SSE-NEXT:    por %xmm12, %xmm6
5148; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm6[3,1,0,3,4,5,6,7]
5149; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
5150; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
5151; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
5152; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
5153; SSE-NEXT:    packuswb %xmm0, %xmm0
5154; SSE-NEXT:    pand %xmm4, %xmm0
5155; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,2,3,1]
5156; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm7[3,1,2,3,4,5,6,7]
5157; SSE-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7]
5158; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
5159; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[2,1,3,0,4,5,6,7]
5160; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
5161; SSE-NEXT:    packuswb %xmm6, %xmm3
5162; SSE-NEXT:    pandn %xmm3, %xmm4
5163; SSE-NEXT:    por %xmm4, %xmm0
5164; SSE-NEXT:    pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
5165; SSE-NEXT:    # xmm3 = mem[3,1,2,3]
5166; SSE-NEXT:    pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
5167; SSE-NEXT:    # xmm4 = mem[0,2,2,3]
5168; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7]
5169; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7]
5170; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
5171; SSE-NEXT:    packuswb %xmm6, %xmm4
5172; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,1]
5173; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5174; SSE-NEXT:    movaps %xmm3, 16(%rsi)
5175; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5176; SSE-NEXT:    movaps %xmm3, 48(%rsi)
5177; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5178; SSE-NEXT:    movaps %xmm3, (%rsi)
5179; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5180; SSE-NEXT:    movaps %xmm3, 32(%rsi)
5181; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5182; SSE-NEXT:    movaps %xmm3, 16(%rdx)
5183; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5184; SSE-NEXT:    movaps %xmm3, 48(%rdx)
5185; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5186; SSE-NEXT:    movaps %xmm3, (%rdx)
5187; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5188; SSE-NEXT:    movaps %xmm3, 32(%rdx)
5189; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5190; SSE-NEXT:    movaps %xmm1, 16(%rcx)
5191; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5192; SSE-NEXT:    movaps %xmm1, 48(%rcx)
5193; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5194; SSE-NEXT:    movaps %xmm1, (%rcx)
5195; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5196; SSE-NEXT:    movaps %xmm1, 32(%rcx)
5197; SSE-NEXT:    movdqa %xmm9, 16(%r8)
5198; SSE-NEXT:    movdqa %xmm10, 48(%r8)
5199; SSE-NEXT:    movdqa %xmm11, (%r8)
5200; SSE-NEXT:    movdqa %xmm14, 32(%r8)
5201; SSE-NEXT:    movaps %xmm0, 16(%r9)
5202; SSE-NEXT:    movaps %xmm5, 48(%r9)
5203; SSE-NEXT:    movaps %xmm8, (%r9)
5204; SSE-NEXT:    movaps %xmm2, 32(%r9)
5205; SSE-NEXT:    addq $552, %rsp # imm = 0x228
5206; SSE-NEXT:    retq
5207;
5208; AVX-LABEL: load_i8_stride5_vf64:
5209; AVX:       # %bb.0:
5210; AVX-NEXT:    subq $488, %rsp # imm = 0x1E8
5211; AVX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [4,9,14,0,4,9,14,0,4,9,14,0,4,9,14,0]
5212; AVX-NEXT:    vmovdqa (%rdi), %xmm8
5213; AVX-NEXT:    vmovdqa 16(%rdi), %xmm11
5214; AVX-NEXT:    vmovdqa 32(%rdi), %xmm12
5215; AVX-NEXT:    vmovdqa 48(%rdi), %xmm9
5216; AVX-NEXT:    vpshufb %xmm2, %xmm11, %xmm0
5217; AVX-NEXT:    vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5218; AVX-NEXT:    vbroadcastss {{.*#+}} xmm3 = [0,5,10,15,0,5,10,15,0,5,10,15,0,5,10,15]
5219; AVX-NEXT:    vpshufb %xmm3, %xmm8, %xmm1
5220; AVX-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5221; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
5222; AVX-NEXT:    vmovddup {{.*#+}} xmm4 = [128,128,2,7,12,0,0,128,128,128,2,7,12,0,0,128]
5223; AVX-NEXT:    # xmm4 = mem[0,0]
5224; AVX-NEXT:    vpshufb %xmm4, %xmm9, %xmm0
5225; AVX-NEXT:    vmovdqa %xmm9, (%rsp) # 16-byte Spill
5226; AVX-NEXT:    vmovddup {{.*#+}} xmm5 = [8,13,128,128,128,0,0,3,8,13,128,128,128,0,0,3]
5227; AVX-NEXT:    # xmm5 = mem[0,0]
5228; AVX-NEXT:    vpshufb %xmm5, %xmm12, %xmm6
5229; AVX-NEXT:    vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5230; AVX-NEXT:    vpor %xmm0, %xmm6, %xmm6
5231; AVX-NEXT:    vmovq {{.*#+}} xmm0 = [255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
5232; AVX-NEXT:    vpblendvb %xmm0, %xmm1, %xmm6, %xmm1
5233; AVX-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5234; AVX-NEXT:    vmovdqa 176(%rdi), %xmm14
5235; AVX-NEXT:    vpshufb %xmm2, %xmm14, %xmm2
5236; AVX-NEXT:    vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5237; AVX-NEXT:    vmovdqa 160(%rdi), %xmm13
5238; AVX-NEXT:    vpshufb %xmm3, %xmm13, %xmm3
5239; AVX-NEXT:    vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5240; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
5241; AVX-NEXT:    vmovdqa 208(%rdi), %xmm10
5242; AVX-NEXT:    vpshufb %xmm4, %xmm10, %xmm3
5243; AVX-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5244; AVX-NEXT:    vmovdqa 192(%rdi), %xmm1
5245; AVX-NEXT:    vpshufb %xmm5, %xmm1, %xmm4
5246; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5247; AVX-NEXT:    vpor %xmm3, %xmm4, %xmm3
5248; AVX-NEXT:    vpblendvb %xmm0, %xmm2, %xmm3, %xmm2
5249; AVX-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5250; AVX-NEXT:    vmovq {{.*#+}} xmm3 = [1,6,11,128,128,128,128,0,0,0,0,0,0,0,0,0]
5251; AVX-NEXT:    vpshufb %xmm3, %xmm8, %xmm4
5252; AVX-NEXT:    vmovq {{.*#+}} xmm5 = [128,128,128,0,5,10,15,0,0,0,0,0,0,0,0,0]
5253; AVX-NEXT:    vpshufb %xmm5, %xmm11, %xmm6
5254; AVX-NEXT:    vpor %xmm4, %xmm6, %xmm4
5255; AVX-NEXT:    vmovddup {{.*#+}} xmm7 = [128,128,3,8,13,0,0,128,128,128,3,8,13,0,0,128]
5256; AVX-NEXT:    # xmm7 = mem[0,0]
5257; AVX-NEXT:    vpshufb %xmm7, %xmm9, %xmm6
5258; AVX-NEXT:    vmovddup {{.*#+}} xmm8 = [9,14,128,128,128,0,0,4,9,14,128,128,128,0,0,4]
5259; AVX-NEXT:    # xmm8 = mem[0,0]
5260; AVX-NEXT:    vpshufb %xmm8, %xmm12, %xmm9
5261; AVX-NEXT:    vpor %xmm6, %xmm9, %xmm6
5262; AVX-NEXT:    vpblendvb %xmm0, %xmm4, %xmm6, %xmm2
5263; AVX-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5264; AVX-NEXT:    vpshufb %xmm3, %xmm13, %xmm3
5265; AVX-NEXT:    vpshufb %xmm5, %xmm14, %xmm4
5266; AVX-NEXT:    vpor %xmm3, %xmm4, %xmm3
5267; AVX-NEXT:    vmovddup {{.*#+}} xmm11 = [0,0,128,128,128,1,6,11,0,0,128,128,128,1,6,11]
5268; AVX-NEXT:    # xmm11 = mem[0,0]
5269; AVX-NEXT:    vpshufb %xmm7, %xmm10, %xmm4
5270; AVX-NEXT:    vpshufb %xmm8, %xmm1, %xmm5
5271; AVX-NEXT:    vpor %xmm4, %xmm5, %xmm4
5272; AVX-NEXT:    vmovdqa 144(%rdi), %xmm8
5273; AVX-NEXT:    vpblendvb %xmm0, %xmm3, %xmm4, %xmm0
5274; AVX-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5275; AVX-NEXT:    vpshufb %xmm11, %xmm8, %xmm0
5276; AVX-NEXT:    vmovddup {{.*#+}} xmm7 = [0,0,2,7,12,128,128,128,0,0,2,7,12,128,128,128]
5277; AVX-NEXT:    # xmm7 = mem[0,0]
5278; AVX-NEXT:    vmovdqa 128(%rdi), %xmm13
5279; AVX-NEXT:    vpshufb %xmm7, %xmm13, %xmm3
5280; AVX-NEXT:    vpor %xmm0, %xmm3, %xmm0
5281; AVX-NEXT:    vmovddup {{.*#+}} xmm5 = [8,13,0,0,128,128,128,3,8,13,0,0,128,128,128,3]
5282; AVX-NEXT:    # xmm5 = mem[0,0]
5283; AVX-NEXT:    vmovdqa 112(%rdi), %xmm1
5284; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5285; AVX-NEXT:    vpshufb %xmm5, %xmm1, %xmm3
5286; AVX-NEXT:    vmovddup {{.*#+}} xmm6 = [128,128,0,0,4,9,14,128,128,128,0,0,4,9,14,128]
5287; AVX-NEXT:    # xmm6 = mem[0,0]
5288; AVX-NEXT:    vmovdqa 96(%rdi), %xmm1
5289; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5290; AVX-NEXT:    vpshufb %xmm6, %xmm1, %xmm12
5291; AVX-NEXT:    vpor %xmm3, %xmm12, %xmm3
5292; AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
5293; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm4[0,1],xmm3[2,3,4,5,6,7]
5294; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,5,10,15,128,128,128,128,128,128,u,u,u,u,u,u]
5295; AVX-NEXT:    vmovdqa 80(%rdi), %xmm14
5296; AVX-NEXT:    vpshufb %xmm3, %xmm14, %xmm15
5297; AVX-NEXT:    vpor %xmm15, %xmm12, %xmm12
5298; AVX-NEXT:    vpblendw {{.*#+}} xmm15 = xmm12[0,1,2,3,4],xmm0[5,6,7]
5299; AVX-NEXT:    vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255]
5300; AVX-NEXT:    vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload
5301; AVX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [0,1,6,11,0,1,6,11,0,1,6,11,0,1,6,11]
5302; AVX-NEXT:    vmovdqa 64(%rdi), %xmm1
5303; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5304; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm9
5305; AVX-NEXT:    vandnps %ymm9, %ymm12, %ymm9
5306; AVX-NEXT:    vorps %ymm0, %ymm9, %ymm0
5307; AVX-NEXT:    vinsertf128 $1, %xmm15, %ymm0, %ymm0
5308; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5309; AVX-NEXT:    vmovdqa 304(%rdi), %xmm0
5310; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5311; AVX-NEXT:    vpshufb %xmm11, %xmm0, %xmm0
5312; AVX-NEXT:    vmovdqa 288(%rdi), %xmm1
5313; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5314; AVX-NEXT:    vpshufb %xmm7, %xmm1, %xmm9
5315; AVX-NEXT:    vpor %xmm0, %xmm9, %xmm0
5316; AVX-NEXT:    vmovdqa 272(%rdi), %xmm10
5317; AVX-NEXT:    vpshufb %xmm5, %xmm10, %xmm5
5318; AVX-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5319; AVX-NEXT:    vmovdqa 256(%rdi), %xmm9
5320; AVX-NEXT:    vpshufb %xmm6, %xmm9, %xmm6
5321; AVX-NEXT:    vpor %xmm5, %xmm6, %xmm5
5322; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3,4,5,6,7]
5323; AVX-NEXT:    vmovdqa 240(%rdi), %xmm1
5324; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5325; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm3
5326; AVX-NEXT:    vpor %xmm3, %xmm4, %xmm3
5327; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7]
5328; AVX-NEXT:    vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload
5329; AVX-NEXT:    vmovdqa 224(%rdi), %xmm5
5330; AVX-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
5331; AVX-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5332; AVX-NEXT:    vandnps %ymm2, %ymm12, %ymm2
5333; AVX-NEXT:    vorps %ymm2, %ymm3, %ymm2
5334; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
5335; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5336; AVX-NEXT:    vmovddup {{.*#+}} xmm2 = [0,0,128,128,128,2,7,12,0,0,128,128,128,2,7,12]
5337; AVX-NEXT:    # xmm2 = mem[0,0]
5338; AVX-NEXT:    vpshufb %xmm2, %xmm8, %xmm0
5339; AVX-NEXT:    vmovdqa %xmm8, %xmm11
5340; AVX-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5341; AVX-NEXT:    vmovddup {{.*#+}} xmm3 = [0,0,3,8,13,128,128,128,0,0,3,8,13,128,128,128]
5342; AVX-NEXT:    # xmm3 = mem[0,0]
5343; AVX-NEXT:    vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5344; AVX-NEXT:    vpshufb %xmm3, %xmm13, %xmm4
5345; AVX-NEXT:    vpor %xmm0, %xmm4, %xmm4
5346; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
5347; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm7[u,u,u],zero,zero,zero,zero,xmm7[4,9,14,u,u,u,u,u,u]
5348; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5349; AVX-NEXT:    vpshufb {{.*#+}} xmm8 = xmm1[u,u,u,0,5,10,15],zero,zero,zero,xmm1[u,u,u,u,u,u]
5350; AVX-NEXT:    vpor %xmm0, %xmm8, %xmm8
5351; AVX-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[3,4,5,6,7,8,9,u,u,u,u,u,u]
5352; AVX-NEXT:    vmovdqa %xmm14, %xmm6
5353; AVX-NEXT:    vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5354; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[1,6,11],zero,zero,zero,zero,zero,zero,zero,xmm14[u,u,u,u,u,u]
5355; AVX-NEXT:    vpor %xmm14, %xmm8, %xmm8
5356; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3,4],xmm4[5,6,7]
5357; AVX-NEXT:    vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload
5358; AVX-NEXT:    vbroadcastss {{.*#+}} xmm14 = [0,2,7,12,0,2,7,12,0,2,7,12,0,2,7,12]
5359; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5360; AVX-NEXT:    vpshufb %xmm14, %xmm0, %xmm15
5361; AVX-NEXT:    vandnps %ymm15, %ymm12, %ymm15
5362; AVX-NEXT:    vorps %ymm15, %ymm8, %ymm8
5363; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm8, %ymm4
5364; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5365; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5366; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
5367; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5368; AVX-NEXT:    vpshufb %xmm3, %xmm0, %xmm3
5369; AVX-NEXT:    vpor %xmm2, %xmm3, %xmm2
5370; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm10[u,u,u],zero,zero,zero,zero,xmm10[4,9,14,u,u,u,u,u,u]
5371; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm9[u,u,u,0,5,10,15],zero,zero,zero,xmm9[u,u,u,u,u,u]
5372; AVX-NEXT:    vmovdqa %xmm9, %xmm8
5373; AVX-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5374; AVX-NEXT:    vpor %xmm3, %xmm4, %xmm3
5375; AVX-NEXT:    vmovdqa {{.*#+}} xmm9 = [128,128,128,3,4,5,6,7,8,9,u,u,u,u,u,u]
5376; AVX-NEXT:    vpshufb %xmm9, %xmm3, %xmm3
5377; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
5378; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm10[1,6,11],zero,zero,zero,zero,zero,zero,zero,xmm10[u,u,u,u,u,u]
5379; AVX-NEXT:    vpor %xmm0, %xmm3, %xmm0
5380; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7]
5381; AVX-NEXT:    vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload
5382; AVX-NEXT:    vpshufb %xmm14, %xmm5, %xmm3
5383; AVX-NEXT:    vandnps %ymm3, %ymm12, %ymm3
5384; AVX-NEXT:    vorps %ymm3, %ymm2, %ymm2
5385; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
5386; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5387; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[3,8,13]
5388; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
5389; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
5390; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,1,6,11],zero,zero,zero,zero,xmm1[u,u,u,u,u,u]
5391; AVX-NEXT:    vmovddup {{.*#+}} xmm4 = [10,15,0,128,128,128,0,5,10,15,0,128,128,128,0,5]
5392; AVX-NEXT:    # xmm4 = mem[0,0]
5393; AVX-NEXT:    vpshufb %xmm4, %xmm7, %xmm5
5394; AVX-NEXT:    vpor %xmm3, %xmm5, %xmm3
5395; AVX-NEXT:    vpshufb %xmm9, %xmm3, %xmm3
5396; AVX-NEXT:    vmovdqa {{.*#+}} xmm11 = [2,7,12,128,128,128,128,128,128,128,u,u,u,u,u,u]
5397; AVX-NEXT:    vpshufb %xmm11, %xmm6, %xmm14
5398; AVX-NEXT:    vpor %xmm3, %xmm14, %xmm3
5399; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm0[5,6,7]
5400; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
5401; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u]
5402; AVX-NEXT:    vmovddup {{.*#+}} xmm3 = [128,128,4,9,14,0,128,128,128,128,4,9,14,0,128,128]
5403; AVX-NEXT:    # xmm3 = mem[0,0]
5404; AVX-NEXT:    vmovdqa (%rsp), %xmm7 # 16-byte Reload
5405; AVX-NEXT:    vpshufb %xmm3, %xmm7, %xmm12
5406; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3,4,5,6,7]
5407; AVX-NEXT:    vmovq {{.*#+}} xmm14 = [2,7,12,128,128,128,0,0,0,0,0,0,0,0,0,0]
5408; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
5409; AVX-NEXT:    vpshufb %xmm14, %xmm15, %xmm0
5410; AVX-NEXT:    vmovddup {{.*#+}} xmm9 = [10,15,128,128,128,0,0,5,10,15,128,128,128,0,0,5]
5411; AVX-NEXT:    # xmm9 = mem[0,0]
5412; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
5413; AVX-NEXT:    vpshufb %xmm9, %xmm5, %xmm13
5414; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm13[3,4,5,6,7]
5415; AVX-NEXT:    vpor %xmm0, %xmm12, %xmm0
5416; AVX-NEXT:    vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255]
5417; AVX-NEXT:    vandps %ymm0, %ymm12, %ymm0
5418; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
5419; AVX-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13]
5420; AVX-NEXT:    vandnps %ymm13, %ymm12, %ymm13
5421; AVX-NEXT:    vorps %ymm0, %ymm13, %ymm0
5422; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
5423; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5424; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5425; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[3,8,13]
5426; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5427; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
5428; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
5429; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm8[u,u,u,1,6,11],zero,zero,zero,zero,xmm8[u,u,u,u,u,u]
5430; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
5431; AVX-NEXT:    vpshufb %xmm4, %xmm13, %xmm4
5432; AVX-NEXT:    vpor %xmm1, %xmm4, %xmm1
5433; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,4,5,6,7,8,9,u,u,u,u,u,u]
5434; AVX-NEXT:    vpshufb %xmm11, %xmm10, %xmm4
5435; AVX-NEXT:    vpor %xmm4, %xmm1, %xmm1
5436; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7]
5437; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
5438; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm11[1,6,11,u,u,u,u,u,u,u,u,u,u]
5439; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
5440; AVX-NEXT:    vpshufb %xmm3, %xmm8, %xmm3
5441; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3,4,5,6,7]
5442; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
5443; AVX-NEXT:    vpshufb %xmm14, %xmm10, %xmm3
5444; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
5445; AVX-NEXT:    vpshufb %xmm9, %xmm14, %xmm2
5446; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5,6,7]
5447; AVX-NEXT:    vpor %xmm1, %xmm2, %xmm1
5448; AVX-NEXT:    vandps %ymm1, %ymm12, %ymm1
5449; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
5450; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm9[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13]
5451; AVX-NEXT:    vandnps %ymm2, %ymm12, %ymm2
5452; AVX-NEXT:    vorps %ymm2, %ymm1, %ymm1
5453; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
5454; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5455; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u]
5456; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,1,6,11],zero,zero,zero,zero,xmm5[u,u,u]
5457; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7]
5458; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm15[3,8,13],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u]
5459; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,u,u,u],zero,zero,zero,xmm7[0,5,10,15,u,u,u]
5460; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5,6,7]
5461; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm3
5462; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5463; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,zero,xmm0[1,6,11,u,u,u,u,u,u,u]
5464; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5465; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,2,7,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u]
5466; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
5467; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[3,4,5,6,7,8,u,u,u,u,u,u,u]
5468; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [3,8,13,128,128,128,128,128,128,u,u,u,u,u,u,u]
5469; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
5470; AVX-NEXT:    vpshufb %xmm2, %xmm15, %xmm1
5471; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
5472; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [0,4,9,14,0,4,9,14,0,4,9,14,0,4,9,14]
5473; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5474; AVX-NEXT:    vpshufb %xmm1, %xmm4, %xmm4
5475; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm4, %ymm4
5476; AVX-NEXT:    vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255]
5477; AVX-NEXT:    vandps %ymm3, %ymm12, %ymm3
5478; AVX-NEXT:    vandnps %ymm4, %ymm12, %ymm4
5479; AVX-NEXT:    vorps %ymm4, %ymm3, %ymm3
5480; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
5481; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm6[4,9,14]
5482; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5483; AVX-NEXT:    vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
5484; AVX-NEXT:    vpor %xmm5, %xmm7, %xmm5
5485; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm7
5486; AVX-NEXT:    vpmovsxwq {{.*#+}} xmm0 = [18446744073709551615,255]
5487; AVX-NEXT:    vpblendvb %xmm0, %xmm7, %xmm5, %xmm5
5488; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm3, %ymm3
5489; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5490; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm11[2,7,12,u,u,u,u,u,u,u,u,u,u]
5491; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm14[u,u,u,u,u,u,1,6,11],zero,zero,zero,zero,xmm14[u,u,u]
5492; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3,4,5,6,7]
5493; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm10[3,8,13],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
5494; AVX-NEXT:    vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u],zero,zero,zero,xmm8[0,5,10,15,u,u,u]
5495; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4,5,6,7]
5496; AVX-NEXT:    vpor %xmm3, %xmm5, %xmm3
5497; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm13[u,u,u],zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u]
5498; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
5499; AVX-NEXT:    vpshufb {{.*#+}} xmm7 = xmm11[u,u,u,2,7,12],zero,zero,zero,xmm11[u,u,u,u,u,u,u]
5500; AVX-NEXT:    vpor %xmm5, %xmm7, %xmm5
5501; AVX-NEXT:    vmovdqa {{.*#+}} xmm7 = [128,128,128,3,4,5,6,7,8,u,u,u,u,u,u,u]
5502; AVX-NEXT:    vpshufb %xmm7, %xmm5, %xmm5
5503; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
5504; AVX-NEXT:    vpshufb %xmm2, %xmm14, %xmm2
5505; AVX-NEXT:    vpor %xmm2, %xmm5, %xmm2
5506; AVX-NEXT:    vpshufb %xmm1, %xmm9, %xmm1
5507; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
5508; AVX-NEXT:    vandps %ymm3, %ymm12, %ymm2
5509; AVX-NEXT:    vandnps %ymm1, %ymm12, %ymm0
5510; AVX-NEXT:    vorps %ymm0, %ymm2, %ymm0
5511; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
5512; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm9[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm9[4,9,14]
5513; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
5514; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
5515; AVX-NEXT:    vpor %xmm1, %xmm2, %xmm1
5516; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
5517; AVX-NEXT:    vpmovsxwq {{.*#+}} xmm8 = [18446744073709551615,255]
5518; AVX-NEXT:    vpblendvb %xmm8, %xmm2, %xmm1, %xmm1
5519; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
5520; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5521; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = [0,1,6,11,128,128,128,128,0,1,6,11,128,128,128,128]
5522; AVX-NEXT:    # xmm0 = mem[0,0]
5523; AVX-NEXT:    vpshufb %xmm0, %xmm4, %xmm2
5524; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = [0,128,128,128,0,5,10,15,0,128,128,128,0,5,10,15]
5525; AVX-NEXT:    # xmm1 = mem[0,0]
5526; AVX-NEXT:    vpshufb %xmm1, %xmm6, %xmm3
5527; AVX-NEXT:    vpor %xmm2, %xmm3, %xmm2
5528; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5529; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12,u,u,u,u,u,u,u]
5530; AVX-NEXT:    vmovddup {{.*#+}} xmm3 = [128,0,0,3,8,13,128,128,128,0,0,3,8,13,128,128]
5531; AVX-NEXT:    # xmm3 = mem[0,0]
5532; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
5533; AVX-NEXT:    vpshufb %xmm3, %xmm6, %xmm6
5534; AVX-NEXT:    vpor %xmm4, %xmm6, %xmm4
5535; AVX-NEXT:    vpshufb %xmm7, %xmm4, %xmm4
5536; AVX-NEXT:    vpshufb {{.*#+}} xmm7 = xmm15[4,9,14],zero,zero,zero,zero,zero,zero,xmm15[u,u,u,u,u,u,u]
5537; AVX-NEXT:    vpor %xmm7, %xmm4, %xmm4
5538; AVX-NEXT:    vmovdqa (%rsp), %xmm6 # 16-byte Reload
5539; AVX-NEXT:    vpshufb {{.*#+}} xmm10 = xmm6[u,u,u,u,u,u],zero,zero,zero,xmm6[1,6,11,u,u,u,u]
5540; AVX-NEXT:    vmovq {{.*#+}} xmm7 = [128,128,128,3,8,13,0,0,0,0,0,0,0,0,0,0]
5541; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
5542; AVX-NEXT:    vpshufb %xmm7, %xmm12, %xmm12
5543; AVX-NEXT:    vpblendw {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3,4,5],xmm12[6,7]
5544; AVX-NEXT:    vmovddup {{.*#+}} xmm12 = [12,128,128,128,0,0,2,7,12,128,128,128,0,0,2,7]
5545; AVX-NEXT:    # xmm12 = mem[0,0]
5546; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
5547; AVX-NEXT:    vpshufb %xmm12, %xmm13, %xmm13
5548; AVX-NEXT:    vmovq {{.*#+}} xmm6 = [4,9,14,128,128,128,0,0,0,0,0,0,0,0,0,0]
5549; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
5550; AVX-NEXT:    vpshufb %xmm6, %xmm15, %xmm15
5551; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3,4,5],xmm15[6,7]
5552; AVX-NEXT:    vpor %xmm10, %xmm13, %xmm10
5553; AVX-NEXT:    vpblendvb %xmm8, %xmm4, %xmm2, %xmm2
5554; AVX-NEXT:    vbroadcastss {{.*#+}} xmm13 = [0,5,10,15,0,5,10,15,0,5,10,15,0,5,10,15]
5555; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
5556; AVX-NEXT:    vpshufb %xmm13, %xmm15, %xmm15
5557; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm15, %ymm4
5558; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm10[0,1,2],ymm4[3,4,5,6,7]
5559; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm4, %ymm2
5560; AVX-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
5561; AVX-NEXT:    vpshufb %xmm1, %xmm9, %xmm1
5562; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
5563; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5564; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[2,7,12,u,u,u,u,u,u,u]
5565; AVX-NEXT:    vpshufb %xmm3, %xmm11, %xmm3
5566; AVX-NEXT:    vpor %xmm1, %xmm3, %xmm1
5567; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,4,5,6,7,8,u,u,u,u,u,u,u]
5568; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm14[4,9,14],zero,zero,zero,zero,zero,zero,xmm14[u,u,u,u,u,u,u]
5569; AVX-NEXT:    vpor %xmm3, %xmm1, %xmm1
5570; AVX-NEXT:    vpblendvb %xmm8, %xmm1, %xmm0, %xmm0
5571; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5572; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,zero,xmm3[1,6,11,u,u,u,u]
5573; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5574; AVX-NEXT:    vpshufb %xmm7, %xmm4, %xmm4
5575; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4,5],xmm4[6,7]
5576; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5577; AVX-NEXT:    vpshufb %xmm12, %xmm4, %xmm4
5578; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
5579; AVX-NEXT:    vpshufb %xmm6, %xmm5, %xmm5
5580; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4,5],xmm5[6,7]
5581; AVX-NEXT:    vpor %xmm3, %xmm4, %xmm3
5582; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5583; AVX-NEXT:    vpshufb %xmm13, %xmm4, %xmm4
5584; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm4, %ymm1
5585; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7]
5586; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
5587; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5588; AVX-NEXT:    vmovaps %ymm1, 32(%rsi)
5589; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5590; AVX-NEXT:    vmovaps %ymm1, (%rsi)
5591; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5592; AVX-NEXT:    vmovaps %ymm1, 32(%rdx)
5593; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5594; AVX-NEXT:    vmovaps %ymm1, (%rdx)
5595; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5596; AVX-NEXT:    vmovaps %ymm1, 32(%rcx)
5597; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5598; AVX-NEXT:    vmovaps %ymm1, (%rcx)
5599; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5600; AVX-NEXT:    vmovaps %ymm1, 32(%r8)
5601; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5602; AVX-NEXT:    vmovaps %ymm1, (%r8)
5603; AVX-NEXT:    vmovaps %ymm0, 32(%r9)
5604; AVX-NEXT:    vmovaps %ymm2, (%r9)
5605; AVX-NEXT:    addq $488, %rsp # imm = 0x1E8
5606; AVX-NEXT:    vzeroupper
5607; AVX-NEXT:    retq
5608;
5609; AVX2-LABEL: load_i8_stride5_vf64:
5610; AVX2:       # %bb.0:
5611; AVX2-NEXT:    subq $136, %rsp
5612; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm2
5613; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm4
5614; AVX2-NEXT:    vmovdqa 224(%rdi), %ymm10
5615; AVX2-NEXT:    vmovdqa 256(%rdi), %ymm9
5616; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm12 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535]
5617; AVX2-NEXT:    vpblendvb %ymm12, %ymm10, %ymm9, %ymm0
5618; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
5619; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255]
5620; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
5621; AVX2-NEXT:    vpblendvb %ymm3, %ymm0, %ymm1, %ymm15
5622; AVX2-NEXT:    vpblendvb %ymm12, %ymm2, %ymm4, %ymm0
5623; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
5624; AVX2-NEXT:    vpblendvb %ymm3, %ymm0, %ymm1, %ymm7
5625; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm5 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
5626; AVX2-NEXT:    vpblendvb %ymm5, %ymm9, %ymm10, %ymm0
5627; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
5628; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0]
5629; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
5630; AVX2-NEXT:    vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
5631; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5632; AVX2-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm0
5633; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
5634; AVX2-NEXT:    vpblendvb %ymm3, %ymm0, %ymm1, %ymm11
5635; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535]
5636; AVX2-NEXT:    vpblendvb %ymm5, %ymm9, %ymm10, %ymm0
5637; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
5638; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255]
5639; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
5640; AVX2-NEXT:    vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
5641; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5642; AVX2-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm0
5643; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5644; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5645; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
5646; AVX2-NEXT:    vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
5647; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5648; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm5 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0]
5649; AVX2-NEXT:    vpblendvb %ymm5, %ymm9, %ymm10, %ymm0
5650; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
5651; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0]
5652; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
5653; AVX2-NEXT:    vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
5654; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5655; AVX2-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm0
5656; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
5657; AVX2-NEXT:    vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
5658; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5659; AVX2-NEXT:    vmovdqa 160(%rdi), %ymm13
5660; AVX2-NEXT:    vmovdqa 192(%rdi), %ymm14
5661; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
5662; AVX2-NEXT:    vpblendvb %ymm6, %ymm13, %ymm14, %ymm0
5663; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
5664; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = [128,128,128,128,4,9,14,128,128,128,2,7,12,u,u,u]
5665; AVX2-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
5666; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,5,10,15,128,128,128,3,8,13,128,128,128,u,u,u]
5667; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
5668; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm1
5669; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11,0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11]
5670; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
5671; AVX2-NEXT:    vpshufb %ymm3, %ymm15, %ymm15
5672; AVX2-NEXT:    vpmovsxwd {{.*#+}} xmm2 = [4294967295,4294967295,4294967295,255]
5673; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm15, %ymm0
5674; AVX2-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
5675; AVX2-NEXT:    vpshufb %ymm3, %ymm7, %ymm7
5676; AVX2-NEXT:    vmovdqa (%rdi), %ymm3
5677; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
5678; AVX2-NEXT:    vpblendvb %ymm6, %ymm3, %ymm1, %ymm0
5679; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm8
5680; AVX2-NEXT:    vpshufb %xmm5, %xmm8, %xmm5
5681; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
5682; AVX2-NEXT:    vpor %xmm5, %xmm0, %xmm0
5683; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm7, %ymm15
5684; AVX2-NEXT:    vpblendvb %ymm12, %ymm13, %ymm14, %ymm0
5685; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = [1,6,11,128,128,128,128,4,9,14,128,128,128,u,u,u]
5686; AVX2-NEXT:    vpshufb %xmm6, %xmm0, %xmm5
5687; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
5688; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [128,128,128,0,5,10,15,128,128,128,3,8,13,u,u,u]
5689; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
5690; AVX2-NEXT:    vpor %xmm5, %xmm0, %xmm0
5691; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [1,6,11,0,5,10,15,4,9,14,0,0,0,2,7,12,1,6,11,0,5,10,15,4,9,14,0,0,0,2,7,12]
5692; AVX2-NEXT:    # ymm5 = mem[0,1,0,1]
5693; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
5694; AVX2-NEXT:    vpshufb %ymm5, %ymm8, %ymm8
5695; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm8, %ymm8
5696; AVX2-NEXT:    vpshufb %ymm5, %ymm11, %ymm0
5697; AVX2-NEXT:    vpblendvb %ymm12, %ymm3, %ymm1, %ymm5
5698; AVX2-NEXT:    vpshufb %xmm6, %xmm5, %xmm6
5699; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm5
5700; AVX2-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
5701; AVX2-NEXT:    vpor %xmm6, %xmm4, %xmm4
5702; AVX2-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm11
5703; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
5704; AVX2-NEXT:    vpblendvb %ymm7, %ymm14, %ymm13, %ymm0
5705; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
5706; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = [128,128,128,1,6,11,128,128,128,128,4,9,14,u,u,u]
5707; AVX2-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
5708; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = [2,7,12,128,128,128,0,5,10,15,128,128,128,u,u,u]
5709; AVX2-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
5710; AVX2-NEXT:    vpor %xmm4, %xmm0, %xmm0
5711; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [2,7,12,1,6,11,0,5,10,15,0,0,0,3,8,13,2,7,12,1,6,11,0,5,10,15,0,0,0,3,8,13]
5712; AVX2-NEXT:    # ymm4 = mem[0,1,0,1]
5713; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
5714; AVX2-NEXT:    vpshufb %ymm4, %ymm12, %ymm12
5715; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm12, %ymm0
5716; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5717; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5718; AVX2-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
5719; AVX2-NEXT:    vpblendvb %ymm7, %ymm1, %ymm3, %ymm4
5720; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm12
5721; AVX2-NEXT:    vpshufb %xmm5, %xmm12, %xmm5
5722; AVX2-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
5723; AVX2-NEXT:    vpor %xmm5, %xmm4, %xmm4
5724; AVX2-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
5725; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5726; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535]
5727; AVX2-NEXT:    vpblendvb %ymm7, %ymm14, %ymm13, %ymm0
5728; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [3,8,13,128,128,128,1,6,11,128,128,128,128,u,u,u]
5729; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm5
5730; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
5731; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = [128,128,128,2,7,12,128,128,128,0,5,10,15,u,u,u]
5732; AVX2-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
5733; AVX2-NEXT:    vpor %xmm5, %xmm0, %xmm0
5734; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [3,8,13,2,7,12,1,6,11,0,0,0,0,4,9,14,3,8,13,2,7,12,1,6,11,0,0,0,0,4,9,14]
5735; AVX2-NEXT:    # ymm5 = mem[0,1,0,1]
5736; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
5737; AVX2-NEXT:    vpshufb %ymm5, %ymm12, %ymm12
5738; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm12, %ymm0
5739; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5740; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5741; AVX2-NEXT:    vpshufb %ymm5, %ymm0, %ymm0
5742; AVX2-NEXT:    vpblendvb %ymm7, %ymm1, %ymm3, %ymm5
5743; AVX2-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
5744; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm5
5745; AVX2-NEXT:    vpshufb %xmm6, %xmm5, %xmm5
5746; AVX2-NEXT:    vpor %xmm4, %xmm5, %xmm4
5747; AVX2-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm6
5748; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm4 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
5749; AVX2-NEXT:    vpblendvb %ymm4, %ymm10, %ymm9, %ymm0
5750; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5751; AVX2-NEXT:    vpblendvb %ymm4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
5752; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
5753; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255]
5754; AVX2-NEXT:    # ymm5 = mem[0,1,0,1]
5755; AVX2-NEXT:    vpblendvb %ymm5, %ymm0, %ymm4, %ymm10
5756; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm2[2,3,0,1]
5757; AVX2-NEXT:    vpblendvb %ymm5, %ymm2, %ymm0, %ymm0
5758; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5759; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm4 = [0,0,128,128,128,1,6,11,0,0,128,128,128,1,6,11]
5760; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0]
5761; AVX2-NEXT:    vpblendvb %ymm0, %ymm14, %ymm13, %ymm13
5762; AVX2-NEXT:    vmovdqa 304(%rdi), %xmm2
5763; AVX2-NEXT:    vpblendvb %ymm0, %ymm1, %ymm3, %ymm9
5764; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm0
5765; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm12 = [0,0,2,7,12,128,128,128,0,0,2,7,12,128,128,128]
5766; AVX2-NEXT:    vmovdqa 288(%rdi), %xmm1
5767; AVX2-NEXT:    vpshufb %xmm12, %xmm1, %xmm3
5768; AVX2-NEXT:    vpor %xmm0, %xmm3, %xmm0
5769; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
5770; AVX2-NEXT:    vmovdqu (%rsp), %ymm3 # 32-byte Reload
5771; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7],ymm3[8,9,10,11,12],ymm0[13,14,15]
5772; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
5773; AVX2-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
5774; AVX2-NEXT:    vmovdqa 144(%rdi), %xmm3
5775; AVX2-NEXT:    vpshufb %xmm4, %xmm3, %xmm4
5776; AVX2-NEXT:    vmovdqa 128(%rdi), %xmm5
5777; AVX2-NEXT:    vpshufb %xmm12, %xmm5, %xmm12
5778; AVX2-NEXT:    vpor %xmm4, %xmm12, %xmm4
5779; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
5780; AVX2-NEXT:    vpblendw {{.*#+}} ymm4 = ymm15[0,1,2,3,4],ymm4[5,6,7],ymm15[8,9,10,11,12],ymm4[13,14,15]
5781; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm4[4,5,6,7]
5782; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5783; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm12 = [0,0,128,128,128,2,7,12,0,0,128,128,128,2,7,12]
5784; AVX2-NEXT:    vpshufb %xmm12, %xmm2, %xmm7
5785; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm14 = [0,0,3,8,13,128,128,128,0,0,3,8,13,128,128,128]
5786; AVX2-NEXT:    vpshufb %xmm14, %xmm1, %xmm15
5787; AVX2-NEXT:    vpor %xmm7, %xmm15, %xmm7
5788; AVX2-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
5789; AVX2-NEXT:    vpblendw {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7],ymm8[8,9,10,11,12],ymm7[13,14,15]
5790; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7]
5791; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5792; AVX2-NEXT:    vpshufb %xmm12, %xmm3, %xmm8
5793; AVX2-NEXT:    vpshufb %xmm14, %xmm5, %xmm12
5794; AVX2-NEXT:    vpor %xmm8, %xmm12, %xmm8
5795; AVX2-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
5796; AVX2-NEXT:    vpblendw {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5,6,7],ymm11[8,9,10,11,12],ymm8[13,14,15]
5797; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7]
5798; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm12 = [0,128,128,128,128,4,9,14,0,128,128,128,128,4,9,14]
5799; AVX2-NEXT:    vpshufb %xmm12, %xmm2, %xmm11
5800; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm15 = [0,0,5,10,15,128,128,128,0,0,5,10,15,128,128,128]
5801; AVX2-NEXT:    vpshufb %xmm15, %xmm1, %xmm14
5802; AVX2-NEXT:    vpor %xmm11, %xmm14, %xmm11
5803; AVX2-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
5804; AVX2-NEXT:    vpmovsxwq {{.*#+}} ymm14 = [18446744073709551615,18446744073709551615,18446744073709551615,255]
5805; AVX2-NEXT:    vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
5806; AVX2-NEXT:    vpshufb %xmm12, %xmm3, %xmm12
5807; AVX2-NEXT:    vpshufb %xmm15, %xmm5, %xmm15
5808; AVX2-NEXT:    vpor %xmm12, %xmm15, %xmm12
5809; AVX2-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
5810; AVX2-NEXT:    vpblendvb %ymm14, %ymm6, %ymm12, %ymm12
5811; AVX2-NEXT:    vextracti128 $1, %ymm13, %xmm15
5812; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = [128,128,128,3,8,13,128,128,128,1,6,11,u,u,u,u]
5813; AVX2-NEXT:    vpshufb %xmm6, %xmm15, %xmm15
5814; AVX2-NEXT:    vmovdqa {{.*#+}} xmm7 = [4,9,14,128,128,128,2,7,12,128,128,128,u,u,u,u]
5815; AVX2-NEXT:    vpshufb %xmm7, %xmm13, %xmm13
5816; AVX2-NEXT:    vpor %xmm15, %xmm13, %xmm13
5817; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [4,9,14,3,8,13,2,7,12,0,0,0,0,5,10,15,4,9,14,3,8,13,2,7,12,0,0,0,0,5,10,15]
5818; AVX2-NEXT:    # ymm15 = mem[0,1,0,1]
5819; AVX2-NEXT:    vpshufb %ymm15, %ymm10, %ymm10
5820; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm13[0,1,2],ymm10[3,4,5,6,7]
5821; AVX2-NEXT:    vmovdqa 288(%rdi), %ymm13
5822; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15]
5823; AVX2-NEXT:    vpshufb %ymm4, %ymm13, %ymm13
5824; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [0,5,0,5,0,5,0,5]
5825; AVX2-NEXT:    vpermd %ymm13, %ymm0, %ymm13
5826; AVX2-NEXT:    vpblendvb %ymm14, %ymm10, %ymm13, %ymm10
5827; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
5828; AVX2-NEXT:    vpshufb %ymm15, %ymm13, %ymm13
5829; AVX2-NEXT:    vextracti128 $1, %ymm9, %xmm15
5830; AVX2-NEXT:    vpshufb %xmm6, %xmm15, %xmm6
5831; AVX2-NEXT:    vpshufb %xmm7, %xmm9, %xmm7
5832; AVX2-NEXT:    vpor %xmm6, %xmm7, %xmm6
5833; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3,4,5,6,7]
5834; AVX2-NEXT:    vmovdqa 128(%rdi), %ymm7
5835; AVX2-NEXT:    vpshufb %ymm4, %ymm7, %ymm4
5836; AVX2-NEXT:    vpermd %ymm4, %ymm0, %ymm0
5837; AVX2-NEXT:    vpblendvb %ymm14, %ymm6, %ymm0, %ymm0
5838; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm4 = [0,0,128,128,128,3,8,13,0,0,128,128,128,3,8,13]
5839; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
5840; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm6 = [0,0,4,9,14,128,128,128,0,0,4,9,14,128,128,128]
5841; AVX2-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
5842; AVX2-NEXT:    vpor %xmm2, %xmm1, %xmm1
5843; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
5844; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5845; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7],ymm2[8,9,10,11,12],ymm1[13,14,15]
5846; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
5847; AVX2-NEXT:    vpshufb %xmm4, %xmm3, %xmm2
5848; AVX2-NEXT:    vpshufb %xmm6, %xmm5, %xmm3
5849; AVX2-NEXT:    vpor %xmm2, %xmm3, %xmm2
5850; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
5851; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5852; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7],ymm3[8,9,10,11,12],ymm2[13,14,15]
5853; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
5854; AVX2-NEXT:    vmovups (%rsp), %ymm3 # 32-byte Reload
5855; AVX2-NEXT:    vmovaps %ymm3, 32(%rsi)
5856; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5857; AVX2-NEXT:    vmovaps %ymm3, (%rsi)
5858; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5859; AVX2-NEXT:    vmovaps %ymm3, 32(%rdx)
5860; AVX2-NEXT:    vmovdqa %ymm8, (%rdx)
5861; AVX2-NEXT:    vmovdqa %ymm1, 32(%rcx)
5862; AVX2-NEXT:    vmovdqa %ymm2, (%rcx)
5863; AVX2-NEXT:    vmovdqa %ymm11, 32(%r8)
5864; AVX2-NEXT:    vmovdqa %ymm12, (%r8)
5865; AVX2-NEXT:    vmovdqa %ymm10, 32(%r9)
5866; AVX2-NEXT:    vmovdqa %ymm0, (%r9)
5867; AVX2-NEXT:    addq $136, %rsp
5868; AVX2-NEXT:    vzeroupper
5869; AVX2-NEXT:    retq
5870;
5871; AVX2-FP-LABEL: load_i8_stride5_vf64:
5872; AVX2-FP:       # %bb.0:
5873; AVX2-FP-NEXT:    subq $136, %rsp
5874; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %ymm2
5875; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm4
5876; AVX2-FP-NEXT:    vmovdqa 224(%rdi), %ymm10
5877; AVX2-FP-NEXT:    vmovdqa 256(%rdi), %ymm9
5878; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm12 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535]
5879; AVX2-FP-NEXT:    vpblendvb %ymm12, %ymm10, %ymm9, %ymm0
5880; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
5881; AVX2-FP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255]
5882; AVX2-FP-NEXT:    # ymm3 = mem[0,1,0,1]
5883; AVX2-FP-NEXT:    vpblendvb %ymm3, %ymm0, %ymm1, %ymm15
5884; AVX2-FP-NEXT:    vpblendvb %ymm12, %ymm2, %ymm4, %ymm0
5885; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
5886; AVX2-FP-NEXT:    vpblendvb %ymm3, %ymm0, %ymm1, %ymm7
5887; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm5 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
5888; AVX2-FP-NEXT:    vpblendvb %ymm5, %ymm9, %ymm10, %ymm0
5889; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
5890; AVX2-FP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0]
5891; AVX2-FP-NEXT:    # ymm3 = mem[0,1,0,1]
5892; AVX2-FP-NEXT:    vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
5893; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5894; AVX2-FP-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm0
5895; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
5896; AVX2-FP-NEXT:    vpblendvb %ymm3, %ymm0, %ymm1, %ymm11
5897; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535]
5898; AVX2-FP-NEXT:    vpblendvb %ymm5, %ymm9, %ymm10, %ymm0
5899; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
5900; AVX2-FP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255]
5901; AVX2-FP-NEXT:    # ymm3 = mem[0,1,0,1]
5902; AVX2-FP-NEXT:    vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
5903; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5904; AVX2-FP-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm0
5905; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5906; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5907; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
5908; AVX2-FP-NEXT:    vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
5909; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5910; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm5 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0]
5911; AVX2-FP-NEXT:    vpblendvb %ymm5, %ymm9, %ymm10, %ymm0
5912; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
5913; AVX2-FP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0]
5914; AVX2-FP-NEXT:    # ymm3 = mem[0,1,0,1]
5915; AVX2-FP-NEXT:    vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
5916; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5917; AVX2-FP-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm0
5918; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
5919; AVX2-FP-NEXT:    vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
5920; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5921; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %ymm13
5922; AVX2-FP-NEXT:    vmovdqa 192(%rdi), %ymm14
5923; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
5924; AVX2-FP-NEXT:    vpblendvb %ymm6, %ymm13, %ymm14, %ymm0
5925; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm1
5926; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm5 = [128,128,128,128,4,9,14,128,128,128,2,7,12,u,u,u]
5927; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
5928; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,5,10,15,128,128,128,3,8,13,128,128,128,u,u,u]
5929; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
5930; AVX2-FP-NEXT:    vpor %xmm1, %xmm0, %xmm1
5931; AVX2-FP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11,0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11]
5932; AVX2-FP-NEXT:    # ymm3 = mem[0,1,0,1]
5933; AVX2-FP-NEXT:    vpshufb %ymm3, %ymm15, %ymm15
5934; AVX2-FP-NEXT:    vpmovsxwd {{.*#+}} xmm2 = [4294967295,4294967295,4294967295,255]
5935; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm1, %ymm15, %ymm0
5936; AVX2-FP-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
5937; AVX2-FP-NEXT:    vpshufb %ymm3, %ymm7, %ymm7
5938; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm3
5939; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm1
5940; AVX2-FP-NEXT:    vpblendvb %ymm6, %ymm3, %ymm1, %ymm0
5941; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm8
5942; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm8, %xmm5
5943; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
5944; AVX2-FP-NEXT:    vpor %xmm5, %xmm0, %xmm0
5945; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm7, %ymm15
5946; AVX2-FP-NEXT:    vpblendvb %ymm12, %ymm13, %ymm14, %ymm0
5947; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm6 = [1,6,11,128,128,128,128,4,9,14,128,128,128,u,u,u]
5948; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm0, %xmm5
5949; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm0
5950; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm4 = [128,128,128,0,5,10,15,128,128,128,3,8,13,u,u,u]
5951; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
5952; AVX2-FP-NEXT:    vpor %xmm5, %xmm0, %xmm0
5953; AVX2-FP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [1,6,11,0,5,10,15,4,9,14,0,0,0,2,7,12,1,6,11,0,5,10,15,4,9,14,0,0,0,2,7,12]
5954; AVX2-FP-NEXT:    # ymm5 = mem[0,1,0,1]
5955; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
5956; AVX2-FP-NEXT:    vpshufb %ymm5, %ymm8, %ymm8
5957; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm8, %ymm8
5958; AVX2-FP-NEXT:    vpshufb %ymm5, %ymm11, %ymm0
5959; AVX2-FP-NEXT:    vpblendvb %ymm12, %ymm3, %ymm1, %ymm5
5960; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm5, %xmm6
5961; AVX2-FP-NEXT:    vextracti128 $1, %ymm5, %xmm5
5962; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
5963; AVX2-FP-NEXT:    vpor %xmm6, %xmm4, %xmm4
5964; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm11
5965; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
5966; AVX2-FP-NEXT:    vpblendvb %ymm7, %ymm14, %ymm13, %ymm0
5967; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm4
5968; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm5 = [128,128,128,1,6,11,128,128,128,128,4,9,14,u,u,u]
5969; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
5970; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm6 = [2,7,12,128,128,128,0,5,10,15,128,128,128,u,u,u]
5971; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
5972; AVX2-FP-NEXT:    vpor %xmm4, %xmm0, %xmm0
5973; AVX2-FP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [2,7,12,1,6,11,0,5,10,15,0,0,0,3,8,13,2,7,12,1,6,11,0,5,10,15,0,0,0,3,8,13]
5974; AVX2-FP-NEXT:    # ymm4 = mem[0,1,0,1]
5975; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
5976; AVX2-FP-NEXT:    vpshufb %ymm4, %ymm12, %ymm12
5977; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm12, %ymm0
5978; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5979; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5980; AVX2-FP-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
5981; AVX2-FP-NEXT:    vpblendvb %ymm7, %ymm1, %ymm3, %ymm4
5982; AVX2-FP-NEXT:    vextracti128 $1, %ymm4, %xmm12
5983; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm12, %xmm5
5984; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
5985; AVX2-FP-NEXT:    vpor %xmm5, %xmm4, %xmm4
5986; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
5987; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5988; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535]
5989; AVX2-FP-NEXT:    vpblendvb %ymm7, %ymm14, %ymm13, %ymm0
5990; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm4 = [3,8,13,128,128,128,1,6,11,128,128,128,128,u,u,u]
5991; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm0, %xmm5
5992; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm0
5993; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm6 = [128,128,128,2,7,12,128,128,128,0,5,10,15,u,u,u]
5994; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
5995; AVX2-FP-NEXT:    vpor %xmm5, %xmm0, %xmm0
5996; AVX2-FP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [3,8,13,2,7,12,1,6,11,0,0,0,0,4,9,14,3,8,13,2,7,12,1,6,11,0,0,0,0,4,9,14]
5997; AVX2-FP-NEXT:    # ymm5 = mem[0,1,0,1]
5998; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
5999; AVX2-FP-NEXT:    vpshufb %ymm5, %ymm12, %ymm12
6000; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm12, %ymm0
6001; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6002; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6003; AVX2-FP-NEXT:    vpshufb %ymm5, %ymm0, %ymm0
6004; AVX2-FP-NEXT:    vpblendvb %ymm7, %ymm1, %ymm3, %ymm5
6005; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
6006; AVX2-FP-NEXT:    vextracti128 $1, %ymm5, %xmm5
6007; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm5, %xmm5
6008; AVX2-FP-NEXT:    vpor %xmm4, %xmm5, %xmm4
6009; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm6
6010; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm4 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
6011; AVX2-FP-NEXT:    vpblendvb %ymm4, %ymm10, %ymm9, %ymm0
6012; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
6013; AVX2-FP-NEXT:    vpblendvb %ymm4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
6014; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
6015; AVX2-FP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255]
6016; AVX2-FP-NEXT:    # ymm5 = mem[0,1,0,1]
6017; AVX2-FP-NEXT:    vpblendvb %ymm5, %ymm0, %ymm4, %ymm10
6018; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm2[2,3,0,1]
6019; AVX2-FP-NEXT:    vpblendvb %ymm5, %ymm2, %ymm0, %ymm0
6020; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6021; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} xmm4 = [0,0,128,128,128,1,6,11,0,0,128,128,128,1,6,11]
6022; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0]
6023; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm14, %ymm13, %ymm13
6024; AVX2-FP-NEXT:    vmovdqa 304(%rdi), %xmm2
6025; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm1, %ymm3, %ymm9
6026; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm2, %xmm0
6027; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} xmm12 = [0,0,2,7,12,128,128,128,0,0,2,7,12,128,128,128]
6028; AVX2-FP-NEXT:    vmovdqa 288(%rdi), %xmm1
6029; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm1, %xmm3
6030; AVX2-FP-NEXT:    vpor %xmm0, %xmm3, %xmm0
6031; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
6032; AVX2-FP-NEXT:    vmovdqu (%rsp), %ymm3 # 32-byte Reload
6033; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7],ymm3[8,9,10,11,12],ymm0[13,14,15]
6034; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
6035; AVX2-FP-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
6036; AVX2-FP-NEXT:    vmovdqa 144(%rdi), %xmm3
6037; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm3, %xmm4
6038; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %xmm5
6039; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm5, %xmm12
6040; AVX2-FP-NEXT:    vpor %xmm4, %xmm12, %xmm4
6041; AVX2-FP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
6042; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm15[0,1,2,3,4],ymm4[5,6,7],ymm15[8,9,10,11,12],ymm4[13,14,15]
6043; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm4[4,5,6,7]
6044; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6045; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} xmm12 = [0,0,128,128,128,2,7,12,0,0,128,128,128,2,7,12]
6046; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm2, %xmm7
6047; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} xmm14 = [0,0,3,8,13,128,128,128,0,0,3,8,13,128,128,128]
6048; AVX2-FP-NEXT:    vpshufb %xmm14, %xmm1, %xmm15
6049; AVX2-FP-NEXT:    vpor %xmm7, %xmm15, %xmm7
6050; AVX2-FP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
6051; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7],ymm8[8,9,10,11,12],ymm7[13,14,15]
6052; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7]
6053; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6054; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm3, %xmm8
6055; AVX2-FP-NEXT:    vpshufb %xmm14, %xmm5, %xmm12
6056; AVX2-FP-NEXT:    vpor %xmm8, %xmm12, %xmm8
6057; AVX2-FP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
6058; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5,6,7],ymm11[8,9,10,11,12],ymm8[13,14,15]
6059; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7]
6060; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} xmm12 = [0,128,128,128,128,4,9,14,0,128,128,128,128,4,9,14]
6061; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm2, %xmm11
6062; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} xmm15 = [0,0,5,10,15,128,128,128,0,0,5,10,15,128,128,128]
6063; AVX2-FP-NEXT:    vpshufb %xmm15, %xmm1, %xmm14
6064; AVX2-FP-NEXT:    vpor %xmm11, %xmm14, %xmm11
6065; AVX2-FP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
6066; AVX2-FP-NEXT:    vpmovsxwq {{.*#+}} ymm14 = [18446744073709551615,18446744073709551615,18446744073709551615,255]
6067; AVX2-FP-NEXT:    vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
6068; AVX2-FP-NEXT:    vpshufb %xmm12, %xmm3, %xmm12
6069; AVX2-FP-NEXT:    vpshufb %xmm15, %xmm5, %xmm15
6070; AVX2-FP-NEXT:    vpor %xmm12, %xmm15, %xmm12
6071; AVX2-FP-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
6072; AVX2-FP-NEXT:    vpblendvb %ymm14, %ymm6, %ymm12, %ymm12
6073; AVX2-FP-NEXT:    vextracti128 $1, %ymm13, %xmm15
6074; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm6 = [128,128,128,3,8,13,128,128,128,1,6,11,u,u,u,u]
6075; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm15, %xmm15
6076; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm7 = [4,9,14,128,128,128,2,7,12,128,128,128,u,u,u,u]
6077; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm13, %xmm13
6078; AVX2-FP-NEXT:    vpor %xmm15, %xmm13, %xmm13
6079; AVX2-FP-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [4,9,14,3,8,13,2,7,12,0,0,0,0,5,10,15,4,9,14,3,8,13,2,7,12,0,0,0,0,5,10,15]
6080; AVX2-FP-NEXT:    # ymm15 = mem[0,1,0,1]
6081; AVX2-FP-NEXT:    vpshufb %ymm15, %ymm10, %ymm10
6082; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm13[0,1,2],ymm10[3,4,5,6,7]
6083; AVX2-FP-NEXT:    vmovdqa 288(%rdi), %ymm13
6084; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15]
6085; AVX2-FP-NEXT:    vpshufb %ymm4, %ymm13, %ymm13
6086; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [0,5,0,5,0,5,0,5]
6087; AVX2-FP-NEXT:    vpermd %ymm13, %ymm0, %ymm13
6088; AVX2-FP-NEXT:    vpblendvb %ymm14, %ymm10, %ymm13, %ymm10
6089; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
6090; AVX2-FP-NEXT:    vpshufb %ymm15, %ymm13, %ymm13
6091; AVX2-FP-NEXT:    vextracti128 $1, %ymm9, %xmm15
6092; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm15, %xmm6
6093; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm9, %xmm7
6094; AVX2-FP-NEXT:    vpor %xmm6, %xmm7, %xmm6
6095; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3,4,5,6,7]
6096; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %ymm7
6097; AVX2-FP-NEXT:    vpshufb %ymm4, %ymm7, %ymm4
6098; AVX2-FP-NEXT:    vpermd %ymm4, %ymm0, %ymm0
6099; AVX2-FP-NEXT:    vpblendvb %ymm14, %ymm6, %ymm0, %ymm0
6100; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} xmm4 = [0,0,128,128,128,3,8,13,0,0,128,128,128,3,8,13]
6101; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
6102; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} xmm6 = [0,0,4,9,14,128,128,128,0,0,4,9,14,128,128,128]
6103; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
6104; AVX2-FP-NEXT:    vpor %xmm2, %xmm1, %xmm1
6105; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
6106; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
6107; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7],ymm2[8,9,10,11,12],ymm1[13,14,15]
6108; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
6109; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm3, %xmm2
6110; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm5, %xmm3
6111; AVX2-FP-NEXT:    vpor %xmm2, %xmm3, %xmm2
6112; AVX2-FP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
6113; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6114; AVX2-FP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7],ymm3[8,9,10,11,12],ymm2[13,14,15]
6115; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
6116; AVX2-FP-NEXT:    vmovups (%rsp), %ymm3 # 32-byte Reload
6117; AVX2-FP-NEXT:    vmovaps %ymm3, 32(%rsi)
6118; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6119; AVX2-FP-NEXT:    vmovaps %ymm3, (%rsi)
6120; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6121; AVX2-FP-NEXT:    vmovaps %ymm3, 32(%rdx)
6122; AVX2-FP-NEXT:    vmovdqa %ymm8, (%rdx)
6123; AVX2-FP-NEXT:    vmovdqa %ymm1, 32(%rcx)
6124; AVX2-FP-NEXT:    vmovdqa %ymm2, (%rcx)
6125; AVX2-FP-NEXT:    vmovdqa %ymm11, 32(%r8)
6126; AVX2-FP-NEXT:    vmovdqa %ymm12, (%r8)
6127; AVX2-FP-NEXT:    vmovdqa %ymm10, 32(%r9)
6128; AVX2-FP-NEXT:    vmovdqa %ymm0, (%r9)
6129; AVX2-FP-NEXT:    addq $136, %rsp
6130; AVX2-FP-NEXT:    vzeroupper
6131; AVX2-FP-NEXT:    retq
6132;
6133; AVX2-FCP-LABEL: load_i8_stride5_vf64:
6134; AVX2-FCP:       # %bb.0:
6135; AVX2-FCP-NEXT:    subq $136, %rsp
6136; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %ymm2
6137; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm4
6138; AVX2-FCP-NEXT:    vmovdqa 224(%rdi), %ymm10
6139; AVX2-FCP-NEXT:    vmovdqa 256(%rdi), %ymm9
6140; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm12 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535]
6141; AVX2-FCP-NEXT:    vpblendvb %ymm12, %ymm10, %ymm9, %ymm0
6142; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
6143; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255]
6144; AVX2-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
6145; AVX2-FCP-NEXT:    vpblendvb %ymm3, %ymm0, %ymm1, %ymm15
6146; AVX2-FCP-NEXT:    vpblendvb %ymm12, %ymm2, %ymm4, %ymm0
6147; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
6148; AVX2-FCP-NEXT:    vpblendvb %ymm3, %ymm0, %ymm1, %ymm7
6149; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm5 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
6150; AVX2-FCP-NEXT:    vpblendvb %ymm5, %ymm9, %ymm10, %ymm0
6151; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
6152; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0]
6153; AVX2-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
6154; AVX2-FCP-NEXT:    vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
6155; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6156; AVX2-FCP-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm0
6157; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
6158; AVX2-FCP-NEXT:    vpblendvb %ymm3, %ymm0, %ymm1, %ymm11
6159; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535]
6160; AVX2-FCP-NEXT:    vpblendvb %ymm5, %ymm9, %ymm10, %ymm0
6161; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
6162; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255]
6163; AVX2-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
6164; AVX2-FCP-NEXT:    vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
6165; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6166; AVX2-FCP-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm0
6167; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6168; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6169; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
6170; AVX2-FCP-NEXT:    vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
6171; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6172; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm5 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0]
6173; AVX2-FCP-NEXT:    vpblendvb %ymm5, %ymm9, %ymm10, %ymm0
6174; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
6175; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0]
6176; AVX2-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
6177; AVX2-FCP-NEXT:    vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
6178; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6179; AVX2-FCP-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm0
6180; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
6181; AVX2-FCP-NEXT:    vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
6182; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6183; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %ymm13
6184; AVX2-FCP-NEXT:    vmovdqa 192(%rdi), %ymm14
6185; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
6186; AVX2-FCP-NEXT:    vpblendvb %ymm6, %ymm13, %ymm14, %ymm0
6187; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
6188; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [128,128,128,128,4,9,14,128,128,128,2,7,12,u,u,u]
6189; AVX2-FCP-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
6190; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,5,10,15,128,128,128,3,8,13,128,128,128,u,u,u]
6191; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
6192; AVX2-FCP-NEXT:    vpor %xmm1, %xmm0, %xmm1
6193; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11,0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11]
6194; AVX2-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
6195; AVX2-FCP-NEXT:    vpshufb %ymm3, %ymm15, %ymm15
6196; AVX2-FCP-NEXT:    vpmovsxwd {{.*#+}} xmm2 = [4294967295,4294967295,4294967295,255]
6197; AVX2-FCP-NEXT:    vpblendvb %ymm2, %ymm1, %ymm15, %ymm0
6198; AVX2-FCP-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
6199; AVX2-FCP-NEXT:    vpshufb %ymm3, %ymm7, %ymm7
6200; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm3
6201; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
6202; AVX2-FCP-NEXT:    vpblendvb %ymm6, %ymm3, %ymm1, %ymm0
6203; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm8
6204; AVX2-FCP-NEXT:    vpshufb %xmm5, %xmm8, %xmm5
6205; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
6206; AVX2-FCP-NEXT:    vpor %xmm5, %xmm0, %xmm0
6207; AVX2-FCP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm7, %ymm15
6208; AVX2-FCP-NEXT:    vpblendvb %ymm12, %ymm13, %ymm14, %ymm0
6209; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [1,6,11,128,128,128,128,4,9,14,128,128,128,u,u,u]
6210; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm0, %xmm5
6211; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm0
6212; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [128,128,128,0,5,10,15,128,128,128,3,8,13,u,u,u]
6213; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
6214; AVX2-FCP-NEXT:    vpor %xmm5, %xmm0, %xmm0
6215; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [1,6,11,0,5,10,15,4,9,14,0,0,0,2,7,12,1,6,11,0,5,10,15,4,9,14,0,0,0,2,7,12]
6216; AVX2-FCP-NEXT:    # ymm5 = mem[0,1,0,1]
6217; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
6218; AVX2-FCP-NEXT:    vpshufb %ymm5, %ymm8, %ymm8
6219; AVX2-FCP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm8, %ymm8
6220; AVX2-FCP-NEXT:    vpshufb %ymm5, %ymm11, %ymm0
6221; AVX2-FCP-NEXT:    vpblendvb %ymm12, %ymm3, %ymm1, %ymm5
6222; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm5, %xmm6
6223; AVX2-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm5
6224; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
6225; AVX2-FCP-NEXT:    vpor %xmm6, %xmm4, %xmm4
6226; AVX2-FCP-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm11
6227; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
6228; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm14, %ymm13, %ymm0
6229; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm4
6230; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [128,128,128,1,6,11,128,128,128,128,4,9,14,u,u,u]
6231; AVX2-FCP-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
6232; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [2,7,12,128,128,128,0,5,10,15,128,128,128,u,u,u]
6233; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
6234; AVX2-FCP-NEXT:    vpor %xmm4, %xmm0, %xmm0
6235; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [2,7,12,1,6,11,0,5,10,15,0,0,0,3,8,13,2,7,12,1,6,11,0,5,10,15,0,0,0,3,8,13]
6236; AVX2-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
6237; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
6238; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm12, %ymm12
6239; AVX2-FCP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm12, %ymm0
6240; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6241; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6242; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
6243; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm1, %ymm3, %ymm4
6244; AVX2-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm12
6245; AVX2-FCP-NEXT:    vpshufb %xmm5, %xmm12, %xmm5
6246; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
6247; AVX2-FCP-NEXT:    vpor %xmm5, %xmm4, %xmm4
6248; AVX2-FCP-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
6249; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6250; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535]
6251; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm14, %ymm13, %ymm0
6252; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [3,8,13,128,128,128,1,6,11,128,128,128,128,u,u,u]
6253; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm0, %xmm5
6254; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm0
6255; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [128,128,128,2,7,12,128,128,128,0,5,10,15,u,u,u]
6256; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
6257; AVX2-FCP-NEXT:    vpor %xmm5, %xmm0, %xmm0
6258; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [3,8,13,2,7,12,1,6,11,0,0,0,0,4,9,14,3,8,13,2,7,12,1,6,11,0,0,0,0,4,9,14]
6259; AVX2-FCP-NEXT:    # ymm5 = mem[0,1,0,1]
6260; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
6261; AVX2-FCP-NEXT:    vpshufb %ymm5, %ymm12, %ymm12
6262; AVX2-FCP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm12, %ymm0
6263; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6264; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6265; AVX2-FCP-NEXT:    vpshufb %ymm5, %ymm0, %ymm0
6266; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm1, %ymm3, %ymm5
6267; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
6268; AVX2-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm5
6269; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm5, %xmm5
6270; AVX2-FCP-NEXT:    vpor %xmm4, %xmm5, %xmm4
6271; AVX2-FCP-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm6
6272; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm4 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
6273; AVX2-FCP-NEXT:    vpblendvb %ymm4, %ymm10, %ymm9, %ymm0
6274; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
6275; AVX2-FCP-NEXT:    vpblendvb %ymm4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
6276; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
6277; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255]
6278; AVX2-FCP-NEXT:    # ymm5 = mem[0,1,0,1]
6279; AVX2-FCP-NEXT:    vpblendvb %ymm5, %ymm0, %ymm4, %ymm10
6280; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm2[2,3,0,1]
6281; AVX2-FCP-NEXT:    vpblendvb %ymm5, %ymm2, %ymm0, %ymm0
6282; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6283; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm4 = [0,0,128,128,128,1,6,11,0,0,128,128,128,1,6,11]
6284; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0]
6285; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm14, %ymm13, %ymm13
6286; AVX2-FCP-NEXT:    vmovdqa 304(%rdi), %xmm2
6287; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm1, %ymm3, %ymm9
6288; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm2, %xmm0
6289; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm12 = [0,0,2,7,12,128,128,128,0,0,2,7,12,128,128,128]
6290; AVX2-FCP-NEXT:    vmovdqa 288(%rdi), %xmm1
6291; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm1, %xmm3
6292; AVX2-FCP-NEXT:    vpor %xmm0, %xmm3, %xmm0
6293; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
6294; AVX2-FCP-NEXT:    vmovdqu (%rsp), %ymm3 # 32-byte Reload
6295; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7],ymm3[8,9,10,11,12],ymm0[13,14,15]
6296; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
6297; AVX2-FCP-NEXT:    vmovdqu %ymm0, (%rsp) # 32-byte Spill
6298; AVX2-FCP-NEXT:    vmovdqa 144(%rdi), %xmm3
6299; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm3, %xmm4
6300; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %xmm5
6301; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm5, %xmm12
6302; AVX2-FCP-NEXT:    vpor %xmm4, %xmm12, %xmm4
6303; AVX2-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
6304; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm4 = ymm15[0,1,2,3,4],ymm4[5,6,7],ymm15[8,9,10,11,12],ymm4[13,14,15]
6305; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm4[4,5,6,7]
6306; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6307; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm12 = [0,0,128,128,128,2,7,12,0,0,128,128,128,2,7,12]
6308; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm2, %xmm7
6309; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm14 = [0,0,3,8,13,128,128,128,0,0,3,8,13,128,128,128]
6310; AVX2-FCP-NEXT:    vpshufb %xmm14, %xmm1, %xmm15
6311; AVX2-FCP-NEXT:    vpor %xmm7, %xmm15, %xmm7
6312; AVX2-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
6313; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7],ymm8[8,9,10,11,12],ymm7[13,14,15]
6314; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7]
6315; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6316; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm3, %xmm8
6317; AVX2-FCP-NEXT:    vpshufb %xmm14, %xmm5, %xmm12
6318; AVX2-FCP-NEXT:    vpor %xmm8, %xmm12, %xmm8
6319; AVX2-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
6320; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5,6,7],ymm11[8,9,10,11,12],ymm8[13,14,15]
6321; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7]
6322; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm12 = [0,128,128,128,128,4,9,14,0,128,128,128,128,4,9,14]
6323; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm2, %xmm11
6324; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm15 = [0,0,5,10,15,128,128,128,0,0,5,10,15,128,128,128]
6325; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm1, %xmm14
6326; AVX2-FCP-NEXT:    vpor %xmm11, %xmm14, %xmm11
6327; AVX2-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
6328; AVX2-FCP-NEXT:    vpmovsxwq {{.*#+}} ymm14 = [18446744073709551615,18446744073709551615,18446744073709551615,255]
6329; AVX2-FCP-NEXT:    vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
6330; AVX2-FCP-NEXT:    vpshufb %xmm12, %xmm3, %xmm12
6331; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm5, %xmm15
6332; AVX2-FCP-NEXT:    vpor %xmm12, %xmm15, %xmm12
6333; AVX2-FCP-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
6334; AVX2-FCP-NEXT:    vpblendvb %ymm14, %ymm6, %ymm12, %ymm12
6335; AVX2-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm15
6336; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [128,128,128,3,8,13,128,128,128,1,6,11,u,u,u,u]
6337; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm15, %xmm15
6338; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [4,9,14,128,128,128,2,7,12,128,128,128,u,u,u,u]
6339; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm13, %xmm13
6340; AVX2-FCP-NEXT:    vpor %xmm15, %xmm13, %xmm13
6341; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [4,9,14,3,8,13,2,7,12,0,0,0,0,5,10,15,4,9,14,3,8,13,2,7,12,0,0,0,0,5,10,15]
6342; AVX2-FCP-NEXT:    # ymm15 = mem[0,1,0,1]
6343; AVX2-FCP-NEXT:    vpshufb %ymm15, %ymm10, %ymm10
6344; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm13[0,1,2],ymm10[3,4,5,6,7]
6345; AVX2-FCP-NEXT:    vmovdqa 288(%rdi), %ymm13
6346; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15]
6347; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm13, %ymm13
6348; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [0,5,0,5,0,5,0,5]
6349; AVX2-FCP-NEXT:    vpermd %ymm13, %ymm0, %ymm13
6350; AVX2-FCP-NEXT:    vpblendvb %ymm14, %ymm10, %ymm13, %ymm10
6351; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
6352; AVX2-FCP-NEXT:    vpshufb %ymm15, %ymm13, %ymm13
6353; AVX2-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm15
6354; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm15, %xmm6
6355; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm9, %xmm7
6356; AVX2-FCP-NEXT:    vpor %xmm6, %xmm7, %xmm6
6357; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3,4,5,6,7]
6358; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %ymm7
6359; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm7, %ymm4
6360; AVX2-FCP-NEXT:    vpermd %ymm4, %ymm0, %ymm0
6361; AVX2-FCP-NEXT:    vpblendvb %ymm14, %ymm6, %ymm0, %ymm0
6362; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm4 = [0,0,128,128,128,3,8,13,0,0,128,128,128,3,8,13]
6363; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
6364; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm6 = [0,0,4,9,14,128,128,128,0,0,4,9,14,128,128,128]
6365; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
6366; AVX2-FCP-NEXT:    vpor %xmm2, %xmm1, %xmm1
6367; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
6368; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
6369; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7],ymm2[8,9,10,11,12],ymm1[13,14,15]
6370; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
6371; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm3, %xmm2
6372; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm5, %xmm3
6373; AVX2-FCP-NEXT:    vpor %xmm2, %xmm3, %xmm2
6374; AVX2-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
6375; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6376; AVX2-FCP-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7],ymm3[8,9,10,11,12],ymm2[13,14,15]
6377; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
6378; AVX2-FCP-NEXT:    vmovups (%rsp), %ymm3 # 32-byte Reload
6379; AVX2-FCP-NEXT:    vmovaps %ymm3, 32(%rsi)
6380; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6381; AVX2-FCP-NEXT:    vmovaps %ymm3, (%rsi)
6382; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6383; AVX2-FCP-NEXT:    vmovaps %ymm3, 32(%rdx)
6384; AVX2-FCP-NEXT:    vmovdqa %ymm8, (%rdx)
6385; AVX2-FCP-NEXT:    vmovdqa %ymm1, 32(%rcx)
6386; AVX2-FCP-NEXT:    vmovdqa %ymm2, (%rcx)
6387; AVX2-FCP-NEXT:    vmovdqa %ymm11, 32(%r8)
6388; AVX2-FCP-NEXT:    vmovdqa %ymm12, (%r8)
6389; AVX2-FCP-NEXT:    vmovdqa %ymm10, 32(%r9)
6390; AVX2-FCP-NEXT:    vmovdqa %ymm0, (%r9)
6391; AVX2-FCP-NEXT:    addq $136, %rsp
6392; AVX2-FCP-NEXT:    vzeroupper
6393; AVX2-FCP-NEXT:    retq
6394;
6395; AVX512-LABEL: load_i8_stride5_vf64:
6396; AVX512:       # %bb.0:
6397; AVX512-NEXT:    vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535]
6398; AVX512-NEXT:    vmovdqa64 (%rdi), %ymm24
6399; AVX512-NEXT:    vmovdqa64 32(%rdi), %ymm25
6400; AVX512-NEXT:    vmovdqa64 64(%rdi), %ymm22
6401; AVX512-NEXT:    vmovdqa64 96(%rdi), %ymm23
6402; AVX512-NEXT:    vmovdqa %ymm5, %ymm4
6403; AVX512-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23))
6404; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1]
6405; AVX512-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm4))
6406; AVX512-NEXT:    vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128]
6407; AVX512-NEXT:    vpshufb %ymm6, %ymm7, %ymm7
6408; AVX512-NEXT:    vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535]
6409; AVX512-NEXT:    vmovdqa %ymm4, %ymm8
6410; AVX512-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm25 ^ (ymm8 & (ymm24 ^ ymm25))
6411; AVX512-NEXT:    vextracti128 $1, %ymm8, %xmm9
6412; AVX512-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12,u,u,u]
6413; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13],zero,zero,zero,xmm8[u,u,u]
6414; AVX512-NEXT:    vpor %xmm9, %xmm8, %xmm10
6415; AVX512-NEXT:    vmovdqa64 {{.*#+}} ymm20 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255]
6416; AVX512-NEXT:    vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm20) | ymm7
6417; AVX512-NEXT:    vmovdqa64 192(%rdi), %ymm26
6418; AVX512-NEXT:    vmovdqa 224(%rdi), %ymm8
6419; AVX512-NEXT:    vmovdqa %ymm4, %ymm11
6420; AVX512-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm26 ^ (ymm11 & (ymm8 ^ ymm26))
6421; AVX512-NEXT:    vmovdqa 208(%rdi), %xmm9
6422; AVX512-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm9 ^ (mem & (ymm11 ^ ymm9))
6423; AVX512-NEXT:    vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,ymm11[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u]
6424; AVX512-NEXT:    vmovdqa 160(%rdi), %ymm12
6425; AVX512-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u]
6426; AVX512-NEXT:    vpbroadcastq {{.*#+}} ymm17 = [0,5,0,5,0,5,0,5]
6427; AVX512-NEXT:    vpermd %ymm12, %ymm17, %ymm15
6428; AVX512-NEXT:    vmovdqa64 {{.*#+}} ymm16 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
6429; AVX512-NEXT:    vpternlogq {{.*#+}} ymm15 = (ymm15 & ~ymm16) | ymm11
6430; AVX512-NEXT:    vmovdqa 144(%rdi), %xmm12
6431; AVX512-NEXT:    vpshufb %xmm6, %xmm12, %xmm6
6432; AVX512-NEXT:    vmovdqa 128(%rdi), %xmm13
6433; AVX512-NEXT:    vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
6434; AVX512-NEXT:    vpor %xmm6, %xmm11, %xmm6
6435; AVX512-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
6436; AVX512-NEXT:    vinserti64x4 $1, %ymm15, %zmm6, %zmm6
6437; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
6438; AVX512-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm21 & (zmm6 ^ zmm10))
6439; AVX512-NEXT:    vmovdqa 256(%rdi), %ymm14
6440; AVX512-NEXT:    vmovdqa 288(%rdi), %ymm11
6441; AVX512-NEXT:    vmovdqa %ymm5, %ymm10
6442; AVX512-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm14 ^ (ymm10 & (ymm11 ^ ymm14))
6443; AVX512-NEXT:    vextracti128 $1, %ymm10, %xmm0
6444; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11]
6445; AVX512-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,4,9,14],zero,zero,zero,xmm10[2,7,12],zero,zero,zero
6446; AVX512-NEXT:    vpor %xmm0, %xmm10, %xmm0
6447; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
6448; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7]
6449; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm6, %zmm19
6450; AVX512-NEXT:    vmovdqa %ymm4, %ymm0
6451; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11))
6452; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm6
6453; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12]
6454; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero
6455; AVX512-NEXT:    vpor %xmm6, %xmm0, %xmm0
6456; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
6457; AVX512-NEXT:    vmovdqa %ymm5, %ymm6
6458; AVX512-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm26 ^ (ymm6 & (ymm8 ^ ymm26))
6459; AVX512-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm9 ^ (mem & (ymm6 ^ ymm9))
6460; AVX512-NEXT:    vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,ymm6[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u]
6461; AVX512-NEXT:    vmovdqa 160(%rdi), %xmm15
6462; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u]
6463; AVX512-NEXT:    vmovdqa 176(%rdi), %xmm6
6464; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u]
6465; AVX512-NEXT:    vpor %xmm1, %xmm3, %xmm1
6466; AVX512-NEXT:    vpternlogq {{.*#+}} ymm1 = (ymm1 & ~ymm16) | ymm10
6467; AVX512-NEXT:    vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0]
6468; AVX512-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0))
6469; AVX512-NEXT:    vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535]
6470; AVX512-NEXT:    vmovdqa %ymm10, %ymm0
6471; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22))
6472; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
6473; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm0))
6474; AVX512-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128]
6475; AVX512-NEXT:    vpshufb %ymm0, %ymm3, %ymm3
6476; AVX512-NEXT:    vmovdqa %ymm5, %ymm2
6477; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25))
6478; AVX512-NEXT:    vpshufb {{.*#+}} xmm7 = xmm2[1,6,11],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[u,u,u]
6479; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm2
6480; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13,u,u,u]
6481; AVX512-NEXT:    vpor %xmm7, %xmm2, %xmm2
6482; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm20) | ymm3
6483; AVX512-NEXT:    vpshufb %xmm0, %xmm12, %xmm0
6484; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
6485; AVX512-NEXT:    vpor %xmm0, %xmm3, %xmm0
6486; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
6487; AVX512-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm2))
6488; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm18
6489; AVX512-NEXT:    vmovdqa %ymm5, %ymm0
6490; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11))
6491; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,1,6,11],zero,zero,zero,zero,xmm0[4,9,14],zero,zero,zero
6492; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
6493; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13]
6494; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
6495; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
6496; AVX512-NEXT:    vmovdqa %ymm4, %ymm1
6497; AVX512-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8))
6498; AVX512-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9))
6499; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u]
6500; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u]
6501; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm15[2,7,12],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u]
6502; AVX512-NEXT:    vpor %xmm2, %xmm3, %xmm2
6503; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7]
6504; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
6505; AVX512-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0))
6506; AVX512-NEXT:    vmovdqa %ymm4, %ymm0
6507; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22))
6508; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
6509; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0))
6510; AVX512-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128]
6511; AVX512-NEXT:    vpshufb %ymm0, %ymm2, %ymm2
6512; AVX512-NEXT:    vmovdqa %ymm10, %ymm3
6513; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24))
6514; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm7
6515; AVX512-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u]
6516; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15],zero,zero,zero,xmm3[u,u,u]
6517; AVX512-NEXT:    vpor %xmm7, %xmm3, %xmm3
6518; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm20) | ymm2
6519; AVX512-NEXT:    vpshufb %xmm0, %xmm12, %xmm0
6520; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
6521; AVX512-NEXT:    vpor %xmm0, %xmm2, %xmm0
6522; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
6523; AVX512-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm3))
6524; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm20
6525; AVX512-NEXT:    vmovdqa %ymm10, %ymm0
6526; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm11 ^ ymm14))
6527; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
6528; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14]
6529; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,2,7,12],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero
6530; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
6531; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
6532; AVX512-NEXT:    vmovdqa %ymm5, %ymm1
6533; AVX512-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8))
6534; AVX512-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9))
6535; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u]
6536; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u]
6537; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm15[3,8,13],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u]
6538; AVX512-NEXT:    vpor %xmm2, %xmm3, %xmm2
6539; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7]
6540; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
6541; AVX512-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0))
6542; AVX512-NEXT:    vmovdqa %ymm5, %ymm0
6543; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22))
6544; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
6545; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0))
6546; AVX512-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
6547; AVX512-NEXT:    vpshufb %ymm0, %ymm2, %ymm2
6548; AVX512-NEXT:    vmovdqa %ymm4, %ymm3
6549; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24))
6550; AVX512-NEXT:    vpshufb {{.*#+}} xmm7 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u]
6551; AVX512-NEXT:    vextracti128 $1, %ymm3, %xmm3
6552; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15,u,u,u]
6553; AVX512-NEXT:    vpor %xmm7, %xmm3, %xmm3
6554; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = (ymm3 & mem) | ymm2
6555; AVX512-NEXT:    vpshufb %xmm0, %xmm12, %xmm0
6556; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
6557; AVX512-NEXT:    vpor %xmm0, %xmm2, %xmm0
6558; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
6559; AVX512-NEXT:    vpmovsxwq {{.*#+}} zmm2 = [0,0,0,18446744073709551360,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
6560; AVX512-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm3 ^ (zmm2 & (zmm0 ^ zmm3))
6561; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
6562; AVX512-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm14 ^ (ymm4 & (ymm11 ^ ymm14))
6563; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,3,8,13],zero,zero,zero,xmm11[1,6,11],zero,zero,zero,zero
6564; AVX512-NEXT:    vextracti128 $1, %ymm11, %xmm3
6565; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15]
6566; AVX512-NEXT:    vpor %xmm1, %xmm3, %xmm1
6567; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
6568; AVX512-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm26 ^ (ymm10 & (ymm8 ^ ymm26))
6569; AVX512-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm9 ^ (mem & (ymm10 ^ ymm9))
6570; AVX512-NEXT:    vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
6571; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u,u,u,u,u]
6572; AVX512-NEXT:    vpshufb {{.*#+}} xmm7 = xmm15[4,9,14],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u]
6573; AVX512-NEXT:    vpor %xmm6, %xmm7, %xmm6
6574; AVX512-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm3[3,4,5,6,7]
6575; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
6576; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm16 & (ymm3 ^ ymm1))
6577; AVX512-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm24 ^ (ymm5 & (ymm25 ^ ymm24))
6578; AVX512-NEXT:    vextracti128 $1, %ymm5, %xmm1
6579; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u]
6580; AVX512-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[4,9,14],zero,zero,zero,xmm5[2,7,12],zero,zero,zero,xmm5[u,u,u,u]
6581; AVX512-NEXT:    vpor %xmm1, %xmm5, %xmm1
6582; AVX512-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23))
6583; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
6584; AVX512-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm4))
6585; AVX512-NEXT:    vpshufb {{.*#+}} ymm4 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
6586; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7]
6587; AVX512-NEXT:    vmovdqa 128(%rdi), %ymm4
6588; AVX512-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
6589; AVX512-NEXT:    vpermd %ymm4, %ymm17, %ymm4
6590; AVX512-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm2 & (zmm4 ^ zmm1))
6591; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm1
6592; AVX512-NEXT:    vmovdqa64 %zmm19, (%rsi)
6593; AVX512-NEXT:    vmovdqa64 %zmm18, (%rdx)
6594; AVX512-NEXT:    vmovdqa64 %zmm20, (%rcx)
6595; AVX512-NEXT:    vmovdqa64 %zmm0, (%r8)
6596; AVX512-NEXT:    vmovdqa64 %zmm1, (%r9)
6597; AVX512-NEXT:    vzeroupper
6598; AVX512-NEXT:    retq
6599;
6600; AVX512-FCP-LABEL: load_i8_stride5_vf64:
6601; AVX512-FCP:       # %bb.0:
6602; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535]
6603; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %ymm24
6604; AVX512-FCP-NEXT:    vmovdqa64 32(%rdi), %ymm25
6605; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %ymm22
6606; AVX512-FCP-NEXT:    vmovdqa64 96(%rdi), %ymm23
6607; AVX512-FCP-NEXT:    vmovdqa %ymm5, %ymm4
6608; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23))
6609; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1]
6610; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm4))
6611; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128]
6612; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm7, %ymm7
6613; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535]
6614; AVX512-FCP-NEXT:    vmovdqa %ymm4, %ymm8
6615; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm25 ^ (ymm8 & (ymm24 ^ ymm25))
6616; AVX512-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm9
6617; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12,u,u,u]
6618; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13],zero,zero,zero,xmm8[u,u,u]
6619; AVX512-FCP-NEXT:    vpor %xmm9, %xmm8, %xmm10
6620; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm20 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255]
6621; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm20) | ymm7
6622; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm26
6623; AVX512-FCP-NEXT:    vmovdqa 224(%rdi), %ymm8
6624; AVX512-FCP-NEXT:    vmovdqa %ymm4, %ymm11
6625; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm26 ^ (ymm11 & (ymm8 ^ ymm26))
6626; AVX512-FCP-NEXT:    vmovdqa 208(%rdi), %xmm9
6627; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm9 ^ (mem & (ymm11 ^ ymm9))
6628; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,ymm11[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u]
6629; AVX512-FCP-NEXT:    vmovdqa 160(%rdi), %ymm12
6630; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u]
6631; AVX512-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm17 = [0,5,0,5,0,5,0,5]
6632; AVX512-FCP-NEXT:    vpermd %ymm12, %ymm17, %ymm15
6633; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm16 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
6634; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm15 = (ymm15 & ~ymm16) | ymm11
6635; AVX512-FCP-NEXT:    vmovdqa 144(%rdi), %xmm12
6636; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm12, %xmm6
6637; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %xmm13
6638; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
6639; AVX512-FCP-NEXT:    vpor %xmm6, %xmm11, %xmm6
6640; AVX512-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
6641; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm6, %zmm6
6642; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
6643; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm21 & (zmm6 ^ zmm10))
6644; AVX512-FCP-NEXT:    vmovdqa 256(%rdi), %ymm14
6645; AVX512-FCP-NEXT:    vmovdqa 288(%rdi), %ymm11
6646; AVX512-FCP-NEXT:    vmovdqa %ymm5, %ymm10
6647; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm14 ^ (ymm10 & (ymm11 ^ ymm14))
6648; AVX512-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm0
6649; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11]
6650; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,4,9,14],zero,zero,zero,xmm10[2,7,12],zero,zero,zero
6651; AVX512-FCP-NEXT:    vpor %xmm0, %xmm10, %xmm0
6652; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
6653; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7]
6654; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm6, %zmm19
6655; AVX512-FCP-NEXT:    vmovdqa %ymm4, %ymm0
6656; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11))
6657; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm6
6658; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12]
6659; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero
6660; AVX512-FCP-NEXT:    vpor %xmm6, %xmm0, %xmm0
6661; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
6662; AVX512-FCP-NEXT:    vmovdqa %ymm5, %ymm6
6663; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm26 ^ (ymm6 & (ymm8 ^ ymm26))
6664; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm9 ^ (mem & (ymm6 ^ ymm9))
6665; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,ymm6[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u]
6666; AVX512-FCP-NEXT:    vmovdqa 160(%rdi), %xmm15
6667; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u]
6668; AVX512-FCP-NEXT:    vmovdqa 176(%rdi), %xmm6
6669; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u]
6670; AVX512-FCP-NEXT:    vpor %xmm1, %xmm3, %xmm1
6671; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = (ymm1 & ~ymm16) | ymm10
6672; AVX512-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0]
6673; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0))
6674; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535]
6675; AVX512-FCP-NEXT:    vmovdqa %ymm10, %ymm0
6676; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22))
6677; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
6678; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm0))
6679; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128]
6680; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm3
6681; AVX512-FCP-NEXT:    vmovdqa %ymm5, %ymm2
6682; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25))
6683; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm2[1,6,11],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[u,u,u]
6684; AVX512-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm2
6685; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13,u,u,u]
6686; AVX512-FCP-NEXT:    vpor %xmm7, %xmm2, %xmm2
6687; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm20) | ymm3
6688; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm12, %xmm0
6689; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
6690; AVX512-FCP-NEXT:    vpor %xmm0, %xmm3, %xmm0
6691; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
6692; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm2))
6693; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm18
6694; AVX512-FCP-NEXT:    vmovdqa %ymm5, %ymm0
6695; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11))
6696; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,1,6,11],zero,zero,zero,zero,xmm0[4,9,14],zero,zero,zero
6697; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm0
6698; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13]
6699; AVX512-FCP-NEXT:    vpor %xmm1, %xmm0, %xmm0
6700; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
6701; AVX512-FCP-NEXT:    vmovdqa %ymm4, %ymm1
6702; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8))
6703; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9))
6704; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u]
6705; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u]
6706; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm15[2,7,12],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u]
6707; AVX512-FCP-NEXT:    vpor %xmm2, %xmm3, %xmm2
6708; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7]
6709; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
6710; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0))
6711; AVX512-FCP-NEXT:    vmovdqa %ymm4, %ymm0
6712; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22))
6713; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
6714; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0))
6715; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128]
6716; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm2
6717; AVX512-FCP-NEXT:    vmovdqa %ymm10, %ymm3
6718; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24))
6719; AVX512-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm7
6720; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u]
6721; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15],zero,zero,zero,xmm3[u,u,u]
6722; AVX512-FCP-NEXT:    vpor %xmm7, %xmm3, %xmm3
6723; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm20) | ymm2
6724; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm12, %xmm0
6725; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
6726; AVX512-FCP-NEXT:    vpor %xmm0, %xmm2, %xmm0
6727; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
6728; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm3))
6729; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm20
6730; AVX512-FCP-NEXT:    vmovdqa %ymm10, %ymm0
6731; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm11 ^ ymm14))
6732; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
6733; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14]
6734; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,2,7,12],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero
6735; AVX512-FCP-NEXT:    vpor %xmm1, %xmm0, %xmm0
6736; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
6737; AVX512-FCP-NEXT:    vmovdqa %ymm5, %ymm1
6738; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8))
6739; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9))
6740; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u]
6741; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u]
6742; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm15[3,8,13],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u]
6743; AVX512-FCP-NEXT:    vpor %xmm2, %xmm3, %xmm2
6744; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7]
6745; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
6746; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0))
6747; AVX512-FCP-NEXT:    vmovdqa %ymm5, %ymm0
6748; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22))
6749; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
6750; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0))
6751; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
6752; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm2
6753; AVX512-FCP-NEXT:    vmovdqa %ymm4, %ymm3
6754; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24))
6755; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u]
6756; AVX512-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm3
6757; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15,u,u,u]
6758; AVX512-FCP-NEXT:    vpor %xmm7, %xmm3, %xmm3
6759; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = (ymm3 & mem) | ymm2
6760; AVX512-FCP-NEXT:    vpshufb %xmm0, %xmm12, %xmm0
6761; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
6762; AVX512-FCP-NEXT:    vpor %xmm0, %xmm2, %xmm0
6763; AVX512-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
6764; AVX512-FCP-NEXT:    vpmovsxwq {{.*#+}} zmm2 = [0,0,0,18446744073709551360,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
6765; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm3 ^ (zmm2 & (zmm0 ^ zmm3))
6766; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
6767; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm14 ^ (ymm4 & (ymm11 ^ ymm14))
6768; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,3,8,13],zero,zero,zero,xmm11[1,6,11],zero,zero,zero,zero
6769; AVX512-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm3
6770; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15]
6771; AVX512-FCP-NEXT:    vpor %xmm1, %xmm3, %xmm1
6772; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
6773; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm26 ^ (ymm10 & (ymm8 ^ ymm26))
6774; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm9 ^ (mem & (ymm10 ^ ymm9))
6775; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
6776; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u,u,u,u,u]
6777; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm15[4,9,14],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u]
6778; AVX512-FCP-NEXT:    vpor %xmm6, %xmm7, %xmm6
6779; AVX512-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm3[3,4,5,6,7]
6780; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
6781; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm16 & (ymm3 ^ ymm1))
6782; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm24 ^ (ymm5 & (ymm25 ^ ymm24))
6783; AVX512-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm1
6784; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u]
6785; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[4,9,14],zero,zero,zero,xmm5[2,7,12],zero,zero,zero,xmm5[u,u,u,u]
6786; AVX512-FCP-NEXT:    vpor %xmm1, %xmm5, %xmm1
6787; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23))
6788; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
6789; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm4))
6790; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
6791; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7]
6792; AVX512-FCP-NEXT:    vmovdqa 128(%rdi), %ymm4
6793; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
6794; AVX512-FCP-NEXT:    vpermd %ymm4, %ymm17, %ymm4
6795; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm2 & (zmm4 ^ zmm1))
6796; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm1
6797; AVX512-FCP-NEXT:    vmovdqa64 %zmm19, (%rsi)
6798; AVX512-FCP-NEXT:    vmovdqa64 %zmm18, (%rdx)
6799; AVX512-FCP-NEXT:    vmovdqa64 %zmm20, (%rcx)
6800; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, (%r8)
6801; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, (%r9)
6802; AVX512-FCP-NEXT:    vzeroupper
6803; AVX512-FCP-NEXT:    retq
6804;
6805; AVX512DQ-LABEL: load_i8_stride5_vf64:
6806; AVX512DQ:       # %bb.0:
6807; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535]
6808; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %ymm24
6809; AVX512DQ-NEXT:    vmovdqa64 32(%rdi), %ymm25
6810; AVX512DQ-NEXT:    vmovdqa64 64(%rdi), %ymm22
6811; AVX512DQ-NEXT:    vmovdqa64 96(%rdi), %ymm23
6812; AVX512DQ-NEXT:    vmovdqa %ymm5, %ymm4
6813; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23))
6814; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1]
6815; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm4))
6816; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128]
6817; AVX512DQ-NEXT:    vpshufb %ymm6, %ymm7, %ymm7
6818; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535]
6819; AVX512DQ-NEXT:    vmovdqa %ymm4, %ymm8
6820; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm25 ^ (ymm8 & (ymm24 ^ ymm25))
6821; AVX512DQ-NEXT:    vextracti128 $1, %ymm8, %xmm9
6822; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12,u,u,u]
6823; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13],zero,zero,zero,xmm8[u,u,u]
6824; AVX512DQ-NEXT:    vpor %xmm9, %xmm8, %xmm10
6825; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} ymm20 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255]
6826; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm20) | ymm7
6827; AVX512DQ-NEXT:    vmovdqa64 192(%rdi), %ymm26
6828; AVX512DQ-NEXT:    vmovdqa 224(%rdi), %ymm8
6829; AVX512DQ-NEXT:    vmovdqa %ymm4, %ymm11
6830; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm26 ^ (ymm11 & (ymm8 ^ ymm26))
6831; AVX512DQ-NEXT:    vmovdqa 208(%rdi), %xmm9
6832; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm9 ^ (mem & (ymm11 ^ ymm9))
6833; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,ymm11[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u]
6834; AVX512DQ-NEXT:    vmovdqa 160(%rdi), %ymm12
6835; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u]
6836; AVX512DQ-NEXT:    vpbroadcastq {{.*#+}} ymm17 = [0,5,0,5,0,5,0,5]
6837; AVX512DQ-NEXT:    vpermd %ymm12, %ymm17, %ymm15
6838; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} ymm16 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
6839; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm15 = (ymm15 & ~ymm16) | ymm11
6840; AVX512DQ-NEXT:    vmovdqa 144(%rdi), %xmm12
6841; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm12, %xmm6
6842; AVX512DQ-NEXT:    vmovdqa 128(%rdi), %xmm13
6843; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
6844; AVX512DQ-NEXT:    vpor %xmm6, %xmm11, %xmm6
6845; AVX512DQ-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
6846; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm15, %zmm6, %zmm6
6847; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
6848; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm21 & (zmm6 ^ zmm10))
6849; AVX512DQ-NEXT:    vmovdqa 256(%rdi), %ymm14
6850; AVX512DQ-NEXT:    vmovdqa 288(%rdi), %ymm11
6851; AVX512DQ-NEXT:    vmovdqa %ymm5, %ymm10
6852; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm14 ^ (ymm10 & (ymm11 ^ ymm14))
6853; AVX512DQ-NEXT:    vextracti128 $1, %ymm10, %xmm0
6854; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11]
6855; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,4,9,14],zero,zero,zero,xmm10[2,7,12],zero,zero,zero
6856; AVX512DQ-NEXT:    vpor %xmm0, %xmm10, %xmm0
6857; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
6858; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7]
6859; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm6, %zmm19
6860; AVX512DQ-NEXT:    vmovdqa %ymm4, %ymm0
6861; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11))
6862; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm6
6863; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12]
6864; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero
6865; AVX512DQ-NEXT:    vpor %xmm6, %xmm0, %xmm0
6866; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
6867; AVX512DQ-NEXT:    vmovdqa %ymm5, %ymm6
6868; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm26 ^ (ymm6 & (ymm8 ^ ymm26))
6869; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm9 ^ (mem & (ymm6 ^ ymm9))
6870; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,ymm6[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u]
6871; AVX512DQ-NEXT:    vmovdqa 160(%rdi), %xmm15
6872; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u]
6873; AVX512DQ-NEXT:    vmovdqa 176(%rdi), %xmm6
6874; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u]
6875; AVX512DQ-NEXT:    vpor %xmm1, %xmm3, %xmm1
6876; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = (ymm1 & ~ymm16) | ymm10
6877; AVX512DQ-NEXT:    vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0]
6878; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0))
6879; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535]
6880; AVX512DQ-NEXT:    vmovdqa %ymm10, %ymm0
6881; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22))
6882; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
6883; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm0))
6884; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128]
6885; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm3, %ymm3
6886; AVX512DQ-NEXT:    vmovdqa %ymm5, %ymm2
6887; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25))
6888; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = xmm2[1,6,11],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[u,u,u]
6889; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm2
6890; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13,u,u,u]
6891; AVX512DQ-NEXT:    vpor %xmm7, %xmm2, %xmm2
6892; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm20) | ymm3
6893; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm12, %xmm0
6894; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
6895; AVX512DQ-NEXT:    vpor %xmm0, %xmm3, %xmm0
6896; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
6897; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm2))
6898; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm18
6899; AVX512DQ-NEXT:    vmovdqa %ymm5, %ymm0
6900; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11))
6901; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,1,6,11],zero,zero,zero,zero,xmm0[4,9,14],zero,zero,zero
6902; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
6903; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13]
6904; AVX512DQ-NEXT:    vpor %xmm1, %xmm0, %xmm0
6905; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
6906; AVX512DQ-NEXT:    vmovdqa %ymm4, %ymm1
6907; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8))
6908; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9))
6909; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u]
6910; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u]
6911; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm15[2,7,12],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u]
6912; AVX512DQ-NEXT:    vpor %xmm2, %xmm3, %xmm2
6913; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7]
6914; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
6915; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0))
6916; AVX512DQ-NEXT:    vmovdqa %ymm4, %ymm0
6917; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22))
6918; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
6919; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0))
6920; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128]
6921; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm2, %ymm2
6922; AVX512DQ-NEXT:    vmovdqa %ymm10, %ymm3
6923; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24))
6924; AVX512DQ-NEXT:    vextracti128 $1, %ymm3, %xmm7
6925; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u]
6926; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15],zero,zero,zero,xmm3[u,u,u]
6927; AVX512DQ-NEXT:    vpor %xmm7, %xmm3, %xmm3
6928; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm20) | ymm2
6929; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm12, %xmm0
6930; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
6931; AVX512DQ-NEXT:    vpor %xmm0, %xmm2, %xmm0
6932; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
6933; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm3))
6934; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm20
6935; AVX512DQ-NEXT:    vmovdqa %ymm10, %ymm0
6936; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm11 ^ ymm14))
6937; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
6938; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14]
6939; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,2,7,12],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero
6940; AVX512DQ-NEXT:    vpor %xmm1, %xmm0, %xmm0
6941; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
6942; AVX512DQ-NEXT:    vmovdqa %ymm5, %ymm1
6943; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8))
6944; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9))
6945; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u]
6946; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u]
6947; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm15[3,8,13],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u]
6948; AVX512DQ-NEXT:    vpor %xmm2, %xmm3, %xmm2
6949; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7]
6950; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
6951; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0))
6952; AVX512DQ-NEXT:    vmovdqa %ymm5, %ymm0
6953; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22))
6954; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
6955; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0))
6956; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
6957; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm2, %ymm2
6958; AVX512DQ-NEXT:    vmovdqa %ymm4, %ymm3
6959; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24))
6960; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u]
6961; AVX512DQ-NEXT:    vextracti128 $1, %ymm3, %xmm3
6962; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15,u,u,u]
6963; AVX512DQ-NEXT:    vpor %xmm7, %xmm3, %xmm3
6964; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm3 = (ymm3 & mem) | ymm2
6965; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm12, %xmm0
6966; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
6967; AVX512DQ-NEXT:    vpor %xmm0, %xmm2, %xmm0
6968; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
6969; AVX512DQ-NEXT:    vpmovsxwq {{.*#+}} zmm2 = [0,0,0,18446744073709551360,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
6970; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm3 ^ (zmm2 & (zmm0 ^ zmm3))
6971; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
6972; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm14 ^ (ymm4 & (ymm11 ^ ymm14))
6973; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,3,8,13],zero,zero,zero,xmm11[1,6,11],zero,zero,zero,zero
6974; AVX512DQ-NEXT:    vextracti128 $1, %ymm11, %xmm3
6975; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15]
6976; AVX512DQ-NEXT:    vpor %xmm1, %xmm3, %xmm1
6977; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
6978; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm26 ^ (ymm10 & (ymm8 ^ ymm26))
6979; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm9 ^ (mem & (ymm10 ^ ymm9))
6980; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
6981; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u,u,u,u,u]
6982; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = xmm15[4,9,14],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u]
6983; AVX512DQ-NEXT:    vpor %xmm6, %xmm7, %xmm6
6984; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm3[3,4,5,6,7]
6985; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
6986; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm16 & (ymm3 ^ ymm1))
6987; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm24 ^ (ymm5 & (ymm25 ^ ymm24))
6988; AVX512DQ-NEXT:    vextracti128 $1, %ymm5, %xmm1
6989; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u]
6990; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[4,9,14],zero,zero,zero,xmm5[2,7,12],zero,zero,zero,xmm5[u,u,u,u]
6991; AVX512DQ-NEXT:    vpor %xmm1, %xmm5, %xmm1
6992; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23))
6993; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
6994; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm4))
6995; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm4 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
6996; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7]
6997; AVX512DQ-NEXT:    vmovdqa 128(%rdi), %ymm4
6998; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
6999; AVX512DQ-NEXT:    vpermd %ymm4, %ymm17, %ymm4
7000; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm2 & (zmm4 ^ zmm1))
7001; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm1
7002; AVX512DQ-NEXT:    vmovdqa64 %zmm19, (%rsi)
7003; AVX512DQ-NEXT:    vmovdqa64 %zmm18, (%rdx)
7004; AVX512DQ-NEXT:    vmovdqa64 %zmm20, (%rcx)
7005; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%r8)
7006; AVX512DQ-NEXT:    vmovdqa64 %zmm1, (%r9)
7007; AVX512DQ-NEXT:    vzeroupper
7008; AVX512DQ-NEXT:    retq
7009;
7010; AVX512DQ-FCP-LABEL: load_i8_stride5_vf64:
7011; AVX512DQ-FCP:       # %bb.0:
7012; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535]
7013; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %ymm24
7014; AVX512DQ-FCP-NEXT:    vmovdqa64 32(%rdi), %ymm25
7015; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %ymm22
7016; AVX512DQ-FCP-NEXT:    vmovdqa64 96(%rdi), %ymm23
7017; AVX512DQ-FCP-NEXT:    vmovdqa %ymm5, %ymm4
7018; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23))
7019; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1]
7020; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm4))
7021; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128]
7022; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm7, %ymm7
7023; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535]
7024; AVX512DQ-FCP-NEXT:    vmovdqa %ymm4, %ymm8
7025; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm25 ^ (ymm8 & (ymm24 ^ ymm25))
7026; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm9
7027; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12,u,u,u]
7028; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13],zero,zero,zero,xmm8[u,u,u]
7029; AVX512DQ-FCP-NEXT:    vpor %xmm9, %xmm8, %xmm10
7030; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm20 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255]
7031; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm20) | ymm7
7032; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm26
7033; AVX512DQ-FCP-NEXT:    vmovdqa 224(%rdi), %ymm8
7034; AVX512DQ-FCP-NEXT:    vmovdqa %ymm4, %ymm11
7035; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm26 ^ (ymm11 & (ymm8 ^ ymm26))
7036; AVX512DQ-FCP-NEXT:    vmovdqa 208(%rdi), %xmm9
7037; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm9 ^ (mem & (ymm11 ^ ymm9))
7038; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,ymm11[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u]
7039; AVX512DQ-FCP-NEXT:    vmovdqa 160(%rdi), %ymm12
7040; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm12[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u]
7041; AVX512DQ-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm17 = [0,5,0,5,0,5,0,5]
7042; AVX512DQ-FCP-NEXT:    vpermd %ymm12, %ymm17, %ymm15
7043; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm16 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
7044; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm15 = (ymm15 & ~ymm16) | ymm11
7045; AVX512DQ-FCP-NEXT:    vmovdqa 144(%rdi), %xmm12
7046; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm12, %xmm6
7047; AVX512DQ-FCP-NEXT:    vmovdqa 128(%rdi), %xmm13
7048; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
7049; AVX512DQ-FCP-NEXT:    vpor %xmm6, %xmm11, %xmm6
7050; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm6
7051; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm6, %zmm6
7052; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
7053; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm21 & (zmm6 ^ zmm10))
7054; AVX512DQ-FCP-NEXT:    vmovdqa 256(%rdi), %ymm14
7055; AVX512DQ-FCP-NEXT:    vmovdqa 288(%rdi), %ymm11
7056; AVX512DQ-FCP-NEXT:    vmovdqa %ymm5, %ymm10
7057; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm14 ^ (ymm10 & (ymm11 ^ ymm14))
7058; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm0
7059; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11]
7060; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,4,9,14],zero,zero,zero,xmm10[2,7,12],zero,zero,zero
7061; AVX512DQ-FCP-NEXT:    vpor %xmm0, %xmm10, %xmm0
7062; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
7063; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7]
7064; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm6, %zmm19
7065; AVX512DQ-FCP-NEXT:    vmovdqa %ymm4, %ymm0
7066; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11))
7067; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm6
7068; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12]
7069; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero
7070; AVX512DQ-FCP-NEXT:    vpor %xmm6, %xmm0, %xmm0
7071; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
7072; AVX512DQ-FCP-NEXT:    vmovdqa %ymm5, %ymm6
7073; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm26 ^ (ymm6 & (ymm8 ^ ymm26))
7074; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm9 ^ (mem & (ymm6 ^ ymm9))
7075; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,ymm6[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u]
7076; AVX512DQ-FCP-NEXT:    vmovdqa 160(%rdi), %xmm15
7077; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u]
7078; AVX512DQ-FCP-NEXT:    vmovdqa 176(%rdi), %xmm6
7079; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u]
7080; AVX512DQ-FCP-NEXT:    vpor %xmm1, %xmm3, %xmm1
7081; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = (ymm1 & ~ymm16) | ymm10
7082; AVX512DQ-FCP-NEXT:    vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0]
7083; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0))
7084; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535]
7085; AVX512DQ-FCP-NEXT:    vmovdqa %ymm10, %ymm0
7086; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22))
7087; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
7088; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm0))
7089; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128]
7090; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm3
7091; AVX512DQ-FCP-NEXT:    vmovdqa %ymm5, %ymm2
7092; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25))
7093; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm2[1,6,11],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[u,u,u]
7094; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm2
7095; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13,u,u,u]
7096; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm2, %xmm2
7097; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm20) | ymm3
7098; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm12, %xmm0
7099; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
7100; AVX512DQ-FCP-NEXT:    vpor %xmm0, %xmm3, %xmm0
7101; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
7102; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm2))
7103; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm18
7104; AVX512DQ-FCP-NEXT:    vmovdqa %ymm5, %ymm0
7105; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11))
7106; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,1,6,11],zero,zero,zero,zero,xmm0[4,9,14],zero,zero,zero
7107; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm0
7108; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13]
7109; AVX512DQ-FCP-NEXT:    vpor %xmm1, %xmm0, %xmm0
7110; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
7111; AVX512DQ-FCP-NEXT:    vmovdqa %ymm4, %ymm1
7112; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8))
7113; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9))
7114; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u]
7115; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u]
7116; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm15[2,7,12],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u]
7117; AVX512DQ-FCP-NEXT:    vpor %xmm2, %xmm3, %xmm2
7118; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7]
7119; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
7120; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0))
7121; AVX512DQ-FCP-NEXT:    vmovdqa %ymm4, %ymm0
7122; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22))
7123; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
7124; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0))
7125; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128]
7126; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm2
7127; AVX512DQ-FCP-NEXT:    vmovdqa %ymm10, %ymm3
7128; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24))
7129; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm7
7130; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u]
7131; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15],zero,zero,zero,xmm3[u,u,u]
7132; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm3, %xmm3
7133; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm20) | ymm2
7134; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm12, %xmm0
7135; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
7136; AVX512DQ-FCP-NEXT:    vpor %xmm0, %xmm2, %xmm0
7137; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
7138; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm3))
7139; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm20
7140; AVX512DQ-FCP-NEXT:    vmovdqa %ymm10, %ymm0
7141; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm11 ^ ymm14))
7142; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
7143; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14]
7144; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,2,7,12],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero
7145; AVX512DQ-FCP-NEXT:    vpor %xmm1, %xmm0, %xmm0
7146; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
7147; AVX512DQ-FCP-NEXT:    vmovdqa %ymm5, %ymm1
7148; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8))
7149; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9))
7150; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u]
7151; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u]
7152; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm15[3,8,13],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u]
7153; AVX512DQ-FCP-NEXT:    vpor %xmm2, %xmm3, %xmm2
7154; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7]
7155; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
7156; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0))
7157; AVX512DQ-FCP-NEXT:    vmovdqa %ymm5, %ymm0
7158; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22))
7159; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
7160; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0))
7161; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
7162; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm2
7163; AVX512DQ-FCP-NEXT:    vmovdqa %ymm4, %ymm3
7164; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24))
7165; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u]
7166; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm3
7167; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15,u,u,u]
7168; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm3, %xmm3
7169; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = (ymm3 & mem) | ymm2
7170; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm12, %xmm0
7171; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
7172; AVX512DQ-FCP-NEXT:    vpor %xmm0, %xmm2, %xmm0
7173; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
7174; AVX512DQ-FCP-NEXT:    vpmovsxwq {{.*#+}} zmm2 = [0,0,0,18446744073709551360,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
7175; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm3 ^ (zmm2 & (zmm0 ^ zmm3))
7176; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
7177; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm11 = ymm14 ^ (ymm4 & (ymm11 ^ ymm14))
7178; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,3,8,13],zero,zero,zero,xmm11[1,6,11],zero,zero,zero,zero
7179; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm3
7180; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15]
7181; AVX512DQ-FCP-NEXT:    vpor %xmm1, %xmm3, %xmm1
7182; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
7183; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm26 ^ (ymm10 & (ymm8 ^ ymm26))
7184; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm10 = ymm9 ^ (mem & (ymm10 ^ ymm9))
7185; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
7186; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u,u,u,u,u]
7187; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm15[4,9,14],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u]
7188; AVX512DQ-FCP-NEXT:    vpor %xmm6, %xmm7, %xmm6
7189; AVX512DQ-FCP-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm3[3,4,5,6,7]
7190; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
7191; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm16 & (ymm3 ^ ymm1))
7192; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm24 ^ (ymm5 & (ymm25 ^ ymm24))
7193; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm1
7194; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u]
7195; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[4,9,14],zero,zero,zero,xmm5[2,7,12],zero,zero,zero,xmm5[u,u,u,u]
7196; AVX512DQ-FCP-NEXT:    vpor %xmm1, %xmm5, %xmm1
7197; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23))
7198; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
7199; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm4))
7200; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
7201; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7]
7202; AVX512DQ-FCP-NEXT:    vmovdqa 128(%rdi), %ymm4
7203; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
7204; AVX512DQ-FCP-NEXT:    vpermd %ymm4, %ymm17, %ymm4
7205; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm2 & (zmm4 ^ zmm1))
7206; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm1
7207; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, (%rsi)
7208; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm18, (%rdx)
7209; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm20, (%rcx)
7210; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, (%r8)
7211; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, (%r9)
7212; AVX512DQ-FCP-NEXT:    vzeroupper
7213; AVX512DQ-FCP-NEXT:    retq
7214;
7215; AVX512BW-LABEL: load_i8_stride5_vf64:
7216; AVX512BW:       # %bb.0:
7217; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm3
7218; AVX512BW-NEXT:    vmovdqa 32(%rdi), %ymm2
7219; AVX512BW-NEXT:    vmovdqa 64(%rdi), %ymm0
7220; AVX512BW-NEXT:    vmovdqa 96(%rdi), %ymm1
7221; AVX512BW-NEXT:    movw $21140, %ax # imm = 0x5294
7222; AVX512BW-NEXT:    kmovd %eax, %k2
7223; AVX512BW-NEXT:    vpblendmw %ymm1, %ymm0, %ymm4 {%k2}
7224; AVX512BW-NEXT:    vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
7225; AVX512BW-NEXT:    movl $1108344832, %eax # imm = 0x42100000
7226; AVX512BW-NEXT:    kmovd %eax, %k1
7227; AVX512BW-NEXT:    vmovdqu8 %ymm5, %ymm4 {%k1}
7228; AVX512BW-NEXT:    movw $19026, %ax # imm = 0x4A52
7229; AVX512BW-NEXT:    kmovd %eax, %k1
7230; AVX512BW-NEXT:    vpblendmw %ymm2, %ymm3, %ymm5 {%k1}
7231; AVX512BW-NEXT:    vextracti128 $1, %ymm5, %xmm6
7232; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u]
7233; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u]
7234; AVX512BW-NEXT:    vpor %xmm6, %xmm5, %xmm9
7235; AVX512BW-NEXT:    movl $67100672, %eax # imm = 0x3FFE000
7236; AVX512BW-NEXT:    kmovd %eax, %k5
7237; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm9 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u]
7238; AVX512BW-NEXT:    vmovdqa 192(%rdi), %ymm6
7239; AVX512BW-NEXT:    vmovdqa 224(%rdi), %ymm5
7240; AVX512BW-NEXT:    vpblendmw %ymm6, %ymm5, %ymm4 {%k1}
7241; AVX512BW-NEXT:    vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1]
7242; AVX512BW-NEXT:    movl $4228, %eax # imm = 0x1084
7243; AVX512BW-NEXT:    kmovd %eax, %k3
7244; AVX512BW-NEXT:    vmovdqu8 %ymm7, %ymm4 {%k3}
7245; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u]
7246; AVX512BW-NEXT:    vmovdqa 160(%rdi), %ymm4
7247; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm8 = ymm4[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u]
7248; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm19 = [0,5,0,5,0,5,0,5]
7249; AVX512BW-NEXT:    vpermd %ymm8, %ymm19, %ymm8
7250; AVX512BW-NEXT:    movl $127, %eax
7251; AVX512BW-NEXT:    kmovd %eax, %k4
7252; AVX512BW-NEXT:    vmovdqu8 %ymm8, %ymm7 {%k4}
7253; AVX512BW-NEXT:    vmovdqa 144(%rdi), %xmm11
7254; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm8 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[1,6,11]
7255; AVX512BW-NEXT:    vmovdqa 128(%rdi), %xmm12
7256; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
7257; AVX512BW-NEXT:    vpor %xmm8, %xmm10, %xmm8
7258; AVX512BW-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
7259; AVX512BW-NEXT:    vinserti64x4 $1, %ymm7, %zmm8, %zmm7
7260; AVX512BW-NEXT:    vmovdqu16 %zmm7, %zmm9 {%k5}
7261; AVX512BW-NEXT:    vextracti64x4 $1, %zmm9, %ymm10
7262; AVX512BW-NEXT:    vmovdqa 256(%rdi), %ymm8
7263; AVX512BW-NEXT:    vmovdqa 288(%rdi), %ymm7
7264; AVX512BW-NEXT:    vpblendmw %ymm8, %ymm7, %ymm13 {%k2}
7265; AVX512BW-NEXT:    vextracti128 $1, %ymm13, %xmm14
7266; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u],zero,zero,zero,xmm14[3,8,13],zero,zero,zero,xmm14[1,6,11]
7267; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,4,9,14],zero,zero,zero,xmm13[2,7,12],zero,zero,zero
7268; AVX512BW-NEXT:    vpor %xmm14, %xmm13, %xmm13
7269; AVX512BW-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
7270; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5,6,7]
7271; AVX512BW-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm20
7272; AVX512BW-NEXT:    movw $10570, %ax # imm = 0x294A
7273; AVX512BW-NEXT:    kmovd %eax, %k3
7274; AVX512BW-NEXT:    vpblendmw %ymm0, %ymm1, %ymm10 {%k3}
7275; AVX512BW-NEXT:    vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1]
7276; AVX512BW-NEXT:    movl $-2078212096, %eax # imm = 0x84210000
7277; AVX512BW-NEXT:    kmovd %eax, %k6
7278; AVX512BW-NEXT:    vmovdqu8 %ymm13, %ymm10 {%k6}
7279; AVX512BW-NEXT:    vpblendmw %ymm2, %ymm3, %ymm13 {%k2}
7280; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14],zero,zero,zero,xmm13[u,u,u]
7281; AVX512BW-NEXT:    vextracti128 $1, %ymm13, %xmm13
7282; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[0,5,10,15],zero,zero,zero,xmm13[3,8,13,u,u,u]
7283; AVX512BW-NEXT:    vpor %xmm14, %xmm13, %xmm14
7284; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm14 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u]
7285; AVX512BW-NEXT:    vpblendmw %ymm6, %ymm5, %ymm10 {%k2}
7286; AVX512BW-NEXT:    vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1]
7287; AVX512BW-NEXT:    movl $8456, %eax # imm = 0x2108
7288; AVX512BW-NEXT:    kmovd %eax, %k6
7289; AVX512BW-NEXT:    vmovdqu8 %ymm13, %ymm10 {%k6}
7290; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm15 = ymm10[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u]
7291; AVX512BW-NEXT:    vmovdqa 160(%rdi), %xmm10
7292; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm16 = xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u]
7293; AVX512BW-NEXT:    vmovdqa 176(%rdi), %xmm13
7294; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[0,5,10,15,u,u,u,u,u,u,u,u,u]
7295; AVX512BW-NEXT:    vporq %xmm16, %xmm17, %xmm16
7296; AVX512BW-NEXT:    vmovdqu8 %ymm16, %ymm15 {%k4}
7297; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm16 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[2,7,12]
7298; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
7299; AVX512BW-NEXT:    vporq %xmm16, %xmm17, %xmm16
7300; AVX512BW-NEXT:    vinserti32x4 $1, %xmm16, %ymm0, %ymm16
7301; AVX512BW-NEXT:    vinserti64x4 $1, %ymm15, %zmm16, %zmm15
7302; AVX512BW-NEXT:    vmovdqu16 %zmm15, %zmm14 {%k5}
7303; AVX512BW-NEXT:    vextracti64x4 $1, %zmm14, %ymm15
7304; AVX512BW-NEXT:    vpblendmw %ymm7, %ymm8, %ymm16 {%k1}
7305; AVX512BW-NEXT:    vextracti32x4 $1, %ymm16, %xmm17
7306; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm17 = xmm17[u,u,u],zero,zero,zero,zero,xmm17[4,9,14],zero,zero,zero,xmm17[2,7,12]
7307; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,0,5,10,15],zero,zero,zero,xmm16[3,8,13],zero,zero,zero
7308; AVX512BW-NEXT:    vporq %xmm17, %xmm16, %xmm16
7309; AVX512BW-NEXT:    vinserti32x4 $1, %xmm16, %ymm0, %ymm16
7310; AVX512BW-NEXT:    movl $-524288, %eax # imm = 0xFFF80000
7311; AVX512BW-NEXT:    kmovd %eax, %k4
7312; AVX512BW-NEXT:    vmovdqu8 %ymm16, %ymm15 {%k4}
7313; AVX512BW-NEXT:    vinserti64x4 $1, %ymm15, %zmm14, %zmm14
7314; AVX512BW-NEXT:    vpblendmw %ymm0, %ymm1, %ymm15 {%k1}
7315; AVX512BW-NEXT:    vpermq {{.*#+}} ymm16 = ymm15[2,3,0,1]
7316; AVX512BW-NEXT:    movl $138543104, %eax # imm = 0x8420000
7317; AVX512BW-NEXT:    kmovd %eax, %k6
7318; AVX512BW-NEXT:    vmovdqu8 %ymm16, %ymm15 {%k6}
7319; AVX512BW-NEXT:    vpblendmw %ymm3, %ymm2, %ymm16 {%k3}
7320; AVX512BW-NEXT:    vextracti32x4 $1, %ymm16, %xmm17
7321; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm17[1,6,11],zero,zero,zero,zero,xmm17[4,9,14,u,u,u]
7322; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15],zero,zero,zero,xmm16[u,u,u]
7323; AVX512BW-NEXT:    vporq %xmm17, %xmm16, %xmm16
7324; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm16 {%k5} = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u]
7325; AVX512BW-NEXT:    vpblendmw %ymm5, %ymm6, %ymm15 {%k1}
7326; AVX512BW-NEXT:    vpermq {{.*#+}} ymm17 = ymm15[2,3,0,1]
7327; AVX512BW-NEXT:    movl $16912, %eax # imm = 0x4210
7328; AVX512BW-NEXT:    kmovd %eax, %k6
7329; AVX512BW-NEXT:    vmovdqu8 %ymm17, %ymm15 {%k6}
7330; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u]
7331; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u,u,u,u]
7332; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm18 = xmm10[2,7,12],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
7333; AVX512BW-NEXT:    vporq %xmm17, %xmm18, %xmm4
7334; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm15[3,4,5,6,7]
7335; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7]
7336; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[3,8,13]
7337; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
7338; AVX512BW-NEXT:    vporq %xmm15, %xmm17, %xmm15
7339; AVX512BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
7340; AVX512BW-NEXT:    vinserti64x4 $1, %ymm4, %zmm15, %zmm4
7341; AVX512BW-NEXT:    vmovdqu16 %zmm4, %zmm16 {%k5}
7342; AVX512BW-NEXT:    vextracti64x4 $1, %zmm16, %ymm4
7343; AVX512BW-NEXT:    vpblendmw %ymm7, %ymm8, %ymm15 {%k2}
7344; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm17 = xmm15[u,u,u,1,6,11],zero,zero,zero,zero,xmm15[4,9,14],zero,zero,zero
7345; AVX512BW-NEXT:    vextracti128 $1, %ymm15, %xmm15
7346; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u],zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[3,8,13]
7347; AVX512BW-NEXT:    vporq %xmm17, %xmm15, %xmm15
7348; AVX512BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
7349; AVX512BW-NEXT:    vmovdqu8 %ymm15, %ymm4 {%k4}
7350; AVX512BW-NEXT:    vinserti64x4 $1, %ymm4, %zmm16, %zmm15
7351; AVX512BW-NEXT:    vpblendmw %ymm0, %ymm1, %ymm4 {%k2}
7352; AVX512BW-NEXT:    vpermq {{.*#+}} ymm16 = ymm4[2,3,0,1]
7353; AVX512BW-NEXT:    movl $277086208, %eax # imm = 0x10840000
7354; AVX512BW-NEXT:    kmovd %eax, %k5
7355; AVX512BW-NEXT:    vmovdqu8 %ymm16, %ymm4 {%k5}
7356; AVX512BW-NEXT:    vpblendmw %ymm3, %ymm2, %ymm16 {%k1}
7357; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm17 = xmm16[3,8,13],zero,zero,zero,xmm16[1,6,11],zero,zero,zero,zero,xmm16[u,u,u]
7358; AVX512BW-NEXT:    vextracti32x4 $1, %ymm16, %xmm16
7359; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15,u,u,u]
7360; AVX512BW-NEXT:    vporq %xmm17, %xmm16, %xmm16
7361; AVX512BW-NEXT:    movl $33546240, %eax # imm = 0x1FFE000
7362; AVX512BW-NEXT:    kmovd %eax, %k5
7363; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm16 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
7364; AVX512BW-NEXT:    vpblendmw %ymm5, %ymm6, %ymm4 {%k2}
7365; AVX512BW-NEXT:    vpermq {{.*#+}} ymm17 = ymm4[2,3,0,1]
7366; AVX512BW-NEXT:    movl $33825, %eax # imm = 0x8421
7367; AVX512BW-NEXT:    kmovd %eax, %k5
7368; AVX512BW-NEXT:    vmovdqu8 %ymm17, %ymm4 {%k5}
7369; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u]
7370; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[2,7,12,u,u,u,u,u,u,u,u,u,u]
7371; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm18 = xmm10[3,8,13],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
7372; AVX512BW-NEXT:    vporq %xmm17, %xmm18, %xmm9
7373; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm4[3,4,5,6,7]
7374; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7]
7375; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm9 = xmm11[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm11[4,9,14]
7376; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
7377; AVX512BW-NEXT:    vpor %xmm9, %xmm11, %xmm9
7378; AVX512BW-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
7379; AVX512BW-NEXT:    vinserti64x4 $1, %ymm4, %zmm9, %zmm4
7380; AVX512BW-NEXT:    movl $33554431, %eax # imm = 0x1FFFFFF
7381; AVX512BW-NEXT:    kmovq %rax, %k5
7382; AVX512BW-NEXT:    vmovdqu8 %zmm16, %zmm4 {%k5}
7383; AVX512BW-NEXT:    vextracti64x4 $1, %zmm4, %ymm9
7384; AVX512BW-NEXT:    vpblendmw %ymm8, %ymm7, %ymm11 {%k3}
7385; AVX512BW-NEXT:    vextracti128 $1, %ymm11, %xmm12
7386; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero,xmm12[4,9,14]
7387; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,2,7,12],zero,zero,zero,xmm11[0,5,10,15],zero,zero,zero
7388; AVX512BW-NEXT:    vpor %xmm12, %xmm11, %xmm11
7389; AVX512BW-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
7390; AVX512BW-NEXT:    vmovdqu8 %ymm11, %ymm9 {%k4}
7391; AVX512BW-NEXT:    vinserti64x4 $1, %ymm9, %zmm4, %zmm4
7392; AVX512BW-NEXT:    vmovdqu16 %ymm3, %ymm2 {%k2}
7393; AVX512BW-NEXT:    vextracti128 $1, %ymm2, %xmm3
7394; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u]
7395; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u]
7396; AVX512BW-NEXT:    vpor %xmm3, %xmm2, %xmm2
7397; AVX512BW-NEXT:    vmovdqu16 %ymm1, %ymm0 {%k1}
7398; AVX512BW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
7399; AVX512BW-NEXT:    movl $554172416, %eax # imm = 0x21080000
7400; AVX512BW-NEXT:    kmovd %eax, %k2
7401; AVX512BW-NEXT:    vmovdqu8 %ymm1, %ymm0 {%k2}
7402; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
7403; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
7404; AVX512BW-NEXT:    vmovdqu16 %ymm6, %ymm5 {%k3}
7405; AVX512BW-NEXT:    vpermq {{.*#+}} ymm1 = ymm5[2,3,0,1]
7406; AVX512BW-NEXT:    movl $2114, %eax # imm = 0x842
7407; AVX512BW-NEXT:    kmovd %eax, %k2
7408; AVX512BW-NEXT:    vmovdqu8 %ymm1, %ymm5 {%k2}
7409; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
7410; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm13[3,8,13,u,u,u,u,u,u,u,u,u,u]
7411; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm10[4,9,14],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
7412; AVX512BW-NEXT:    vpor %xmm2, %xmm3, %xmm2
7413; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7]
7414; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
7415; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm2
7416; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
7417; AVX512BW-NEXT:    vpermd %ymm2, %ymm19, %ymm2
7418; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
7419; AVX512BW-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k5}
7420; AVX512BW-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
7421; AVX512BW-NEXT:    vmovdqu16 %ymm8, %ymm7 {%k1}
7422; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero
7423; AVX512BW-NEXT:    vextracti128 $1, %ymm7, %xmm3
7424; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15]
7425; AVX512BW-NEXT:    vpor %xmm2, %xmm3, %xmm2
7426; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
7427; AVX512BW-NEXT:    vmovdqu8 %ymm2, %ymm0 {%k4}
7428; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
7429; AVX512BW-NEXT:    vmovdqa64 %zmm20, (%rsi)
7430; AVX512BW-NEXT:    vmovdqa64 %zmm14, (%rdx)
7431; AVX512BW-NEXT:    vmovdqa64 %zmm15, (%rcx)
7432; AVX512BW-NEXT:    vmovdqa64 %zmm4, (%r8)
7433; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%r9)
7434; AVX512BW-NEXT:    vzeroupper
7435; AVX512BW-NEXT:    retq
7436;
7437; AVX512BW-FCP-LABEL: load_i8_stride5_vf64:
7438; AVX512BW-FCP:       # %bb.0:
7439; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %ymm3
7440; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
7441; AVX512BW-FCP-NEXT:    vmovdqa 64(%rdi), %ymm0
7442; AVX512BW-FCP-NEXT:    vmovdqa 96(%rdi), %ymm1
7443; AVX512BW-FCP-NEXT:    movw $21140, %ax # imm = 0x5294
7444; AVX512BW-FCP-NEXT:    kmovd %eax, %k2
7445; AVX512BW-FCP-NEXT:    vpblendmw %ymm1, %ymm0, %ymm4 {%k2}
7446; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
7447; AVX512BW-FCP-NEXT:    movl $1108344832, %eax # imm = 0x42100000
7448; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
7449; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm5, %ymm4 {%k1}
7450; AVX512BW-FCP-NEXT:    movw $19026, %ax # imm = 0x4A52
7451; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
7452; AVX512BW-FCP-NEXT:    vpblendmw %ymm2, %ymm3, %ymm5 {%k1}
7453; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm6
7454; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u]
7455; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u]
7456; AVX512BW-FCP-NEXT:    vpor %xmm6, %xmm5, %xmm9
7457; AVX512BW-FCP-NEXT:    movl $67100672, %eax # imm = 0x3FFE000
7458; AVX512BW-FCP-NEXT:    kmovd %eax, %k5
7459; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm9 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u]
7460; AVX512BW-FCP-NEXT:    vmovdqa 192(%rdi), %ymm6
7461; AVX512BW-FCP-NEXT:    vmovdqa 224(%rdi), %ymm5
7462; AVX512BW-FCP-NEXT:    vpblendmw %ymm6, %ymm5, %ymm4 {%k1}
7463; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1]
7464; AVX512BW-FCP-NEXT:    movl $4228, %eax # imm = 0x1084
7465; AVX512BW-FCP-NEXT:    kmovd %eax, %k3
7466; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm7, %ymm4 {%k3}
7467; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u]
7468; AVX512BW-FCP-NEXT:    vmovdqa 160(%rdi), %ymm4
7469; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm4[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u]
7470; AVX512BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm19 = [0,5,0,5,0,5,0,5]
7471; AVX512BW-FCP-NEXT:    vpermd %ymm8, %ymm19, %ymm8
7472; AVX512BW-FCP-NEXT:    movl $127, %eax
7473; AVX512BW-FCP-NEXT:    kmovd %eax, %k4
7474; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm8, %ymm7 {%k4}
7475; AVX512BW-FCP-NEXT:    vmovdqa 144(%rdi), %xmm11
7476; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[1,6,11]
7477; AVX512BW-FCP-NEXT:    vmovdqa 128(%rdi), %xmm12
7478; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
7479; AVX512BW-FCP-NEXT:    vpor %xmm8, %xmm10, %xmm8
7480; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
7481; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm8, %zmm7
7482; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm7, %zmm9 {%k5}
7483; AVX512BW-FCP-NEXT:    vextracti64x4 $1, %zmm9, %ymm10
7484; AVX512BW-FCP-NEXT:    vmovdqa 256(%rdi), %ymm8
7485; AVX512BW-FCP-NEXT:    vmovdqa 288(%rdi), %ymm7
7486; AVX512BW-FCP-NEXT:    vpblendmw %ymm8, %ymm7, %ymm13 {%k2}
7487; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm14
7488; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u],zero,zero,zero,xmm14[3,8,13],zero,zero,zero,xmm14[1,6,11]
7489; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,4,9,14],zero,zero,zero,xmm13[2,7,12],zero,zero,zero
7490; AVX512BW-FCP-NEXT:    vpor %xmm14, %xmm13, %xmm13
7491; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
7492; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5,6,7]
7493; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm20
7494; AVX512BW-FCP-NEXT:    movw $10570, %ax # imm = 0x294A
7495; AVX512BW-FCP-NEXT:    kmovd %eax, %k3
7496; AVX512BW-FCP-NEXT:    vpblendmw %ymm0, %ymm1, %ymm10 {%k3}
7497; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1]
7498; AVX512BW-FCP-NEXT:    movl $-2078212096, %eax # imm = 0x84210000
7499; AVX512BW-FCP-NEXT:    kmovd %eax, %k6
7500; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm13, %ymm10 {%k6}
7501; AVX512BW-FCP-NEXT:    vpblendmw %ymm2, %ymm3, %ymm13 {%k2}
7502; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14],zero,zero,zero,xmm13[u,u,u]
7503; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm13
7504; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[0,5,10,15],zero,zero,zero,xmm13[3,8,13,u,u,u]
7505; AVX512BW-FCP-NEXT:    vpor %xmm14, %xmm13, %xmm14
7506; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u]
7507; AVX512BW-FCP-NEXT:    vpblendmw %ymm6, %ymm5, %ymm10 {%k2}
7508; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1]
7509; AVX512BW-FCP-NEXT:    movl $8456, %eax # imm = 0x2108
7510; AVX512BW-FCP-NEXT:    kmovd %eax, %k6
7511; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm13, %ymm10 {%k6}
7512; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm10[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u]
7513; AVX512BW-FCP-NEXT:    vmovdqa 160(%rdi), %xmm10
7514; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u]
7515; AVX512BW-FCP-NEXT:    vmovdqa 176(%rdi), %xmm13
7516; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[0,5,10,15,u,u,u,u,u,u,u,u,u]
7517; AVX512BW-FCP-NEXT:    vporq %xmm16, %xmm17, %xmm16
7518; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm16, %ymm15 {%k4}
7519; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[2,7,12]
7520; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
7521; AVX512BW-FCP-NEXT:    vporq %xmm16, %xmm17, %xmm16
7522; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm16, %ymm0, %ymm16
7523; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm16, %zmm15
7524; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm15, %zmm14 {%k5}
7525; AVX512BW-FCP-NEXT:    vextracti64x4 $1, %zmm14, %ymm15
7526; AVX512BW-FCP-NEXT:    vpblendmw %ymm7, %ymm8, %ymm16 {%k1}
7527; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm17
7528; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm17[u,u,u],zero,zero,zero,zero,xmm17[4,9,14],zero,zero,zero,xmm17[2,7,12]
7529; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,0,5,10,15],zero,zero,zero,xmm16[3,8,13],zero,zero,zero
7530; AVX512BW-FCP-NEXT:    vporq %xmm17, %xmm16, %xmm16
7531; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm16, %ymm0, %ymm16
7532; AVX512BW-FCP-NEXT:    movl $-524288, %eax # imm = 0xFFF80000
7533; AVX512BW-FCP-NEXT:    kmovd %eax, %k4
7534; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm16, %ymm15 {%k4}
7535; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm14, %zmm14
7536; AVX512BW-FCP-NEXT:    vpblendmw %ymm0, %ymm1, %ymm15 {%k1}
7537; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm16 = ymm15[2,3,0,1]
7538; AVX512BW-FCP-NEXT:    movl $138543104, %eax # imm = 0x8420000
7539; AVX512BW-FCP-NEXT:    kmovd %eax, %k6
7540; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm16, %ymm15 {%k6}
7541; AVX512BW-FCP-NEXT:    vpblendmw %ymm3, %ymm2, %ymm16 {%k3}
7542; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm17
7543; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm17[1,6,11],zero,zero,zero,zero,xmm17[4,9,14,u,u,u]
7544; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15],zero,zero,zero,xmm16[u,u,u]
7545; AVX512BW-FCP-NEXT:    vporq %xmm17, %xmm16, %xmm16
7546; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm16 {%k5} = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u]
7547; AVX512BW-FCP-NEXT:    vpblendmw %ymm5, %ymm6, %ymm15 {%k1}
7548; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm17 = ymm15[2,3,0,1]
7549; AVX512BW-FCP-NEXT:    movl $16912, %eax # imm = 0x4210
7550; AVX512BW-FCP-NEXT:    kmovd %eax, %k6
7551; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm17, %ymm15 {%k6}
7552; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u]
7553; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u,u,u,u]
7554; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm10[2,7,12],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
7555; AVX512BW-FCP-NEXT:    vporq %xmm17, %xmm18, %xmm4
7556; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm15[3,4,5,6,7]
7557; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7]
7558; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[3,8,13]
7559; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
7560; AVX512BW-FCP-NEXT:    vporq %xmm15, %xmm17, %xmm15
7561; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
7562; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm15, %zmm4
7563; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm4, %zmm16 {%k5}
7564; AVX512BW-FCP-NEXT:    vextracti64x4 $1, %zmm16, %ymm4
7565; AVX512BW-FCP-NEXT:    vpblendmw %ymm7, %ymm8, %ymm15 {%k2}
7566; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm15[u,u,u,1,6,11],zero,zero,zero,zero,xmm15[4,9,14],zero,zero,zero
7567; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm15
7568; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u],zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[3,8,13]
7569; AVX512BW-FCP-NEXT:    vporq %xmm17, %xmm15, %xmm15
7570; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
7571; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm15, %ymm4 {%k4}
7572; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm16, %zmm15
7573; AVX512BW-FCP-NEXT:    vpblendmw %ymm0, %ymm1, %ymm4 {%k2}
7574; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm16 = ymm4[2,3,0,1]
7575; AVX512BW-FCP-NEXT:    movl $277086208, %eax # imm = 0x10840000
7576; AVX512BW-FCP-NEXT:    kmovd %eax, %k5
7577; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm16, %ymm4 {%k5}
7578; AVX512BW-FCP-NEXT:    vpblendmw %ymm3, %ymm2, %ymm16 {%k1}
7579; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm16[3,8,13],zero,zero,zero,xmm16[1,6,11],zero,zero,zero,zero,xmm16[u,u,u]
7580; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm16
7581; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15,u,u,u]
7582; AVX512BW-FCP-NEXT:    vporq %xmm17, %xmm16, %xmm16
7583; AVX512BW-FCP-NEXT:    movl $33546240, %eax # imm = 0x1FFE000
7584; AVX512BW-FCP-NEXT:    kmovd %eax, %k5
7585; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm16 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
7586; AVX512BW-FCP-NEXT:    vpblendmw %ymm5, %ymm6, %ymm4 {%k2}
7587; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm17 = ymm4[2,3,0,1]
7588; AVX512BW-FCP-NEXT:    movl $33825, %eax # imm = 0x8421
7589; AVX512BW-FCP-NEXT:    kmovd %eax, %k5
7590; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm17, %ymm4 {%k5}
7591; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u]
7592; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[2,7,12,u,u,u,u,u,u,u,u,u,u]
7593; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm10[3,8,13],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
7594; AVX512BW-FCP-NEXT:    vporq %xmm17, %xmm18, %xmm9
7595; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm4[3,4,5,6,7]
7596; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7]
7597; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm11[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm11[4,9,14]
7598; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
7599; AVX512BW-FCP-NEXT:    vpor %xmm9, %xmm11, %xmm9
7600; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
7601; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm9, %zmm4
7602; AVX512BW-FCP-NEXT:    movl $33554431, %eax # imm = 0x1FFFFFF
7603; AVX512BW-FCP-NEXT:    kmovq %rax, %k5
7604; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm16, %zmm4 {%k5}
7605; AVX512BW-FCP-NEXT:    vextracti64x4 $1, %zmm4, %ymm9
7606; AVX512BW-FCP-NEXT:    vpblendmw %ymm8, %ymm7, %ymm11 {%k3}
7607; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm12
7608; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero,xmm12[4,9,14]
7609; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,2,7,12],zero,zero,zero,xmm11[0,5,10,15],zero,zero,zero
7610; AVX512BW-FCP-NEXT:    vpor %xmm12, %xmm11, %xmm11
7611; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
7612; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm11, %ymm9 {%k4}
7613; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm4, %zmm4
7614; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm3, %ymm2 {%k2}
7615; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
7616; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u]
7617; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u]
7618; AVX512BW-FCP-NEXT:    vpor %xmm3, %xmm2, %xmm2
7619; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm1, %ymm0 {%k1}
7620; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
7621; AVX512BW-FCP-NEXT:    movl $554172416, %eax # imm = 0x21080000
7622; AVX512BW-FCP-NEXT:    kmovd %eax, %k2
7623; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm1, %ymm0 {%k2}
7624; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
7625; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
7626; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm6, %ymm5 {%k3}
7627; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm5[2,3,0,1]
7628; AVX512BW-FCP-NEXT:    movl $2114, %eax # imm = 0x842
7629; AVX512BW-FCP-NEXT:    kmovd %eax, %k2
7630; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm1, %ymm5 {%k2}
7631; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
7632; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm13[3,8,13,u,u,u,u,u,u,u,u,u,u]
7633; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm10[4,9,14],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
7634; AVX512BW-FCP-NEXT:    vpor %xmm2, %xmm3, %xmm2
7635; AVX512BW-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7]
7636; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
7637; AVX512BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm2
7638; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
7639; AVX512BW-FCP-NEXT:    vpermd %ymm2, %ymm19, %ymm2
7640; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
7641; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k5}
7642; AVX512BW-FCP-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
7643; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm8, %ymm7 {%k1}
7644; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero
7645; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm3
7646; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15]
7647; AVX512BW-FCP-NEXT:    vpor %xmm2, %xmm3, %xmm2
7648; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
7649; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm2, %ymm0 {%k4}
7650; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
7651; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm20, (%rsi)
7652; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, (%rdx)
7653; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, (%rcx)
7654; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, (%r8)
7655; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, (%r9)
7656; AVX512BW-FCP-NEXT:    vzeroupper
7657; AVX512BW-FCP-NEXT:    retq
7658;
7659; AVX512DQ-BW-LABEL: load_i8_stride5_vf64:
7660; AVX512DQ-BW:       # %bb.0:
7661; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %ymm3
7662; AVX512DQ-BW-NEXT:    vmovdqa 32(%rdi), %ymm2
7663; AVX512DQ-BW-NEXT:    vmovdqa 64(%rdi), %ymm0
7664; AVX512DQ-BW-NEXT:    vmovdqa 96(%rdi), %ymm1
7665; AVX512DQ-BW-NEXT:    movw $21140, %ax # imm = 0x5294
7666; AVX512DQ-BW-NEXT:    kmovd %eax, %k2
7667; AVX512DQ-BW-NEXT:    vpblendmw %ymm1, %ymm0, %ymm4 {%k2}
7668; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
7669; AVX512DQ-BW-NEXT:    movl $1108344832, %eax # imm = 0x42100000
7670; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
7671; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm5, %ymm4 {%k1}
7672; AVX512DQ-BW-NEXT:    movw $19026, %ax # imm = 0x4A52
7673; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
7674; AVX512DQ-BW-NEXT:    vpblendmw %ymm2, %ymm3, %ymm5 {%k1}
7675; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm5, %xmm6
7676; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u]
7677; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u]
7678; AVX512DQ-BW-NEXT:    vpor %xmm6, %xmm5, %xmm9
7679; AVX512DQ-BW-NEXT:    movl $67100672, %eax # imm = 0x3FFE000
7680; AVX512DQ-BW-NEXT:    kmovd %eax, %k5
7681; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm9 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u]
7682; AVX512DQ-BW-NEXT:    vmovdqa 192(%rdi), %ymm6
7683; AVX512DQ-BW-NEXT:    vmovdqa 224(%rdi), %ymm5
7684; AVX512DQ-BW-NEXT:    vpblendmw %ymm6, %ymm5, %ymm4 {%k1}
7685; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1]
7686; AVX512DQ-BW-NEXT:    movl $4228, %eax # imm = 0x1084
7687; AVX512DQ-BW-NEXT:    kmovd %eax, %k3
7688; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm7, %ymm4 {%k3}
7689; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u]
7690; AVX512DQ-BW-NEXT:    vmovdqa 160(%rdi), %ymm4
7691; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm8 = ymm4[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u]
7692; AVX512DQ-BW-NEXT:    vpbroadcastq {{.*#+}} ymm19 = [0,5,0,5,0,5,0,5]
7693; AVX512DQ-BW-NEXT:    vpermd %ymm8, %ymm19, %ymm8
7694; AVX512DQ-BW-NEXT:    movl $127, %eax
7695; AVX512DQ-BW-NEXT:    kmovd %eax, %k4
7696; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm8, %ymm7 {%k4}
7697; AVX512DQ-BW-NEXT:    vmovdqa 144(%rdi), %xmm11
7698; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm8 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[1,6,11]
7699; AVX512DQ-BW-NEXT:    vmovdqa 128(%rdi), %xmm12
7700; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
7701; AVX512DQ-BW-NEXT:    vpor %xmm8, %xmm10, %xmm8
7702; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
7703; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm7, %zmm8, %zmm7
7704; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm7, %zmm9 {%k5}
7705; AVX512DQ-BW-NEXT:    vextracti64x4 $1, %zmm9, %ymm10
7706; AVX512DQ-BW-NEXT:    vmovdqa 256(%rdi), %ymm8
7707; AVX512DQ-BW-NEXT:    vmovdqa 288(%rdi), %ymm7
7708; AVX512DQ-BW-NEXT:    vpblendmw %ymm8, %ymm7, %ymm13 {%k2}
7709; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm13, %xmm14
7710; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u],zero,zero,zero,xmm14[3,8,13],zero,zero,zero,xmm14[1,6,11]
7711; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,4,9,14],zero,zero,zero,xmm13[2,7,12],zero,zero,zero
7712; AVX512DQ-BW-NEXT:    vpor %xmm14, %xmm13, %xmm13
7713; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
7714; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5,6,7]
7715; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm20
7716; AVX512DQ-BW-NEXT:    movw $10570, %ax # imm = 0x294A
7717; AVX512DQ-BW-NEXT:    kmovd %eax, %k3
7718; AVX512DQ-BW-NEXT:    vpblendmw %ymm0, %ymm1, %ymm10 {%k3}
7719; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1]
7720; AVX512DQ-BW-NEXT:    movl $-2078212096, %eax # imm = 0x84210000
7721; AVX512DQ-BW-NEXT:    kmovd %eax, %k6
7722; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm13, %ymm10 {%k6}
7723; AVX512DQ-BW-NEXT:    vpblendmw %ymm2, %ymm3, %ymm13 {%k2}
7724; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14],zero,zero,zero,xmm13[u,u,u]
7725; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm13, %xmm13
7726; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[0,5,10,15],zero,zero,zero,xmm13[3,8,13,u,u,u]
7727; AVX512DQ-BW-NEXT:    vpor %xmm14, %xmm13, %xmm14
7728; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm14 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u]
7729; AVX512DQ-BW-NEXT:    vpblendmw %ymm6, %ymm5, %ymm10 {%k2}
7730; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1]
7731; AVX512DQ-BW-NEXT:    movl $8456, %eax # imm = 0x2108
7732; AVX512DQ-BW-NEXT:    kmovd %eax, %k6
7733; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm13, %ymm10 {%k6}
7734; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm15 = ymm10[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u]
7735; AVX512DQ-BW-NEXT:    vmovdqa 160(%rdi), %xmm10
7736; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm16 = xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u]
7737; AVX512DQ-BW-NEXT:    vmovdqa 176(%rdi), %xmm13
7738; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[0,5,10,15,u,u,u,u,u,u,u,u,u]
7739; AVX512DQ-BW-NEXT:    vporq %xmm16, %xmm17, %xmm16
7740; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm16, %ymm15 {%k4}
7741; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm16 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[2,7,12]
7742; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
7743; AVX512DQ-BW-NEXT:    vporq %xmm16, %xmm17, %xmm16
7744; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm16, %ymm0, %ymm16
7745; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm15, %zmm16, %zmm15
7746; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm15, %zmm14 {%k5}
7747; AVX512DQ-BW-NEXT:    vextracti64x4 $1, %zmm14, %ymm15
7748; AVX512DQ-BW-NEXT:    vpblendmw %ymm7, %ymm8, %ymm16 {%k1}
7749; AVX512DQ-BW-NEXT:    vextracti32x4 $1, %ymm16, %xmm17
7750; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm17 = xmm17[u,u,u],zero,zero,zero,zero,xmm17[4,9,14],zero,zero,zero,xmm17[2,7,12]
7751; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,0,5,10,15],zero,zero,zero,xmm16[3,8,13],zero,zero,zero
7752; AVX512DQ-BW-NEXT:    vporq %xmm17, %xmm16, %xmm16
7753; AVX512DQ-BW-NEXT:    vinserti32x4 $1, %xmm16, %ymm0, %ymm16
7754; AVX512DQ-BW-NEXT:    movl $-524288, %eax # imm = 0xFFF80000
7755; AVX512DQ-BW-NEXT:    kmovd %eax, %k4
7756; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm16, %ymm15 {%k4}
7757; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm15, %zmm14, %zmm14
7758; AVX512DQ-BW-NEXT:    vpblendmw %ymm0, %ymm1, %ymm15 {%k1}
7759; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm16 = ymm15[2,3,0,1]
7760; AVX512DQ-BW-NEXT:    movl $138543104, %eax # imm = 0x8420000
7761; AVX512DQ-BW-NEXT:    kmovd %eax, %k6
7762; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm16, %ymm15 {%k6}
7763; AVX512DQ-BW-NEXT:    vpblendmw %ymm3, %ymm2, %ymm16 {%k3}
7764; AVX512DQ-BW-NEXT:    vextracti32x4 $1, %ymm16, %xmm17
7765; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm17[1,6,11],zero,zero,zero,zero,xmm17[4,9,14,u,u,u]
7766; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15],zero,zero,zero,xmm16[u,u,u]
7767; AVX512DQ-BW-NEXT:    vporq %xmm17, %xmm16, %xmm16
7768; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm16 {%k5} = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u]
7769; AVX512DQ-BW-NEXT:    vpblendmw %ymm5, %ymm6, %ymm15 {%k1}
7770; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm17 = ymm15[2,3,0,1]
7771; AVX512DQ-BW-NEXT:    movl $16912, %eax # imm = 0x4210
7772; AVX512DQ-BW-NEXT:    kmovd %eax, %k6
7773; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm17, %ymm15 {%k6}
7774; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u]
7775; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u,u,u,u]
7776; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm18 = xmm10[2,7,12],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
7777; AVX512DQ-BW-NEXT:    vporq %xmm17, %xmm18, %xmm4
7778; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm15[3,4,5,6,7]
7779; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7]
7780; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[3,8,13]
7781; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
7782; AVX512DQ-BW-NEXT:    vporq %xmm15, %xmm17, %xmm15
7783; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
7784; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm4, %zmm15, %zmm4
7785; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm4, %zmm16 {%k5}
7786; AVX512DQ-BW-NEXT:    vextracti64x4 $1, %zmm16, %ymm4
7787; AVX512DQ-BW-NEXT:    vpblendmw %ymm7, %ymm8, %ymm15 {%k2}
7788; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm17 = xmm15[u,u,u,1,6,11],zero,zero,zero,zero,xmm15[4,9,14],zero,zero,zero
7789; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm15, %xmm15
7790; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u],zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[3,8,13]
7791; AVX512DQ-BW-NEXT:    vporq %xmm17, %xmm15, %xmm15
7792; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
7793; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm15, %ymm4 {%k4}
7794; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm4, %zmm16, %zmm15
7795; AVX512DQ-BW-NEXT:    vpblendmw %ymm0, %ymm1, %ymm4 {%k2}
7796; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm16 = ymm4[2,3,0,1]
7797; AVX512DQ-BW-NEXT:    movl $277086208, %eax # imm = 0x10840000
7798; AVX512DQ-BW-NEXT:    kmovd %eax, %k5
7799; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm16, %ymm4 {%k5}
7800; AVX512DQ-BW-NEXT:    vpblendmw %ymm3, %ymm2, %ymm16 {%k1}
7801; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm17 = xmm16[3,8,13],zero,zero,zero,xmm16[1,6,11],zero,zero,zero,zero,xmm16[u,u,u]
7802; AVX512DQ-BW-NEXT:    vextracti32x4 $1, %ymm16, %xmm16
7803; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15,u,u,u]
7804; AVX512DQ-BW-NEXT:    vporq %xmm17, %xmm16, %xmm16
7805; AVX512DQ-BW-NEXT:    movl $33546240, %eax # imm = 0x1FFE000
7806; AVX512DQ-BW-NEXT:    kmovd %eax, %k5
7807; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm16 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
7808; AVX512DQ-BW-NEXT:    vpblendmw %ymm5, %ymm6, %ymm4 {%k2}
7809; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm17 = ymm4[2,3,0,1]
7810; AVX512DQ-BW-NEXT:    movl $33825, %eax # imm = 0x8421
7811; AVX512DQ-BW-NEXT:    kmovd %eax, %k5
7812; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm17, %ymm4 {%k5}
7813; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u]
7814; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[2,7,12,u,u,u,u,u,u,u,u,u,u]
7815; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm18 = xmm10[3,8,13],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
7816; AVX512DQ-BW-NEXT:    vporq %xmm17, %xmm18, %xmm9
7817; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm4[3,4,5,6,7]
7818; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7]
7819; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm9 = xmm11[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm11[4,9,14]
7820; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
7821; AVX512DQ-BW-NEXT:    vpor %xmm9, %xmm11, %xmm9
7822; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
7823; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm4, %zmm9, %zmm4
7824; AVX512DQ-BW-NEXT:    movl $33554431, %eax # imm = 0x1FFFFFF
7825; AVX512DQ-BW-NEXT:    kmovq %rax, %k5
7826; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm16, %zmm4 {%k5}
7827; AVX512DQ-BW-NEXT:    vextracti64x4 $1, %zmm4, %ymm9
7828; AVX512DQ-BW-NEXT:    vpblendmw %ymm8, %ymm7, %ymm11 {%k3}
7829; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm11, %xmm12
7830; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero,xmm12[4,9,14]
7831; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,2,7,12],zero,zero,zero,xmm11[0,5,10,15],zero,zero,zero
7832; AVX512DQ-BW-NEXT:    vpor %xmm12, %xmm11, %xmm11
7833; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
7834; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm11, %ymm9 {%k4}
7835; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm9, %zmm4, %zmm4
7836; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm3, %ymm2 {%k2}
7837; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm2, %xmm3
7838; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u]
7839; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u]
7840; AVX512DQ-BW-NEXT:    vpor %xmm3, %xmm2, %xmm2
7841; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm1, %ymm0 {%k1}
7842; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
7843; AVX512DQ-BW-NEXT:    movl $554172416, %eax # imm = 0x21080000
7844; AVX512DQ-BW-NEXT:    kmovd %eax, %k2
7845; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm1, %ymm0 {%k2}
7846; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
7847; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
7848; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm6, %ymm5 {%k3}
7849; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm1 = ymm5[2,3,0,1]
7850; AVX512DQ-BW-NEXT:    movl $2114, %eax # imm = 0x842
7851; AVX512DQ-BW-NEXT:    kmovd %eax, %k2
7852; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm1, %ymm5 {%k2}
7853; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
7854; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm13[3,8,13,u,u,u,u,u,u,u,u,u,u]
7855; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm10[4,9,14],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
7856; AVX512DQ-BW-NEXT:    vpor %xmm2, %xmm3, %xmm2
7857; AVX512DQ-BW-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7]
7858; AVX512DQ-BW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
7859; AVX512DQ-BW-NEXT:    vmovdqa 128(%rdi), %ymm2
7860; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
7861; AVX512DQ-BW-NEXT:    vpermd %ymm2, %ymm19, %ymm2
7862; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
7863; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k5}
7864; AVX512DQ-BW-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
7865; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm8, %ymm7 {%k1}
7866; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero
7867; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm7, %xmm3
7868; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15]
7869; AVX512DQ-BW-NEXT:    vpor %xmm2, %xmm3, %xmm2
7870; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
7871; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm2, %ymm0 {%k4}
7872; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
7873; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm20, (%rsi)
7874; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm14, (%rdx)
7875; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm15, (%rcx)
7876; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm4, (%r8)
7877; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, (%r9)
7878; AVX512DQ-BW-NEXT:    vzeroupper
7879; AVX512DQ-BW-NEXT:    retq
7880;
7881; AVX512DQ-BW-FCP-LABEL: load_i8_stride5_vf64:
7882; AVX512DQ-BW-FCP:       # %bb.0:
7883; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %ymm3
7884; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
7885; AVX512DQ-BW-FCP-NEXT:    vmovdqa 64(%rdi), %ymm0
7886; AVX512DQ-BW-FCP-NEXT:    vmovdqa 96(%rdi), %ymm1
7887; AVX512DQ-BW-FCP-NEXT:    movw $21140, %ax # imm = 0x5294
7888; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k2
7889; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm1, %ymm0, %ymm4 {%k2}
7890; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
7891; AVX512DQ-BW-FCP-NEXT:    movl $1108344832, %eax # imm = 0x42100000
7892; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
7893; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm5, %ymm4 {%k1}
7894; AVX512DQ-BW-FCP-NEXT:    movw $19026, %ax # imm = 0x4A52
7895; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
7896; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm2, %ymm3, %ymm5 {%k1}
7897; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm6
7898; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u]
7899; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u]
7900; AVX512DQ-BW-FCP-NEXT:    vpor %xmm6, %xmm5, %xmm9
7901; AVX512DQ-BW-FCP-NEXT:    movl $67100672, %eax # imm = 0x3FFE000
7902; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k5
7903; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm9 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u]
7904; AVX512DQ-BW-FCP-NEXT:    vmovdqa 192(%rdi), %ymm6
7905; AVX512DQ-BW-FCP-NEXT:    vmovdqa 224(%rdi), %ymm5
7906; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm6, %ymm5, %ymm4 {%k1}
7907; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1]
7908; AVX512DQ-BW-FCP-NEXT:    movl $4228, %eax # imm = 0x1084
7909; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k3
7910; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm7, %ymm4 {%k3}
7911; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u]
7912; AVX512DQ-BW-FCP-NEXT:    vmovdqa 160(%rdi), %ymm4
7913; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm4[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u]
7914; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm19 = [0,5,0,5,0,5,0,5]
7915; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm8, %ymm19, %ymm8
7916; AVX512DQ-BW-FCP-NEXT:    movl $127, %eax
7917; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k4
7918; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm8, %ymm7 {%k4}
7919; AVX512DQ-BW-FCP-NEXT:    vmovdqa 144(%rdi), %xmm11
7920; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[1,6,11]
7921; AVX512DQ-BW-FCP-NEXT:    vmovdqa 128(%rdi), %xmm12
7922; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
7923; AVX512DQ-BW-FCP-NEXT:    vpor %xmm8, %xmm10, %xmm8
7924; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
7925; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm8, %zmm7
7926; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm7, %zmm9 {%k5}
7927; AVX512DQ-BW-FCP-NEXT:    vextracti64x4 $1, %zmm9, %ymm10
7928; AVX512DQ-BW-FCP-NEXT:    vmovdqa 256(%rdi), %ymm8
7929; AVX512DQ-BW-FCP-NEXT:    vmovdqa 288(%rdi), %ymm7
7930; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm8, %ymm7, %ymm13 {%k2}
7931; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm14
7932; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u],zero,zero,zero,xmm14[3,8,13],zero,zero,zero,xmm14[1,6,11]
7933; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,4,9,14],zero,zero,zero,xmm13[2,7,12],zero,zero,zero
7934; AVX512DQ-BW-FCP-NEXT:    vpor %xmm14, %xmm13, %xmm13
7935; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
7936; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5,6,7]
7937; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm20
7938; AVX512DQ-BW-FCP-NEXT:    movw $10570, %ax # imm = 0x294A
7939; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k3
7940; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm0, %ymm1, %ymm10 {%k3}
7941; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1]
7942; AVX512DQ-BW-FCP-NEXT:    movl $-2078212096, %eax # imm = 0x84210000
7943; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k6
7944; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm13, %ymm10 {%k6}
7945; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm2, %ymm3, %ymm13 {%k2}
7946; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14],zero,zero,zero,xmm13[u,u,u]
7947; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm13
7948; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[0,5,10,15],zero,zero,zero,xmm13[3,8,13,u,u,u]
7949; AVX512DQ-BW-FCP-NEXT:    vpor %xmm14, %xmm13, %xmm14
7950; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u]
7951; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm6, %ymm5, %ymm10 {%k2}
7952; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1]
7953; AVX512DQ-BW-FCP-NEXT:    movl $8456, %eax # imm = 0x2108
7954; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k6
7955; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm13, %ymm10 {%k6}
7956; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm10[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u]
7957; AVX512DQ-BW-FCP-NEXT:    vmovdqa 160(%rdi), %xmm10
7958; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u]
7959; AVX512DQ-BW-FCP-NEXT:    vmovdqa 176(%rdi), %xmm13
7960; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[0,5,10,15,u,u,u,u,u,u,u,u,u]
7961; AVX512DQ-BW-FCP-NEXT:    vporq %xmm16, %xmm17, %xmm16
7962; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm16, %ymm15 {%k4}
7963; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[2,7,12]
7964; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
7965; AVX512DQ-BW-FCP-NEXT:    vporq %xmm16, %xmm17, %xmm16
7966; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm16, %ymm0, %ymm16
7967; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm16, %zmm15
7968; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm15, %zmm14 {%k5}
7969; AVX512DQ-BW-FCP-NEXT:    vextracti64x4 $1, %zmm14, %ymm15
7970; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm7, %ymm8, %ymm16 {%k1}
7971; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm17
7972; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm17[u,u,u],zero,zero,zero,zero,xmm17[4,9,14],zero,zero,zero,xmm17[2,7,12]
7973; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,0,5,10,15],zero,zero,zero,xmm16[3,8,13],zero,zero,zero
7974; AVX512DQ-BW-FCP-NEXT:    vporq %xmm17, %xmm16, %xmm16
7975; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm16, %ymm0, %ymm16
7976; AVX512DQ-BW-FCP-NEXT:    movl $-524288, %eax # imm = 0xFFF80000
7977; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k4
7978; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm16, %ymm15 {%k4}
7979; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm14, %zmm14
7980; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm0, %ymm1, %ymm15 {%k1}
7981; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm16 = ymm15[2,3,0,1]
7982; AVX512DQ-BW-FCP-NEXT:    movl $138543104, %eax # imm = 0x8420000
7983; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k6
7984; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm16, %ymm15 {%k6}
7985; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm3, %ymm2, %ymm16 {%k3}
7986; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm17
7987; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm17[1,6,11],zero,zero,zero,zero,xmm17[4,9,14,u,u,u]
7988; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15],zero,zero,zero,xmm16[u,u,u]
7989; AVX512DQ-BW-FCP-NEXT:    vporq %xmm17, %xmm16, %xmm16
7990; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm16 {%k5} = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u]
7991; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm5, %ymm6, %ymm15 {%k1}
7992; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm17 = ymm15[2,3,0,1]
7993; AVX512DQ-BW-FCP-NEXT:    movl $16912, %eax # imm = 0x4210
7994; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k6
7995; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm17, %ymm15 {%k6}
7996; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u]
7997; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u,u,u,u]
7998; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm10[2,7,12],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
7999; AVX512DQ-BW-FCP-NEXT:    vporq %xmm17, %xmm18, %xmm4
8000; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm15[3,4,5,6,7]
8001; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7]
8002; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[3,8,13]
8003; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
8004; AVX512DQ-BW-FCP-NEXT:    vporq %xmm15, %xmm17, %xmm15
8005; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
8006; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm15, %zmm4
8007; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm4, %zmm16 {%k5}
8008; AVX512DQ-BW-FCP-NEXT:    vextracti64x4 $1, %zmm16, %ymm4
8009; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm7, %ymm8, %ymm15 {%k2}
8010; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm15[u,u,u,1,6,11],zero,zero,zero,zero,xmm15[4,9,14],zero,zero,zero
8011; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm15
8012; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u],zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[3,8,13]
8013; AVX512DQ-BW-FCP-NEXT:    vporq %xmm17, %xmm15, %xmm15
8014; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
8015; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm15, %ymm4 {%k4}
8016; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm16, %zmm15
8017; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm0, %ymm1, %ymm4 {%k2}
8018; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm16 = ymm4[2,3,0,1]
8019; AVX512DQ-BW-FCP-NEXT:    movl $277086208, %eax # imm = 0x10840000
8020; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k5
8021; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm16, %ymm4 {%k5}
8022; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm3, %ymm2, %ymm16 {%k1}
8023; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm16[3,8,13],zero,zero,zero,xmm16[1,6,11],zero,zero,zero,zero,xmm16[u,u,u]
8024; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm16
8025; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15,u,u,u]
8026; AVX512DQ-BW-FCP-NEXT:    vporq %xmm17, %xmm16, %xmm16
8027; AVX512DQ-BW-FCP-NEXT:    movl $33546240, %eax # imm = 0x1FFE000
8028; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k5
8029; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm16 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
8030; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm5, %ymm6, %ymm4 {%k2}
8031; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm17 = ymm4[2,3,0,1]
8032; AVX512DQ-BW-FCP-NEXT:    movl $33825, %eax # imm = 0x8421
8033; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k5
8034; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm17, %ymm4 {%k5}
8035; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u]
8036; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[2,7,12,u,u,u,u,u,u,u,u,u,u]
8037; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm10[3,8,13],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
8038; AVX512DQ-BW-FCP-NEXT:    vporq %xmm17, %xmm18, %xmm9
8039; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm4[3,4,5,6,7]
8040; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7]
8041; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm11[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm11[4,9,14]
8042; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
8043; AVX512DQ-BW-FCP-NEXT:    vpor %xmm9, %xmm11, %xmm9
8044; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
8045; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm9, %zmm4
8046; AVX512DQ-BW-FCP-NEXT:    movl $33554431, %eax # imm = 0x1FFFFFF
8047; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k5
8048; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm16, %zmm4 {%k5}
8049; AVX512DQ-BW-FCP-NEXT:    vextracti64x4 $1, %zmm4, %ymm9
8050; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm8, %ymm7, %ymm11 {%k3}
8051; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm12
8052; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero,xmm12[4,9,14]
8053; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,2,7,12],zero,zero,zero,xmm11[0,5,10,15],zero,zero,zero
8054; AVX512DQ-BW-FCP-NEXT:    vpor %xmm12, %xmm11, %xmm11
8055; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
8056; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm11, %ymm9 {%k4}
8057; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm4, %zmm4
8058; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm3, %ymm2 {%k2}
8059; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm3
8060; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u]
8061; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u]
8062; AVX512DQ-BW-FCP-NEXT:    vpor %xmm3, %xmm2, %xmm2
8063; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm1, %ymm0 {%k1}
8064; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
8065; AVX512DQ-BW-FCP-NEXT:    movl $554172416, %eax # imm = 0x21080000
8066; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k2
8067; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm1, %ymm0 {%k2}
8068; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
8069; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
8070; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm6, %ymm5 {%k3}
8071; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm5[2,3,0,1]
8072; AVX512DQ-BW-FCP-NEXT:    movl $2114, %eax # imm = 0x842
8073; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k2
8074; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm1, %ymm5 {%k2}
8075; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
8076; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm13[3,8,13,u,u,u,u,u,u,u,u,u,u]
8077; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm10[4,9,14],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
8078; AVX512DQ-BW-FCP-NEXT:    vpor %xmm2, %xmm3, %xmm2
8079; AVX512DQ-BW-FCP-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7]
8080; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
8081; AVX512DQ-BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm2
8082; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
8083; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm2, %ymm19, %ymm2
8084; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
8085; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k5}
8086; AVX512DQ-BW-FCP-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
8087; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm8, %ymm7 {%k1}
8088; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero
8089; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm3
8090; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15]
8091; AVX512DQ-BW-FCP-NEXT:    vpor %xmm2, %xmm3, %xmm2
8092; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
8093; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm2, %ymm0 {%k4}
8094; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
8095; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm20, (%rsi)
8096; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, (%rdx)
8097; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, (%rcx)
8098; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, (%r8)
8099; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, (%r9)
8100; AVX512DQ-BW-FCP-NEXT:    vzeroupper
8101; AVX512DQ-BW-FCP-NEXT:    retq
8102  %wide.vec = load <320 x i8>, ptr %in.vec, align 64
8103  %strided.vec0 = shufflevector <320 x i8> %wide.vec, <320 x i8> poison, <64 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35, i32 40, i32 45, i32 50, i32 55, i32 60, i32 65, i32 70, i32 75, i32 80, i32 85, i32 90, i32 95, i32 100, i32 105, i32 110, i32 115, i32 120, i32 125, i32 130, i32 135, i32 140, i32 145, i32 150, i32 155, i32 160, i32 165, i32 170, i32 175, i32 180, i32 185, i32 190, i32 195, i32 200, i32 205, i32 210, i32 215, i32 220, i32 225, i32 230, i32 235, i32 240, i32 245, i32 250, i32 255, i32 260, i32 265, i32 270, i32 275, i32 280, i32 285, i32 290, i32 295, i32 300, i32 305, i32 310, i32 315>
8104  %strided.vec1 = shufflevector <320 x i8> %wide.vec, <320 x i8> poison, <64 x i32> <i32 1, i32 6, i32 11, i32 16, i32 21, i32 26, i32 31, i32 36, i32 41, i32 46, i32 51, i32 56, i32 61, i32 66, i32 71, i32 76, i32 81, i32 86, i32 91, i32 96, i32 101, i32 106, i32 111, i32 116, i32 121, i32 126, i32 131, i32 136, i32 141, i32 146, i32 151, i32 156, i32 161, i32 166, i32 171, i32 176, i32 181, i32 186, i32 191, i32 196, i32 201, i32 206, i32 211, i32 216, i32 221, i32 226, i32 231, i32 236, i32 241, i32 246, i32 251, i32 256, i32 261, i32 266, i32 271, i32 276, i32 281, i32 286, i32 291, i32 296, i32 301, i32 306, i32 311, i32 316>
8105  %strided.vec2 = shufflevector <320 x i8> %wide.vec, <320 x i8> poison, <64 x i32> <i32 2, i32 7, i32 12, i32 17, i32 22, i32 27, i32 32, i32 37, i32 42, i32 47, i32 52, i32 57, i32 62, i32 67, i32 72, i32 77, i32 82, i32 87, i32 92, i32 97, i32 102, i32 107, i32 112, i32 117, i32 122, i32 127, i32 132, i32 137, i32 142, i32 147, i32 152, i32 157, i32 162, i32 167, i32 172, i32 177, i32 182, i32 187, i32 192, i32 197, i32 202, i32 207, i32 212, i32 217, i32 222, i32 227, i32 232, i32 237, i32 242, i32 247, i32 252, i32 257, i32 262, i32 267, i32 272, i32 277, i32 282, i32 287, i32 292, i32 297, i32 302, i32 307, i32 312, i32 317>
8106  %strided.vec3 = shufflevector <320 x i8> %wide.vec, <320 x i8> poison, <64 x i32> <i32 3, i32 8, i32 13, i32 18, i32 23, i32 28, i32 33, i32 38, i32 43, i32 48, i32 53, i32 58, i32 63, i32 68, i32 73, i32 78, i32 83, i32 88, i32 93, i32 98, i32 103, i32 108, i32 113, i32 118, i32 123, i32 128, i32 133, i32 138, i32 143, i32 148, i32 153, i32 158, i32 163, i32 168, i32 173, i32 178, i32 183, i32 188, i32 193, i32 198, i32 203, i32 208, i32 213, i32 218, i32 223, i32 228, i32 233, i32 238, i32 243, i32 248, i32 253, i32 258, i32 263, i32 268, i32 273, i32 278, i32 283, i32 288, i32 293, i32 298, i32 303, i32 308, i32 313, i32 318>
8107  %strided.vec4 = shufflevector <320 x i8> %wide.vec, <320 x i8> poison, <64 x i32> <i32 4, i32 9, i32 14, i32 19, i32 24, i32 29, i32 34, i32 39, i32 44, i32 49, i32 54, i32 59, i32 64, i32 69, i32 74, i32 79, i32 84, i32 89, i32 94, i32 99, i32 104, i32 109, i32 114, i32 119, i32 124, i32 129, i32 134, i32 139, i32 144, i32 149, i32 154, i32 159, i32 164, i32 169, i32 174, i32 179, i32 184, i32 189, i32 194, i32 199, i32 204, i32 209, i32 214, i32 219, i32 224, i32 229, i32 234, i32 239, i32 244, i32 249, i32 254, i32 259, i32 264, i32 269, i32 274, i32 279, i32 284, i32 289, i32 294, i32 299, i32 304, i32 309, i32 314, i32 319>
8108  store <64 x i8> %strided.vec0, ptr %out.vec0, align 64
8109  store <64 x i8> %strided.vec1, ptr %out.vec1, align 64
8110  store <64 x i8> %strided.vec2, ptr %out.vec2, align 64
8111  store <64 x i8> %strided.vec3, ptr %out.vec3, align 64
8112  store <64 x i8> %strided.vec4, ptr %out.vec4, align 64
8113  ret void
8114}
8115