1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
3; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx  | FileCheck %s --check-prefixes=AVX
4; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
5; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
6; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
7; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
8; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
9; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
10; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
11; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
12; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
13; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
14; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP
15
16; These patterns are produced by LoopVectorizer for interleaved stores.
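;
; As a point of reference (not part of the test), a stride-7 interleaved store
; is the access pattern the vectorizer forms for a hypothetical scalar source
; loop along these lines, sketched here in C:
;
;   void store7(char *a, char *b, char *c, char *d, char *e, char *f, char *g,
;               char *out, int n) {
;     for (int i = 0; i < n; i++) {
;       out[7*i + 0] = a[i]; out[7*i + 1] = b[i]; out[7*i + 2] = c[i];
;       out[7*i + 3] = d[i]; out[7*i + 4] = e[i]; out[7*i + 5] = f[i];
;       out[7*i + 6] = g[i];
;     }
;   }
;
; Correspondingly, each function below loads one <VF x i8> vector from each of
; the seven input pointers, concatenates them with shufflevector, and stores a
; single <7*VF x i8> vector whose element 7*i+j is element i of input j.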
17
18define void @store_i8_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind {
19; SSE-LABEL: store_i8_stride7_vf2:
20; SSE:       # %bb.0:
21; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
22; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r10
23; SSE-NEXT:    movdqa (%rdi), %xmm0
24; SSE-NEXT:    movdqa (%rdx), %xmm1
25; SSE-NEXT:    movdqa (%r8), %xmm2
26; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
27; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
28; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
29; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
30; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
31; SSE-NEXT:    pxor %xmm1, %xmm1
32; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
33; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,7,5,6,7]
34; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
35; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,1,0,3,4,5,6,7]
36; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
37; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
38; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,1]
39; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
40; SSE-NEXT:    packuswb %xmm3, %xmm0
41; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255]
42; SSE-NEXT:    pand %xmm3, %xmm0
43; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
44; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,0,2,1]
45; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
46; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
47; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2]
48; SSE-NEXT:    packuswb %xmm1, %xmm2
49; SSE-NEXT:    pandn %xmm2, %xmm3
50; SSE-NEXT:    por %xmm0, %xmm3
51; SSE-NEXT:    pextrw $6, %xmm2, %ecx
52; SSE-NEXT:    movw %cx, 12(%rax)
53; SSE-NEXT:    movq %xmm3, (%rax)
54; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
55; SSE-NEXT:    movd %xmm0, 8(%rax)
56; SSE-NEXT:    retq
57;
58; AVX-LABEL: store_i8_stride7_vf2:
59; AVX:       # %bb.0:
60; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
61; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %r10
62; AVX-NEXT:    vmovdqa (%rdi), %xmm0
63; AVX-NEXT:    vmovdqa (%rdx), %xmm1
64; AVX-NEXT:    vmovdqa (%r8), %xmm2
65; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
66; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
67; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
68; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
69; AVX-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
70; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
71; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,2,6,10,14,3,7,11,u,u]
72; AVX-NEXT:    vpextrw $6, %xmm0, 12(%rax)
73; AVX-NEXT:    vpextrd $2, %xmm0, 8(%rax)
74; AVX-NEXT:    vmovq %xmm0, (%rax)
75; AVX-NEXT:    retq
76;
77; AVX2-LABEL: store_i8_stride7_vf2:
78; AVX2:       # %bb.0:
79; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
80; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
81; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
82; AVX2-NEXT:    vmovdqa (%rdx), %xmm1
83; AVX2-NEXT:    vmovdqa (%r8), %xmm2
84; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
85; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
86; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
87; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
88; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
89; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
90; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,2,6,10,14,3,7,11,u,u]
91; AVX2-NEXT:    vpextrw $6, %xmm0, 12(%rax)
92; AVX2-NEXT:    vpextrd $2, %xmm0, 8(%rax)
93; AVX2-NEXT:    vmovq %xmm0, (%rax)
94; AVX2-NEXT:    retq
95;
96; AVX2-FP-LABEL: store_i8_stride7_vf2:
97; AVX2-FP:       # %bb.0:
98; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
99; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
100; AVX2-FP-NEXT:    vmovdqa (%rdi), %xmm0
101; AVX2-FP-NEXT:    vmovdqa (%rdx), %xmm1
102; AVX2-FP-NEXT:    vmovdqa (%r8), %xmm2
103; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
104; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
105; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
106; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
107; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
108; AVX2-FP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
109; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,2,6,10,14,3,7,11,u,u]
110; AVX2-FP-NEXT:    vpextrw $6, %xmm0, 12(%rax)
111; AVX2-FP-NEXT:    vpextrd $2, %xmm0, 8(%rax)
112; AVX2-FP-NEXT:    vmovq %xmm0, (%rax)
113; AVX2-FP-NEXT:    retq
114;
115; AVX2-FCP-LABEL: store_i8_stride7_vf2:
116; AVX2-FCP:       # %bb.0:
117; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
118; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
119; AVX2-FCP-NEXT:    vmovdqa (%rdi), %xmm0
120; AVX2-FCP-NEXT:    vmovdqa (%rdx), %xmm1
121; AVX2-FCP-NEXT:    vmovdqa (%r8), %xmm2
122; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
123; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
124; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
125; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
126; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
127; AVX2-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
128; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,2,6,10,14,3,7,11,u,u]
129; AVX2-FCP-NEXT:    vpextrw $6, %xmm0, 12(%rax)
130; AVX2-FCP-NEXT:    vpextrd $2, %xmm0, 8(%rax)
131; AVX2-FCP-NEXT:    vmovq %xmm0, (%rax)
132; AVX2-FCP-NEXT:    retq
133;
134; AVX512-LABEL: store_i8_stride7_vf2:
135; AVX512:       # %bb.0:
136; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
137; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
138; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
139; AVX512-NEXT:    vmovdqa (%rdx), %xmm1
140; AVX512-NEXT:    vmovdqa (%r8), %xmm2
141; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
142; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
143; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
144; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
145; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
146; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
147; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,2,6,10,14,3,7,11,u,u]
148; AVX512-NEXT:    vpextrw $6, %xmm0, 12(%rax)
149; AVX512-NEXT:    vpextrd $2, %xmm0, 8(%rax)
150; AVX512-NEXT:    vmovq %xmm0, (%rax)
151; AVX512-NEXT:    retq
152;
153; AVX512-FCP-LABEL: store_i8_stride7_vf2:
154; AVX512-FCP:       # %bb.0:
155; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
156; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
157; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm0
158; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm1
159; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm2
160; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
161; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
162; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
163; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
164; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
165; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
166; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,2,6,10,14,3,7,11,u,u]
167; AVX512-FCP-NEXT:    vpextrw $6, %xmm0, 12(%rax)
168; AVX512-FCP-NEXT:    vpextrd $2, %xmm0, 8(%rax)
169; AVX512-FCP-NEXT:    vmovq %xmm0, (%rax)
170; AVX512-FCP-NEXT:    retq
171;
172; AVX512DQ-LABEL: store_i8_stride7_vf2:
173; AVX512DQ:       # %bb.0:
174; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
175; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %r10
176; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
177; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm1
178; AVX512DQ-NEXT:    vmovdqa (%r8), %xmm2
179; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
180; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
181; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
182; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
183; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
184; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
185; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,2,6,10,14,3,7,11,u,u]
186; AVX512DQ-NEXT:    vpextrw $6, %xmm0, 12(%rax)
187; AVX512DQ-NEXT:    vpextrd $2, %xmm0, 8(%rax)
188; AVX512DQ-NEXT:    vmovq %xmm0, (%rax)
189; AVX512DQ-NEXT:    retq
190;
191; AVX512DQ-FCP-LABEL: store_i8_stride7_vf2:
192; AVX512DQ-FCP:       # %bb.0:
193; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
194; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
195; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm0
196; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm1
197; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm2
198; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
199; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
200; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
201; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
202; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
203; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
204; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,2,6,10,14,3,7,11,u,u]
205; AVX512DQ-FCP-NEXT:    vpextrw $6, %xmm0, 12(%rax)
206; AVX512DQ-FCP-NEXT:    vpextrd $2, %xmm0, 8(%rax)
207; AVX512DQ-FCP-NEXT:    vmovq %xmm0, (%rax)
208; AVX512DQ-FCP-NEXT:    retq
209;
210; AVX512BW-LABEL: store_i8_stride7_vf2:
211; AVX512BW:       # %bb.0:
212; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
213; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
214; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
215; AVX512BW-NEXT:    vmovdqa (%rdx), %xmm1
216; AVX512BW-NEXT:    vmovdqa (%r8), %xmm2
217; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
218; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
219; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
220; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
221; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
222; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
223; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,2,6,10,14,3,7,11,u,u]
224; AVX512BW-NEXT:    vpextrw $6, %xmm0, 12(%rax)
225; AVX512BW-NEXT:    vpextrd $2, %xmm0, 8(%rax)
226; AVX512BW-NEXT:    vmovq %xmm0, (%rax)
227; AVX512BW-NEXT:    retq
228;
229; AVX512BW-FCP-LABEL: store_i8_stride7_vf2:
230; AVX512BW-FCP:       # %bb.0:
231; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
232; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
233; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
234; AVX512BW-FCP-NEXT:    vmovdqa (%rdx), %xmm1
235; AVX512BW-FCP-NEXT:    vmovdqa (%r8), %xmm2
236; AVX512BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
237; AVX512BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
238; AVX512BW-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
239; AVX512BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
240; AVX512BW-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
241; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
242; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,2,6,10,14,3,7,11,u,u]
243; AVX512BW-FCP-NEXT:    vpextrw $6, %xmm0, 12(%rax)
244; AVX512BW-FCP-NEXT:    vpextrd $2, %xmm0, 8(%rax)
245; AVX512BW-FCP-NEXT:    vmovq %xmm0, (%rax)
246; AVX512BW-FCP-NEXT:    retq
247;
248; AVX512DQ-BW-LABEL: store_i8_stride7_vf2:
249; AVX512DQ-BW:       # %bb.0:
250; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
251; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
252; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %xmm0
253; AVX512DQ-BW-NEXT:    vmovdqa (%rdx), %xmm1
254; AVX512DQ-BW-NEXT:    vmovdqa (%r8), %xmm2
255; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
256; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
257; AVX512DQ-BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
258; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
259; AVX512DQ-BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
260; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
261; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,2,6,10,14,3,7,11,u,u]
262; AVX512DQ-BW-NEXT:    vpextrw $6, %xmm0, 12(%rax)
263; AVX512DQ-BW-NEXT:    vpextrd $2, %xmm0, 8(%rax)
264; AVX512DQ-BW-NEXT:    vmovq %xmm0, (%rax)
265; AVX512DQ-BW-NEXT:    retq
266;
267; AVX512DQ-BW-FCP-LABEL: store_i8_stride7_vf2:
268; AVX512DQ-BW-FCP:       # %bb.0:
269; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
270; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
271; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
272; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdx), %xmm1
273; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%r8), %xmm2
274; AVX512DQ-BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
275; AVX512DQ-BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
276; AVX512DQ-BW-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
277; AVX512DQ-BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
278; AVX512DQ-BW-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
279; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
280; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,2,6,10,14,3,7,11,u,u]
281; AVX512DQ-BW-FCP-NEXT:    vpextrw $6, %xmm0, 12(%rax)
282; AVX512DQ-BW-FCP-NEXT:    vpextrd $2, %xmm0, 8(%rax)
283; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm0, (%rax)
284; AVX512DQ-BW-FCP-NEXT:    retq
285  %in.vec0 = load <2 x i8>, ptr %in.vecptr0, align 64
286  %in.vec1 = load <2 x i8>, ptr %in.vecptr1, align 64
287  %in.vec2 = load <2 x i8>, ptr %in.vecptr2, align 64
288  %in.vec3 = load <2 x i8>, ptr %in.vecptr3, align 64
289  %in.vec4 = load <2 x i8>, ptr %in.vecptr4, align 64
290  %in.vec5 = load <2 x i8>, ptr %in.vecptr5, align 64
291  %in.vec6 = load <2 x i8>, ptr %in.vecptr6, align 64
292  %1 = shufflevector <2 x i8> %in.vec0, <2 x i8> %in.vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
293  %2 = shufflevector <2 x i8> %in.vec2, <2 x i8> %in.vec3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
294  %3 = shufflevector <2 x i8> %in.vec4, <2 x i8> %in.vec5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
295  %4 = shufflevector <4 x i8> %1, <4 x i8> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
296  %5 = shufflevector <2 x i8> %in.vec6, <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
297  %6 = shufflevector <4 x i8> %3, <4 x i8> %5, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
298  %7 = shufflevector <6 x i8> %6, <6 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 undef, i32 undef>
299  %8 = shufflevector <8 x i8> %4, <8 x i8> %7, <14 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13>
300  %interleaved.vec = shufflevector <14 x i8> %8, <14 x i8> poison, <14 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13>
301  store <14 x i8> %interleaved.vec, ptr %out.vec, align 64
302  ret void
303}
304
305define void @store_i8_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind {
306; SSE-LABEL: store_i8_stride7_vf4:
307; SSE:       # %bb.0:
308; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
309; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r10
310; SSE-NEXT:    movdqa (%rdi), %xmm0
311; SSE-NEXT:    movdqa (%rdx), %xmm3
312; SSE-NEXT:    movdqa (%r8), %xmm5
313; SSE-NEXT:    movdqa (%r10), %xmm2
314; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
315; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1]
316; SSE-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],mem[0],xmm5[1],mem[1]
317; SSE-NEXT:    pxor %xmm7, %xmm7
318; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
319; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
320; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm1[0,1,3,3,4,5,6,7]
321; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm1[0,1,0,2,4,5,6,7]
322; SSE-NEXT:    packuswb %xmm4, %xmm6
323; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255]
324; SSE-NEXT:    movdqa %xmm1, %xmm4
325; SSE-NEXT:    pandn %xmm6, %xmm4
326; SSE-NEXT:    movdqa %xmm0, %xmm8
327; SSE-NEXT:    punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
328; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm8[0,2,0,0]
329; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7]
330; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[2,1,1,3]
331; SSE-NEXT:    pshuflw {{.*#+}} xmm8 = xmm8[1,1,1,1,4,5,6,7]
332; SSE-NEXT:    pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,6]
333; SSE-NEXT:    packuswb %xmm8, %xmm6
334; SSE-NEXT:    pand %xmm1, %xmm6
335; SSE-NEXT:    por %xmm4, %xmm6
336; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255]
337; SSE-NEXT:    pand %xmm4, %xmm6
338; SSE-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
339; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm5[0,0,2,3]
340; SSE-NEXT:    pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5]
341; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm5[0,1,2,0]
342; SSE-NEXT:    pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,4,6,7]
343; SSE-NEXT:    packuswb %xmm7, %xmm8
344; SSE-NEXT:    movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255]
345; SSE-NEXT:    pand %xmm7, %xmm8
346; SSE-NEXT:    pshuflw {{.*#+}} xmm9 = xmm2[0,0,0,0,4,5,6,7]
347; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0]
348; SSE-NEXT:    pandn %xmm9, %xmm7
349; SSE-NEXT:    por %xmm8, %xmm7
350; SSE-NEXT:    pandn %xmm7, %xmm4
351; SSE-NEXT:    por %xmm6, %xmm4
352; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[3,1,2,3]
353; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm5[0,3,1,3,4,5,6,7]
354; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[0,1,2,0,4,5,6,7]
355; SSE-NEXT:    packuswb %xmm6, %xmm5
356; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[3,1,2,1]
357; SSE-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
358; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[2,0,2,3,4,5,6,7]
359; SSE-NEXT:    packuswb %xmm3, %xmm6
360; SSE-NEXT:    pand %xmm1, %xmm6
361; SSE-NEXT:    pandn %xmm5, %xmm1
362; SSE-NEXT:    por %xmm6, %xmm1
363; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,0,0,0,255,255,255,255,0,255,255,255,255]
364; SSE-NEXT:    pand %xmm3, %xmm1
365; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,1,1,1,4,5,6,7]
366; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
367; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255]
368; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
369; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
370; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7]
371; SSE-NEXT:    pand %xmm5, %xmm0
372; SSE-NEXT:    pandn %xmm2, %xmm5
373; SSE-NEXT:    por %xmm0, %xmm5
374; SSE-NEXT:    pandn %xmm5, %xmm3
375; SSE-NEXT:    por %xmm1, %xmm3
376; SSE-NEXT:    movq %xmm3, 16(%rax)
377; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
378; SSE-NEXT:    movd %xmm0, 24(%rax)
379; SSE-NEXT:    movdqa %xmm4, (%rax)
380; SSE-NEXT:    retq
381;
382; AVX-LABEL: store_i8_stride7_vf4:
383; AVX:       # %bb.0:
384; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
385; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %r10
386; AVX-NEXT:    vmovdqa (%rdi), %xmm0
387; AVX-NEXT:    vmovdqa (%rdx), %xmm1
388; AVX-NEXT:    vmovdqa (%r8), %xmm2
389; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
390; AVX-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
391; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
392; AVX-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1]
393; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
394; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[0,4,8,12],zero,zero,zero,xmm0[1,5,9,13],zero,zero,zero,xmm0[2,6]
395; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[0,4,8],zero,zero,zero,zero,xmm1[1,5,9],zero,zero
396; AVX-NEXT:    vpor %xmm3, %xmm2, %xmm2
397; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[10,14],zero,zero,zero,xmm0[3,7,11,15],zero,zero,zero,xmm0[u,u,u,u]
398; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[2,6,10],zero,zero,zero,zero,xmm1[3,7,11,u,u,u,u]
399; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
400; AVX-NEXT:    vpextrd $2, %xmm0, 24(%rax)
401; AVX-NEXT:    vmovq %xmm0, 16(%rax)
402; AVX-NEXT:    vmovdqa %xmm2, (%rax)
403; AVX-NEXT:    retq
404;
405; AVX2-LABEL: store_i8_stride7_vf4:
406; AVX2:       # %bb.0:
407; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
408; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
409; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
410; AVX2-NEXT:    vmovdqa (%rsi), %xmm1
411; AVX2-NEXT:    vmovdqa (%rdx), %xmm2
412; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
413; AVX2-NEXT:    vinserti128 $1, (%r9), %ymm1, %ymm1
414; AVX2-NEXT:    vinserti128 $1, (%r8), %ymm0, %ymm0
415; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
416; AVX2-NEXT:    vinserti128 $1, (%r10), %ymm2, %ymm1
417; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
418; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27],zero,zero,zero,zero
419; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
420; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4,8],zero,zero,zero,zero,ymm0[1,5,9],zero,zero,ymm0[26,30],zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero
421; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
422; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
423; AVX2-NEXT:    vpextrd $2, %xmm1, 24(%rax)
424; AVX2-NEXT:    vmovq %xmm1, 16(%rax)
425; AVX2-NEXT:    vmovdqa %xmm0, (%rax)
426; AVX2-NEXT:    vzeroupper
427; AVX2-NEXT:    retq
428;
429; AVX2-FP-LABEL: store_i8_stride7_vf4:
430; AVX2-FP:       # %bb.0:
431; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
432; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
433; AVX2-FP-NEXT:    vmovdqa (%rdi), %xmm0
434; AVX2-FP-NEXT:    vmovdqa (%rsi), %xmm1
435; AVX2-FP-NEXT:    vmovdqa (%rdx), %xmm2
436; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
437; AVX2-FP-NEXT:    vinserti128 $1, (%r9), %ymm1, %ymm1
438; AVX2-FP-NEXT:    vinserti128 $1, (%r8), %ymm0, %ymm0
439; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
440; AVX2-FP-NEXT:    vinserti128 $1, (%r10), %ymm2, %ymm1
441; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
442; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27],zero,zero,zero,zero
443; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
444; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4,8],zero,zero,zero,zero,ymm0[1,5,9],zero,zero,ymm0[26,30],zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero
445; AVX2-FP-NEXT:    vpor %ymm0, %ymm1, %ymm0
446; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm1
447; AVX2-FP-NEXT:    vpextrd $2, %xmm1, 24(%rax)
448; AVX2-FP-NEXT:    vmovq %xmm1, 16(%rax)
449; AVX2-FP-NEXT:    vmovdqa %xmm0, (%rax)
450; AVX2-FP-NEXT:    vzeroupper
451; AVX2-FP-NEXT:    retq
452;
453; AVX2-FCP-LABEL: store_i8_stride7_vf4:
454; AVX2-FCP:       # %bb.0:
455; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
456; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
457; AVX2-FCP-NEXT:    vmovdqa (%rdi), %xmm0
458; AVX2-FCP-NEXT:    vmovdqa (%rsi), %xmm1
459; AVX2-FCP-NEXT:    vmovdqa (%rdx), %xmm2
460; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
461; AVX2-FCP-NEXT:    vinserti128 $1, (%r9), %ymm1, %ymm1
462; AVX2-FCP-NEXT:    vinserti128 $1, (%r8), %ymm0, %ymm0
463; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
464; AVX2-FCP-NEXT:    vinserti128 $1, (%r10), %ymm2, %ymm1
465; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
466; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27],zero,zero,zero,zero
467; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
468; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4,8],zero,zero,zero,zero,ymm0[1,5,9],zero,zero,ymm0[26,30],zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero
469; AVX2-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
470; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
471; AVX2-FCP-NEXT:    vpextrd $2, %xmm1, 24(%rax)
472; AVX2-FCP-NEXT:    vmovq %xmm1, 16(%rax)
473; AVX2-FCP-NEXT:    vmovdqa %xmm0, (%rax)
474; AVX2-FCP-NEXT:    vzeroupper
475; AVX2-FCP-NEXT:    retq
476;
477; AVX512-LABEL: store_i8_stride7_vf4:
478; AVX512:       # %bb.0:
479; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
480; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
481; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
482; AVX512-NEXT:    vmovdqa (%rsi), %xmm1
483; AVX512-NEXT:    vmovdqa (%rdx), %xmm2
484; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
485; AVX512-NEXT:    vinserti128 $1, (%r9), %ymm1, %ymm1
486; AVX512-NEXT:    vinserti128 $1, (%r8), %ymm0, %ymm0
487; AVX512-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
488; AVX512-NEXT:    vinserti128 $1, (%r10), %ymm2, %ymm1
489; AVX512-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
490; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27,u,u,u,u]
491; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
492; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4,8],zero,zero,zero,zero,ymm0[1,5,9],zero,zero,ymm0[26,30],zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero
493; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
494; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
495; AVX512-NEXT:    vpextrd $2, %xmm1, 24(%rax)
496; AVX512-NEXT:    vmovq %xmm1, 16(%rax)
497; AVX512-NEXT:    vmovdqa %xmm0, (%rax)
498; AVX512-NEXT:    vzeroupper
499; AVX512-NEXT:    retq
500;
501; AVX512-FCP-LABEL: store_i8_stride7_vf4:
502; AVX512-FCP:       # %bb.0:
503; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
504; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
505; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm0
506; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm1
507; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm2
508; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
509; AVX512-FCP-NEXT:    vinserti128 $1, (%r9), %ymm1, %ymm1
510; AVX512-FCP-NEXT:    vinserti128 $1, (%r8), %ymm0, %ymm0
511; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
512; AVX512-FCP-NEXT:    vinserti128 $1, (%r10), %ymm2, %ymm1
513; AVX512-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
514; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27,u,u,u,u]
515; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
516; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4,8],zero,zero,zero,zero,ymm0[1,5,9],zero,zero,ymm0[26,30],zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero
517; AVX512-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
518; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
519; AVX512-FCP-NEXT:    vpextrd $2, %xmm1, 24(%rax)
520; AVX512-FCP-NEXT:    vmovq %xmm1, 16(%rax)
521; AVX512-FCP-NEXT:    vmovdqa %xmm0, (%rax)
522; AVX512-FCP-NEXT:    vzeroupper
523; AVX512-FCP-NEXT:    retq
524;
525; AVX512DQ-LABEL: store_i8_stride7_vf4:
526; AVX512DQ:       # %bb.0:
527; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
528; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %r10
529; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
530; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm1
531; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm2
532; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
533; AVX512DQ-NEXT:    vinserti128 $1, (%r9), %ymm1, %ymm1
534; AVX512DQ-NEXT:    vinserti128 $1, (%r8), %ymm0, %ymm0
535; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
536; AVX512DQ-NEXT:    vinserti128 $1, (%r10), %ymm2, %ymm1
537; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
538; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27,u,u,u,u]
539; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
540; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4,8],zero,zero,zero,zero,ymm0[1,5,9],zero,zero,ymm0[26,30],zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero
541; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
542; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
543; AVX512DQ-NEXT:    vpextrd $2, %xmm1, 24(%rax)
544; AVX512DQ-NEXT:    vmovq %xmm1, 16(%rax)
545; AVX512DQ-NEXT:    vmovdqa %xmm0, (%rax)
546; AVX512DQ-NEXT:    vzeroupper
547; AVX512DQ-NEXT:    retq
548;
549; AVX512DQ-FCP-LABEL: store_i8_stride7_vf4:
550; AVX512DQ-FCP:       # %bb.0:
551; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
552; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
553; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm0
554; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm1
555; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm2
556; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
557; AVX512DQ-FCP-NEXT:    vinserti128 $1, (%r9), %ymm1, %ymm1
558; AVX512DQ-FCP-NEXT:    vinserti128 $1, (%r8), %ymm0, %ymm0
559; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
560; AVX512DQ-FCP-NEXT:    vinserti128 $1, (%r10), %ymm2, %ymm1
561; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
562; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27,u,u,u,u]
563; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
564; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4,8],zero,zero,zero,zero,ymm0[1,5,9],zero,zero,ymm0[26,30],zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero
565; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
566; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
567; AVX512DQ-FCP-NEXT:    vpextrd $2, %xmm1, 24(%rax)
568; AVX512DQ-FCP-NEXT:    vmovq %xmm1, 16(%rax)
569; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, (%rax)
570; AVX512DQ-FCP-NEXT:    vzeroupper
571; AVX512DQ-FCP-NEXT:    retq
572;
573; AVX512BW-LABEL: store_i8_stride7_vf4:
574; AVX512BW:       # %bb.0:
575; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
576; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
577; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
578; AVX512BW-NEXT:    vmovdqa (%rsi), %xmm1
579; AVX512BW-NEXT:    vmovdqa (%rdx), %xmm2
580; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
581; AVX512BW-NEXT:    vinserti128 $1, (%r9), %ymm1, %ymm1
582; AVX512BW-NEXT:    vinserti128 $1, (%r8), %ymm0, %ymm0
583; AVX512BW-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
584; AVX512BW-NEXT:    vinserti128 $1, (%r10), %ymm2, %ymm1
585; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
586; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27],zero,zero,zero,zero
587; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
588; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4,8],zero,zero,zero,zero,ymm0[1,5,9],zero,zero,ymm0[26,30],zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero
589; AVX512BW-NEXT:    vpor %ymm1, %ymm0, %ymm0
590; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
591; AVX512BW-NEXT:    vpextrd $2, %xmm1, 24(%rax)
592; AVX512BW-NEXT:    vmovq %xmm1, 16(%rax)
593; AVX512BW-NEXT:    vmovdqa %xmm0, (%rax)
594; AVX512BW-NEXT:    vzeroupper
595; AVX512BW-NEXT:    retq
596;
597; AVX512BW-FCP-LABEL: store_i8_stride7_vf4:
598; AVX512BW-FCP:       # %bb.0:
599; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
600; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
601; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
602; AVX512BW-FCP-NEXT:    vmovdqa (%rsi), %xmm1
603; AVX512BW-FCP-NEXT:    vmovdqa (%rdx), %xmm2
604; AVX512BW-FCP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
605; AVX512BW-FCP-NEXT:    vinserti128 $1, (%r9), %ymm1, %ymm1
606; AVX512BW-FCP-NEXT:    vinserti128 $1, (%r8), %ymm0, %ymm0
607; AVX512BW-FCP-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
608; AVX512BW-FCP-NEXT:    vinserti128 $1, (%r10), %ymm2, %ymm1
609; AVX512BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
610; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27],zero,zero,zero,zero
611; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
612; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4,8],zero,zero,zero,zero,ymm0[1,5,9],zero,zero,ymm0[26,30],zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero
613; AVX512BW-FCP-NEXT:    vpor %ymm1, %ymm0, %ymm0
614; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
615; AVX512BW-FCP-NEXT:    vpextrd $2, %xmm1, 24(%rax)
616; AVX512BW-FCP-NEXT:    vmovq %xmm1, 16(%rax)
617; AVX512BW-FCP-NEXT:    vmovdqa %xmm0, (%rax)
618; AVX512BW-FCP-NEXT:    vzeroupper
619; AVX512BW-FCP-NEXT:    retq
620;
621; AVX512DQ-BW-LABEL: store_i8_stride7_vf4:
622; AVX512DQ-BW:       # %bb.0:
623; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
624; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
625; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %xmm0
626; AVX512DQ-BW-NEXT:    vmovdqa (%rsi), %xmm1
627; AVX512DQ-BW-NEXT:    vmovdqa (%rdx), %xmm2
628; AVX512DQ-BW-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
629; AVX512DQ-BW-NEXT:    vinserti128 $1, (%r9), %ymm1, %ymm1
630; AVX512DQ-BW-NEXT:    vinserti128 $1, (%r8), %ymm0, %ymm0
631; AVX512DQ-BW-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
632; AVX512DQ-BW-NEXT:    vinserti128 $1, (%r10), %ymm2, %ymm1
633; AVX512DQ-BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
634; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27],zero,zero,zero,zero
635; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
636; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4,8],zero,zero,zero,zero,ymm0[1,5,9],zero,zero,ymm0[26,30],zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero
637; AVX512DQ-BW-NEXT:    vpor %ymm1, %ymm0, %ymm0
638; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
639; AVX512DQ-BW-NEXT:    vpextrd $2, %xmm1, 24(%rax)
640; AVX512DQ-BW-NEXT:    vmovq %xmm1, 16(%rax)
641; AVX512DQ-BW-NEXT:    vmovdqa %xmm0, (%rax)
642; AVX512DQ-BW-NEXT:    vzeroupper
643; AVX512DQ-BW-NEXT:    retq
644;
645; AVX512DQ-BW-FCP-LABEL: store_i8_stride7_vf4:
646; AVX512DQ-BW-FCP:       # %bb.0:
647; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
648; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
649; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
650; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rsi), %xmm1
651; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdx), %xmm2
652; AVX512DQ-BW-FCP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
653; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, (%r9), %ymm1, %ymm1
654; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, (%r8), %ymm0, %ymm0
655; AVX512DQ-BW-FCP-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
656; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, (%r10), %ymm2, %ymm1
657; AVX512DQ-BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
658; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27],zero,zero,zero,zero
659; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
660; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4,8],zero,zero,zero,zero,ymm0[1,5,9],zero,zero,ymm0[26,30],zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero
661; AVX512DQ-BW-FCP-NEXT:    vpor %ymm1, %ymm0, %ymm0
662; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
663; AVX512DQ-BW-FCP-NEXT:    vpextrd $2, %xmm1, 24(%rax)
664; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm1, 16(%rax)
665; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm0, (%rax)
666; AVX512DQ-BW-FCP-NEXT:    vzeroupper
667; AVX512DQ-BW-FCP-NEXT:    retq
668  %in.vec0 = load <4 x i8>, ptr %in.vecptr0, align 64
669  %in.vec1 = load <4 x i8>, ptr %in.vecptr1, align 64
670  %in.vec2 = load <4 x i8>, ptr %in.vecptr2, align 64
671  %in.vec3 = load <4 x i8>, ptr %in.vecptr3, align 64
672  %in.vec4 = load <4 x i8>, ptr %in.vecptr4, align 64
673  %in.vec5 = load <4 x i8>, ptr %in.vecptr5, align 64
674  %in.vec6 = load <4 x i8>, ptr %in.vecptr6, align 64
675  %1 = shufflevector <4 x i8> %in.vec0, <4 x i8> %in.vec1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
676  %2 = shufflevector <4 x i8> %in.vec2, <4 x i8> %in.vec3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
677  %3 = shufflevector <4 x i8> %in.vec4, <4 x i8> %in.vec5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
678  %4 = shufflevector <8 x i8> %1, <8 x i8> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
679  %5 = shufflevector <4 x i8> %in.vec6, <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
680  %6 = shufflevector <8 x i8> %3, <8 x i8> %5, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
681  %7 = shufflevector <12 x i8> %6, <12 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef, i32 undef, i32 undef>
682  %8 = shufflevector <16 x i8> %4, <16 x i8> %7, <28 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27>
683  %interleaved.vec = shufflevector <28 x i8> %8, <28 x i8> poison, <28 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27>
684  store <28 x i8> %interleaved.vec, ptr %out.vec, align 64
685  ret void
686}
687
688define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind {
689; SSE-LABEL: store_i8_stride7_vf8:
690; SSE:       # %bb.0:
691; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
692; SSE-NEXT:    movq {{.*#+}} xmm3 = mem[0],zero
693; SSE-NEXT:    movq {{.*#+}} xmm10 = mem[0],zero
694; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
695; SSE-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
696; SSE-NEXT:    movq {{.*#+}} xmm4 = mem[0],zero
697; SSE-NEXT:    movq {{.*#+}} xmm14 = mem[0],zero
698; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
699; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
700; SSE-NEXT:    movq {{.*#+}} xmm5 = mem[0],zero
701; SSE-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0]
702; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm2[0,0,0,0,4,5,6,7]
703; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
704; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
705; SSE-NEXT:    movdqa {{.*#+}} xmm6 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255]
706; SSE-NEXT:    pand %xmm6, %xmm0
707; SSE-NEXT:    movdqa %xmm4, %xmm7
708; SSE-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
709; SSE-NEXT:    pshuflw {{.*#+}} xmm8 = xmm7[0,0,2,1,4,5,6,7]
710; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,1,1,3]
711; SSE-NEXT:    pandn %xmm8, %xmm6
712; SSE-NEXT:    por %xmm0, %xmm6
713; SSE-NEXT:    movdqa {{.*#+}} xmm9 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255]
714; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm3[0,0,2,1,4,5,6,7]
715; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
716; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0]
717; SSE-NEXT:    pand %xmm8, %xmm0
718; SSE-NEXT:    punpcklbw {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
719; SSE-NEXT:    pshuflw {{.*#+}} xmm11 = xmm10[0,2,1,3,4,5,6,7]
720; SSE-NEXT:    pshufd {{.*#+}} xmm11 = xmm11[0,1,1,0]
721; SSE-NEXT:    movdqa %xmm8, %xmm12
722; SSE-NEXT:    pandn %xmm11, %xmm12
723; SSE-NEXT:    por %xmm0, %xmm12
724; SSE-NEXT:    pand %xmm9, %xmm12
725; SSE-NEXT:    pandn %xmm6, %xmm9
726; SSE-NEXT:    por %xmm12, %xmm9
727; SSE-NEXT:    pxor %xmm0, %xmm0
728; SSE-NEXT:    movdqa %xmm5, %xmm12
729; SSE-NEXT:    movdqa %xmm5, %xmm15
730; SSE-NEXT:    punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15]
731; SSE-NEXT:    punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3],xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
732; SSE-NEXT:    movdqa %xmm12, %xmm13
733; SSE-NEXT:    punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3]
734; SSE-NEXT:    movdqa %xmm13, %xmm0
735; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
736; SSE-NEXT:    movdqa %xmm13, %xmm6
737; SSE-NEXT:    packuswb %xmm0, %xmm6
738; SSE-NEXT:    movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255]
739; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm14[0,0,0,0,4,5,6,7]
740; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
741; SSE-NEXT:    movdqa %xmm11, %xmm14
742; SSE-NEXT:    pandn %xmm0, %xmm14
743; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,0,2,3]
744; SSE-NEXT:    pand %xmm11, %xmm0
745; SSE-NEXT:    por %xmm0, %xmm14
746; SSE-NEXT:    movdqa {{.*#+}} xmm6 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255]
747; SSE-NEXT:    pand %xmm6, %xmm9
748; SSE-NEXT:    pandn %xmm14, %xmm6
749; SSE-NEXT:    por %xmm9, %xmm6
750; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3]
751; SSE-NEXT:    movdqa %xmm11, %xmm9
752; SSE-NEXT:    pandn %xmm0, %xmm9
753; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm2[2,1,3,3,4,5,6,7]
754; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
755; SSE-NEXT:    pand %xmm11, %xmm0
756; SSE-NEXT:    por %xmm9, %xmm0
757; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
758; SSE-NEXT:    movdqa %xmm1, %xmm9
759; SSE-NEXT:    pandn %xmm0, %xmm9
760; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,5,6,6,7]
761; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,2,2,2]
762; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255]
763; SSE-NEXT:    movdqa %xmm0, %xmm14
764; SSE-NEXT:    pandn %xmm2, %xmm14
765; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm3[0,2,2,3,4,5,6,7]
766; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3]
767; SSE-NEXT:    pand %xmm0, %xmm2
768; SSE-NEXT:    por %xmm2, %xmm14
769; SSE-NEXT:    pand %xmm1, %xmm14
770; SSE-NEXT:    por %xmm9, %xmm14
771; SSE-NEXT:    movdqa %xmm15, %xmm2
772; SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7]
773; SSE-NEXT:    punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7]
774; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7]
775; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4]
776; SSE-NEXT:    pshuflw {{.*#+}} xmm9 = xmm12[0,1,2,2,4,5,6,7]
777; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,1,2,1]
778; SSE-NEXT:    packuswb %xmm2, %xmm9
779; SSE-NEXT:    movdqa {{.*#+}} xmm12 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255]
780; SSE-NEXT:    pand %xmm12, %xmm9
781; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
782; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm15[2,2,2,2,4,5,6,7]
783; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
784; SSE-NEXT:    pandn %xmm2, %xmm12
785; SSE-NEXT:    por %xmm9, %xmm12
786; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255]
787; SSE-NEXT:    pand %xmm2, %xmm12
788; SSE-NEXT:    pandn %xmm14, %xmm2
789; SSE-NEXT:    por %xmm2, %xmm12
790; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm13[2,2,3,3]
791; SSE-NEXT:    psrldq {{.*#+}} xmm13 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
792; SSE-NEXT:    packuswb %xmm13, %xmm2
793; SSE-NEXT:    pand %xmm0, %xmm2
794; SSE-NEXT:    pshuflw {{.*#+}} xmm9 = xmm15[1,1,1,1,4,5,6,7]
795; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0]
796; SSE-NEXT:    pandn %xmm9, %xmm0
797; SSE-NEXT:    por %xmm2, %xmm0
798; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm10[0,1,2,2]
799; SSE-NEXT:    movdqa %xmm3, %xmm10
800; SSE-NEXT:    pshuflw {{.*#+}} xmm9 = xmm3[1,1,2,3,4,5,6,7]
801; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,0,2,1]
802; SSE-NEXT:    pand %xmm11, %xmm9
803; SSE-NEXT:    pandn %xmm2, %xmm11
804; SSE-NEXT:    por %xmm9, %xmm11
805; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
806; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm3[1,1,2,2,4,5,6,7]
807; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1]
808; SSE-NEXT:    pand %xmm8, %xmm2
809; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,1,2,1]
810; SSE-NEXT:    pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,4]
811; SSE-NEXT:    pandn %xmm7, %xmm8
812; SSE-NEXT:    por %xmm2, %xmm8
813; SSE-NEXT:    pand %xmm1, %xmm8
814; SSE-NEXT:    pandn %xmm11, %xmm1
815; SSE-NEXT:    por %xmm8, %xmm1
816; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
817; SSE-NEXT:    pand %xmm2, %xmm1
818; SSE-NEXT:    pandn %xmm0, %xmm2
819; SSE-NEXT:    por %xmm2, %xmm1
820; SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
821; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,7,7,7,7]
822; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,2,2,2]
823; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
824; SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7]
825; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255]
826; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7]
827; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,2,2,2]
828; SSE-NEXT:    pand %xmm0, %xmm4
829; SSE-NEXT:    pandn %xmm2, %xmm0
830; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm5[3,1,2,3]
831; SSE-NEXT:    por %xmm4, %xmm0
832; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm15[3,3,3,3,4,5,6,7]
833; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255]
834; SSE-NEXT:    pand %xmm4, %xmm3
835; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
836; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
837; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
838; SSE-NEXT:    pandn %xmm2, %xmm4
839; SSE-NEXT:    por %xmm3, %xmm4
840; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [0,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255]
841; SSE-NEXT:    pand %xmm2, %xmm0
842; SSE-NEXT:    pandn %xmm4, %xmm2
843; SSE-NEXT:    por %xmm0, %xmm2
844; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
845; SSE-NEXT:    movq %xmm2, 48(%rax)
846; SSE-NEXT:    movdqa %xmm1, 16(%rax)
847; SSE-NEXT:    movdqa %xmm12, 32(%rax)
848; SSE-NEXT:    movdqa %xmm6, (%rax)
849; SSE-NEXT:    retq
850;
851; AVX-LABEL: store_i8_stride7_vf8:
852; AVX:       # %bb.0:
853; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
854; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %r10
855; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
856; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
857; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
858; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
859; AVX-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
860; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
861; AVX-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
862; AVX-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
863; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
864; AVX-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
865; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm2[u,u,u],zero,zero,xmm2[5,13,u,u,u],zero,zero,xmm2[6,14,u,u]
866; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,5,13],zero,zero,xmm1[u,u,u,6,14],zero,zero,xmm1[u,u]
867; AVX-NEXT:    vpor %xmm4, %xmm5, %xmm4
868; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[4,12],zero,xmm0[u,u,u,u,5,13],zero,xmm0[u,u,u,u,6,14]
869; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,xmm3[4,u,u,u,u],zero,zero,xmm3[5,u,u,u,u],zero,zero
870; AVX-NEXT:    vpor %xmm6, %xmm5, %xmm5
871; AVX-NEXT:    vmovdqa {{.*#+}} xmm6 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255]
872; AVX-NEXT:    vpblendvb %xmm6, %xmm5, %xmm4, %xmm4
873; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm2[u],zero,zero,xmm2[7,15,u,u,u,u,u,u,u,u,u,u,u]
874; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[u,7,15],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
875; AVX-NEXT:    vpor %xmm5, %xmm6, %xmm5
876; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = zero,xmm0[u,u,u,u,7,15],zero,xmm0[u,u,u,u,u,u,u,u]
877; AVX-NEXT:    vpshufb {{.*#+}} xmm7 = xmm3[6,u,u,u,u],zero,zero,xmm3[7,u,u,u,u,u,u,u,u]
878; AVX-NEXT:    vpor %xmm6, %xmm7, %xmm6
879; AVX-NEXT:    vmovq {{.*#+}} xmm7 = [0,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
880; AVX-NEXT:    vpblendvb %xmm7, %xmm5, %xmm6, %xmm5
881; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,xmm2[0,8,u,u,u],zero,zero,xmm2[1,9,u,u,u],zero,zero
882; AVX-NEXT:    vpshufb {{.*#+}} xmm7 = xmm1[0,8],zero,zero,xmm1[u,u,u,1,9],zero,zero,xmm1[u,u,u,2,10]
883; AVX-NEXT:    vpor %xmm6, %xmm7, %xmm6
884; AVX-NEXT:    vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,0,8],zero,xmm0[u,u,u,u,1,9],zero,xmm0[u,u]
885; AVX-NEXT:    vpshufb {{.*#+}} xmm8 = xmm3[u,u,u,u],zero,zero,xmm3[0,u,u,u,u],zero,zero,xmm3[1,u,u]
886; AVX-NEXT:    vpor %xmm7, %xmm8, %xmm7
887; AVX-NEXT:    vmovdqa {{.*#+}} xmm8 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255]
888; AVX-NEXT:    vpblendvb %xmm8, %xmm6, %xmm7, %xmm6
889; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[u,u,u,3,11],zero,zero,xmm1[u,u,u,4,12],zero,zero
890; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[2,10,u,u,u],zero,zero,xmm2[3,11,u,u,u],zero,zero,xmm2[4,12]
891; AVX-NEXT:    vpor %xmm1, %xmm2, %xmm1
892; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,2,10],zero,xmm0[u,u,u,u,3,11],zero,xmm0[u,u,u,u]
893; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm3[u,u],zero,zero,xmm3[2,u,u,u,u],zero,zero,xmm3[3,u,u,u,u]
894; AVX-NEXT:    vpor %xmm2, %xmm0, %xmm0
895; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
896; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
897; AVX-NEXT:    vmovdqa %xmm0, 16(%rax)
898; AVX-NEXT:    vmovdqa %xmm6, (%rax)
899; AVX-NEXT:    vmovq %xmm5, 48(%rax)
900; AVX-NEXT:    vmovdqa %xmm4, 32(%rax)
901; AVX-NEXT:    retq
902;
903; AVX2-LABEL: store_i8_stride7_vf8:
904; AVX2:       # %bb.0:
905; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
906; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
907; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
908; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
909; AVX2-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
910; AVX2-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
911; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
912; AVX2-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
913; AVX2-NEXT:    vmovq {{.*#+}} xmm4 = mem[0],zero
914; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
915; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
916; AVX2-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
917; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
918; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
919; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3]
920; AVX2-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm2[5,13],zero,zero,zero,zero,zero,ymm2[6,14],zero,zero,zero,zero,zero,zero,zero,ymm2[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
921; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
922; AVX2-NEXT:    vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,ymm4[5,13],zero,zero,zero,zero,zero,ymm4[6,14],zero,zero,zero,ymm4[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
923; AVX2-NEXT:    vpor %ymm5, %ymm3, %ymm3
924; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2]
925; AVX2-NEXT:    vpshufb {{.*#+}} ymm5 = zero,zero,ymm0[4],zero,zero,zero,zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero
926; AVX2-NEXT:    vpshufb {{.*#+}} ymm6 = ymm1[4,12],zero,zero,zero,zero,zero,ymm1[5,13],zero,zero,zero,zero,zero,ymm1[6,14,22],zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,zero,zero
927; AVX2-NEXT:    vpor %ymm5, %ymm6, %ymm5
928; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,u,u,u,u,u,u,u,u]
929; AVX2-NEXT:    vpblendvb %ymm6, %ymm3, %ymm5, %ymm3
930; AVX2-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[2,10,18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28]
931; AVX2-NEXT:    vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[0,8],zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,zero,ymm4[20,28],zero,zero
932; AVX2-NEXT:    vpor %ymm4, %ymm2, %ymm2
933; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,ymm0[1],zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,zero
934; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[0,8],zero,zero,zero,zero,zero,ymm1[1,9],zero,zero,zero,zero,zero,zero,zero,ymm1[18],zero,zero,zero,zero,zero,zero,ymm1[19],zero,zero,zero,zero
935; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
936; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
937; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
938; AVX2-NEXT:    vmovdqa %ymm0, (%rax)
939; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm0
940; AVX2-NEXT:    vmovq %xmm0, 48(%rax)
941; AVX2-NEXT:    vmovdqa %xmm3, 32(%rax)
942; AVX2-NEXT:    vzeroupper
943; AVX2-NEXT:    retq
944;
945; AVX2-FP-LABEL: store_i8_stride7_vf8:
946; AVX2-FP:       # %bb.0:
947; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
948; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
949; AVX2-FP-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
950; AVX2-FP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
951; AVX2-FP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
952; AVX2-FP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
953; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
954; AVX2-FP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
955; AVX2-FP-NEXT:    vmovq {{.*#+}} xmm4 = mem[0],zero
956; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
957; AVX2-FP-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
958; AVX2-FP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
959; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
960; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
961; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3]
962; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm2[5,13],zero,zero,zero,zero,zero,ymm2[6,14],zero,zero,zero,zero,zero,zero,zero,ymm2[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
963; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
964; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,ymm4[5,13],zero,zero,zero,zero,zero,ymm4[6,14],zero,zero,zero,ymm4[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
965; AVX2-FP-NEXT:    vpor %ymm5, %ymm3, %ymm3
966; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2]
967; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm5 = zero,zero,ymm0[4],zero,zero,zero,zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero
968; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm1[4,12],zero,zero,zero,zero,zero,ymm1[5,13],zero,zero,zero,zero,zero,ymm1[6,14,22],zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,zero,zero
969; AVX2-FP-NEXT:    vpor %ymm5, %ymm6, %ymm5
970; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,u,u,u,u,u,u,u,u]
971; AVX2-FP-NEXT:    vpblendvb %ymm6, %ymm3, %ymm5, %ymm3
972; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[2,10,18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28]
973; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[0,8],zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,zero,ymm4[20,28],zero,zero
974; AVX2-FP-NEXT:    vpor %ymm4, %ymm2, %ymm2
975; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,ymm0[1],zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,zero
976; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[0,8],zero,zero,zero,zero,zero,ymm1[1,9],zero,zero,zero,zero,zero,zero,zero,ymm1[18],zero,zero,zero,zero,zero,zero,ymm1[19],zero,zero,zero,zero
977; AVX2-FP-NEXT:    vpor %ymm0, %ymm1, %ymm0
978; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm1 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
979; AVX2-FP-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
980; AVX2-FP-NEXT:    vmovdqa %ymm0, (%rax)
981; AVX2-FP-NEXT:    vextracti128 $1, %ymm3, %xmm0
982; AVX2-FP-NEXT:    vmovq %xmm0, 48(%rax)
983; AVX2-FP-NEXT:    vmovdqa %xmm3, 32(%rax)
984; AVX2-FP-NEXT:    vzeroupper
985; AVX2-FP-NEXT:    retq
986;
987; AVX2-FCP-LABEL: store_i8_stride7_vf8:
988; AVX2-FCP:       # %bb.0:
989; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
990; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
991; AVX2-FCP-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
992; AVX2-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
993; AVX2-FCP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
994; AVX2-FCP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
995; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
996; AVX2-FCP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
997; AVX2-FCP-NEXT:    vmovq {{.*#+}} xmm4 = mem[0],zero
998; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
999; AVX2-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
1000; AVX2-FCP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
1001; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1002; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
1003; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
1004; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [1,3,5,7,1,3,5,7]
1005; AVX2-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
1006; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm1, %ymm1
1007; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm1[1,5,9,13],zero,zero,zero,ymm1[2,6,10,14],zero,zero,zero,ymm1[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1008; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,3,5,0,5,1,3,0]
1009; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm3, %ymm3
1010; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[0,4,8],zero,zero,zero,zero,ymm3[1,5,9],zero,zero,zero,zero,ymm3[2,6,18],zero,zero,zero,zero,ymm3[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero
1011; AVX2-FCP-NEXT:    vpor %ymm3, %ymm1, %ymm1
1012; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[2,10,18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28]
1013; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
1014; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28],zero,zero
1015; AVX2-FCP-NEXT:    vpor %ymm2, %ymm3, %ymm2
1016; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6]
1017; AVX2-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
1018; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm3, %ymm0
1019; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,4,8,u,u,u,u,1,5,9,u,u,u,u,18,22,26,u,u,u,u,19,23,27,u,u,u,u]
1020; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
1021; AVX2-FCP-NEXT:    vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
1022; AVX2-FCP-NEXT:    vmovdqa %ymm0, (%rax)
1023; AVX2-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm0
1024; AVX2-FCP-NEXT:    vmovq %xmm0, 48(%rax)
1025; AVX2-FCP-NEXT:    vmovdqa %xmm1, 32(%rax)
1026; AVX2-FCP-NEXT:    vzeroupper
1027; AVX2-FCP-NEXT:    retq
1028;
1029; AVX512-LABEL: store_i8_stride7_vf8:
1030; AVX512:       # %bb.0:
1031; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1032; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
1033; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
1034; AVX512-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
1035; AVX512-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
1036; AVX512-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
1037; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
1038; AVX512-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
1039; AVX512-NEXT:    vmovq {{.*#+}} xmm4 = mem[0],zero
1040; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
1041; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
1042; AVX512-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
1043; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1044; AVX512-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
1045; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3]
1046; AVX512-NEXT:    vpshufb {{.*#+}} ymm3 = ymm2[0,8],zero,zero,ymm2[u,u,u,1,9],zero,zero,ymm2[u,u,u,2,10,18,26,u,u,u],zero,zero,ymm2[19,27,u,u,u],zero,zero,ymm2[20,28]
1047; AVX512-NEXT:    vpshufb {{.*#+}} ymm4 = ymm2[u,u,u,5,13],zero,zero,ymm2[u,u,u,6,14],zero,zero,ymm2[u,u,u],zero,zero,ymm2[23,31,u,u,u,u,u,u,u,u,u,u,u]
1048; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
1049; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
1050; AVX512-NEXT:    vpshufb {{.*#+}} ymm4 = zero,zero,ymm2[0,8,u,u,u],zero,zero,ymm2[1,9,u,u,u],zero,zero,zero,zero,ymm2[u,u,u,19,27],zero,zero,ymm2[u,u,u,20,28],zero,zero
1051; AVX512-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u],zero,zero,ymm2[5,13,u,u,u],zero,zero,ymm2[6,14,u,u,u,23,31],zero,zero,ymm2[u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
1052; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm4, %zmm2
1053; AVX512-NEXT:    vporq %zmm2, %zmm3, %zmm2
1054; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2]
1055; AVX512-NEXT:    vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u],zero,zero,ymm0[0,u,u,u,u],zero,zero,ymm0[1,u,u,u,u,18,26],zero,ymm0[u,u,u,u,19,27],zero,ymm0[u,u,u,u]
1056; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[4,u,u,u,u],zero,zero,ymm0[5,u,u,u,u],zero,zero,zero,ymm0[u,u,u,u,23,31],zero,ymm0[u,u,u,u,u,u,u,u]
1057; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm3, %zmm0
1058; AVX512-NEXT:    vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,0,8],zero,ymm1[u,u,u,u,1,9],zero,ymm1[u,u,u,u],zero,zero,ymm1[18,u,u,u,u],zero,zero,ymm1[19,u,u,u,u]
1059; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[4,12],zero,ymm1[u,u,u,u,5,13],zero,ymm1[u,u,u,u,6,14,22,u,u,u,u],zero,zero,ymm1[23,u,u,u,u,u,u,u,u]
1060; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
1061; AVX512-NEXT:    vporq %zmm0, %zmm1, %zmm0
1062; AVX512-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
1063; AVX512-NEXT:    vextracti32x4 $2, %zmm0, 32(%rax)
1064; AVX512-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
1065; AVX512-NEXT:    vmovq %xmm1, 48(%rax)
1066; AVX512-NEXT:    vmovdqa %ymm0, (%rax)
1067; AVX512-NEXT:    vzeroupper
1068; AVX512-NEXT:    retq
1069;
1070; AVX512-FCP-LABEL: store_i8_stride7_vf8:
1071; AVX512-FCP:       # %bb.0:
1072; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1073; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
1074; AVX512-FCP-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
1075; AVX512-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
1076; AVX512-FCP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
1077; AVX512-FCP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
1078; AVX512-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
1079; AVX512-FCP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
1080; AVX512-FCP-NEXT:    vmovq {{.*#+}} xmm4 = mem[0],zero
1081; AVX512-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
1082; AVX512-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
1083; AVX512-FCP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
1084; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1085; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0]
1086; AVX512-FCP-NEXT:    vpermi2q %ymm3, %ymm0, %ymm1
1087; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [0,2,4,6,0,2,4,6]
1088; AVX512-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
1089; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm0
1090; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4,8],zero,zero,zero,zero,ymm0[1,5,9],zero,zero,zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27],zero,zero,zero,zero
1091; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,3,5,0,5,1,3,0]
1092; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm3, %ymm1
1093; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,4,8],zero,zero,zero,zero,ymm1[1,5,9],zero,zero,zero,zero,ymm1[2,6,18],zero,zero,zero,zero,ymm1[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero
1094; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
1095; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[0,8],zero,zero,ymm2[u,u,u,1,9],zero,zero,ymm2[u,u,u,2,10,18,26,u,u,u],zero,zero,ymm2[19,27,u,u,u],zero,zero,ymm2[20,28]
1096; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
1097; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u,u,u],zero,zero,ymm3[1,9,u,u,u],zero,zero,zero,zero,ymm3[u,u,u,19,27],zero,zero,ymm3[u,u,u,20,28],zero,zero
1098; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = mem & (ymm3 | ymm1)
1099; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [1,3,5,7,1,3,5,7]
1100; AVX512-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
1101; AVX512-FCP-NEXT:    vpermd %ymm2, %ymm1, %ymm1
1102; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm1[1,5,9,13],zero,zero,zero,ymm1[2,6,10,14],zero,zero,zero,ymm1[19,23,27,31],zero,zero,zero,ymm1[u,u,u,u,u,u,u,u]
1103; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
1104; AVX512-FCP-NEXT:    vporq %zmm0, %zmm1, %zmm0
1105; AVX512-FCP-NEXT:    vextracti32x4 $2, %zmm0, 32(%rax)
1106; AVX512-FCP-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
1107; AVX512-FCP-NEXT:    vmovq %xmm1, 48(%rax)
1108; AVX512-FCP-NEXT:    vmovdqa %ymm0, (%rax)
1109; AVX512-FCP-NEXT:    vzeroupper
1110; AVX512-FCP-NEXT:    retq
1111;
1112; AVX512DQ-LABEL: store_i8_stride7_vf8:
1113; AVX512DQ:       # %bb.0:
1114; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1115; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %r10
1116; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
1117; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
1118; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
1119; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
1120; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
1121; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
1122; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm4 = mem[0],zero
1123; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
1124; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
1125; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
1126; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1127; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
1128; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3]
1129; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm3 = ymm2[0,8],zero,zero,ymm2[u,u,u,1,9],zero,zero,ymm2[u,u,u,2,10,18,26,u,u,u],zero,zero,ymm2[19,27,u,u,u],zero,zero,ymm2[20,28]
1130; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm4 = ymm2[u,u,u,5,13],zero,zero,ymm2[u,u,u,6,14],zero,zero,ymm2[u,u,u],zero,zero,ymm2[23,31,u,u,u,u,u,u,u,u,u,u,u]
1131; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
1132; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
1133; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm4 = zero,zero,ymm2[0,8,u,u,u],zero,zero,ymm2[1,9,u,u,u],zero,zero,zero,zero,ymm2[u,u,u,19,27],zero,zero,ymm2[u,u,u,20,28],zero,zero
1134; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u],zero,zero,ymm2[5,13,u,u,u],zero,zero,ymm2[6,14,u,u,u,23,31],zero,zero,ymm2[u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
1135; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm4, %zmm2
1136; AVX512DQ-NEXT:    vporq %zmm2, %zmm3, %zmm2
1137; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2]
1138; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u],zero,zero,ymm0[0,u,u,u,u],zero,zero,ymm0[1,u,u,u,u,18,26],zero,ymm0[u,u,u,u,19,27],zero,ymm0[u,u,u,u]
1139; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[4,u,u,u,u],zero,zero,ymm0[5,u,u,u,u],zero,zero,zero,ymm0[u,u,u,u,23,31],zero,ymm0[u,u,u,u,u,u,u,u]
1140; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm3, %zmm0
1141; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,0,8],zero,ymm1[u,u,u,u,1,9],zero,ymm1[u,u,u,u],zero,zero,ymm1[18,u,u,u,u],zero,zero,ymm1[19,u,u,u,u]
1142; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[4,12],zero,ymm1[u,u,u,u,5,13],zero,ymm1[u,u,u,u,6,14,22,u,u,u,u],zero,zero,ymm1[23,u,u,u,u,u,u,u,u]
1143; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
1144; AVX512DQ-NEXT:    vporq %zmm0, %zmm1, %zmm0
1145; AVX512DQ-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
1146; AVX512DQ-NEXT:    vextracti32x4 $2, %zmm0, 32(%rax)
1147; AVX512DQ-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
1148; AVX512DQ-NEXT:    vmovq %xmm1, 48(%rax)
1149; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rax)
1150; AVX512DQ-NEXT:    vzeroupper
1151; AVX512DQ-NEXT:    retq
1152;
1153; AVX512DQ-FCP-LABEL: store_i8_stride7_vf8:
1154; AVX512DQ-FCP:       # %bb.0:
1155; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1156; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
1157; AVX512DQ-FCP-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
1158; AVX512DQ-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
1159; AVX512DQ-FCP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
1160; AVX512DQ-FCP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
1161; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
1162; AVX512DQ-FCP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
1163; AVX512DQ-FCP-NEXT:    vmovq {{.*#+}} xmm4 = mem[0],zero
1164; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
1165; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
1166; AVX512DQ-FCP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
1167; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1168; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0]
1169; AVX512DQ-FCP-NEXT:    vpermi2q %ymm3, %ymm0, %ymm1
1170; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [0,2,4,6,0,2,4,6]
1171; AVX512DQ-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
1172; AVX512DQ-FCP-NEXT:    vpermd %ymm1, %ymm0, %ymm0
1173; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4,8],zero,zero,zero,zero,ymm0[1,5,9],zero,zero,zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27],zero,zero,zero,zero
1174; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,3,5,0,5,1,3,0]
1175; AVX512DQ-FCP-NEXT:    vpermd %ymm1, %ymm3, %ymm1
1176; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,4,8],zero,zero,zero,zero,ymm1[1,5,9],zero,zero,zero,zero,ymm1[2,6,18],zero,zero,zero,zero,ymm1[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero
1177; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
1178; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[0,8],zero,zero,ymm2[u,u,u,1,9],zero,zero,ymm2[u,u,u,2,10,18,26,u,u,u],zero,zero,ymm2[19,27,u,u,u],zero,zero,ymm2[20,28]
1179; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
1180; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u,u,u],zero,zero,ymm3[1,9,u,u,u],zero,zero,zero,zero,ymm3[u,u,u,19,27],zero,zero,ymm3[u,u,u,20,28],zero,zero
1181; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = mem & (ymm3 | ymm1)
1182; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [1,3,5,7,1,3,5,7]
1183; AVX512DQ-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
1184; AVX512DQ-FCP-NEXT:    vpermd %ymm2, %ymm1, %ymm1
1185; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm1[1,5,9,13],zero,zero,zero,ymm1[2,6,10,14],zero,zero,zero,ymm1[19,23,27,31],zero,zero,zero,ymm1[u,u,u,u,u,u,u,u]
1186; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
1187; AVX512DQ-FCP-NEXT:    vporq %zmm0, %zmm1, %zmm0
1188; AVX512DQ-FCP-NEXT:    vextracti32x4 $2, %zmm0, 32(%rax)
1189; AVX512DQ-FCP-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
1190; AVX512DQ-FCP-NEXT:    vmovq %xmm1, 48(%rax)
1191; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, (%rax)
1192; AVX512DQ-FCP-NEXT:    vzeroupper
1193; AVX512DQ-FCP-NEXT:    retq
1194;
1195; AVX512BW-LABEL: store_i8_stride7_vf8:
1196; AVX512BW:       # %bb.0:
1197; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1198; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
1199; AVX512BW-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
1200; AVX512BW-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
1201; AVX512BW-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
1202; AVX512BW-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
1203; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
1204; AVX512BW-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
1205; AVX512BW-NEXT:    vmovq {{.*#+}} xmm4 = mem[0],zero
1206; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
1207; AVX512BW-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
1208; AVX512BW-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
1209; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1210; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
1211; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
1212; AVX512BW-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[0,2,1,3,4,6,5,7]
1213; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm1 = zero,zero,zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zero,zero,zmm1[18],zero,zero,zero,zero,zero,zero,zmm1[19],zero,zero,zero,zero,zmm1[36,44],zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46,54],zero,zero,zero,zero,zero,zero,zmm1[55],zero,zero,zero,zero,zero,zero,zero,zero
1214; AVX512BW-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[1,3,0,2,5,7,4,6]
1215; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zmm0[0],zero,zero,zero,zero,zero,zero,zmm0[1],zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zero,zero,zmm0[36],zero,zero,zero,zero,zero,zero,zmm0[37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero
1216; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
1217; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm2, %zmm1
1218; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm2 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10,18,26],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zero,zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1219; AVX512BW-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5]
1220; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm1 = zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1221; AVX512BW-NEXT:    vporq %zmm2, %zmm1, %zmm1
1222; AVX512BW-NEXT:    movabsq $63546854584629360, %rcx # imm = 0xE1C3870E1C3870
1223; AVX512BW-NEXT:    kmovq %rcx, %k1
1224; AVX512BW-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
1225; AVX512BW-NEXT:    vextracti32x4 $2, %zmm1, 32(%rax)
1226; AVX512BW-NEXT:    vextracti32x4 $3, %zmm1, %xmm0
1227; AVX512BW-NEXT:    vmovq %xmm0, 48(%rax)
1228; AVX512BW-NEXT:    vmovdqa %ymm1, (%rax)
1229; AVX512BW-NEXT:    vzeroupper
1230; AVX512BW-NEXT:    retq
1231;
1232; AVX512BW-FCP-LABEL: store_i8_stride7_vf8:
1233; AVX512BW-FCP:       # %bb.0:
1234; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1235; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
1236; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
1237; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
1238; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
1239; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
1240; AVX512BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
1241; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
1242; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm4 = mem[0],zero
1243; AVX512BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
1244; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
1245; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
1246; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1247; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0]
1248; AVX512BW-FCP-NEXT:    vpermi2q %ymm3, %ymm0, %ymm1
1249; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[2,10,18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28]
1250; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
1251; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8],zero,zero,zero,zero,zero,ymm3[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[19,27],zero,zero,zero,zero,zero,ymm3[20,28],zero,zero
1252; AVX512BW-FCP-NEXT:    vpor %ymm0, %ymm3, %ymm0
1253; AVX512BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6]
1254; AVX512BW-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
1255; AVX512BW-FCP-NEXT:    vpermd %ymm1, %ymm3, %ymm3
1256; AVX512BW-FCP-NEXT:    movl $236730480, %ecx # imm = 0xE1C3870
1257; AVX512BW-FCP-NEXT:    kmovd %ecx, %k1
1258; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm3[u,u,u,u,0,4,8,u,u,u,u,1,5,9,u,u,u,u,18,22,26,u,u,u,u,19,23,27,u,u,u,u]
1259; AVX512BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [1,3,5,7,1,3,5,7]
1260; AVX512BW-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
1261; AVX512BW-FCP-NEXT:    vpermd %ymm2, %ymm3, %ymm2
1262; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm2[1,5,9,13],zero,zero,zero,ymm2[2,6,10,14],zero,zero,zero,ymm2[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1263; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,3,5,0,5,1,3,0]
1264; AVX512BW-FCP-NEXT:    vpermd %ymm1, %ymm3, %ymm1
1265; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,4,8],zero,zero,zero,zero,ymm1[1,5,9],zero,zero,zero,zero,ymm1[2,6,18],zero,zero,zero,zero,ymm1[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero
1266; AVX512BW-FCP-NEXT:    vpor %ymm2, %ymm1, %ymm1
1267; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm2
1268; AVX512BW-FCP-NEXT:    vmovq %xmm2, 48(%rax)
1269; AVX512BW-FCP-NEXT:    vmovdqa %xmm1, 32(%rax)
1270; AVX512BW-FCP-NEXT:    vmovdqa %ymm0, (%rax)
1271; AVX512BW-FCP-NEXT:    vzeroupper
1272; AVX512BW-FCP-NEXT:    retq
1273;
1274; AVX512DQ-BW-LABEL: store_i8_stride7_vf8:
1275; AVX512DQ-BW:       # %bb.0:
1276; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1277; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
1278; AVX512DQ-BW-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
1279; AVX512DQ-BW-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
1280; AVX512DQ-BW-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
1281; AVX512DQ-BW-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
1282; AVX512DQ-BW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
1283; AVX512DQ-BW-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
1284; AVX512DQ-BW-NEXT:    vmovq {{.*#+}} xmm4 = mem[0],zero
1285; AVX512DQ-BW-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
1286; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
1287; AVX512DQ-BW-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
1288; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1289; AVX512DQ-BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
1290; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
1291; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[0,2,1,3,4,6,5,7]
1292; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} zmm1 = zero,zero,zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zero,zero,zmm1[18],zero,zero,zero,zero,zero,zero,zmm1[19],zero,zero,zero,zero,zmm1[36,44],zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46,54],zero,zero,zero,zero,zero,zero,zmm1[55],zero,zero,zero,zero,zero,zero,zero,zero
1293; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[1,3,0,2,5,7,4,6]
1294; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zmm0[0],zero,zero,zero,zero,zero,zero,zmm0[1],zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zero,zero,zmm0[36],zero,zero,zero,zero,zero,zero,zmm0[37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero
1295; AVX512DQ-BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
1296; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm2, %zmm1
1297; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} zmm2 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10,18,26],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zero,zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1298; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5]
1299; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} zmm1 = zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1300; AVX512DQ-BW-NEXT:    vporq %zmm2, %zmm1, %zmm1
1301; AVX512DQ-BW-NEXT:    movabsq $63546854584629360, %rcx # imm = 0xE1C3870E1C3870
1302; AVX512DQ-BW-NEXT:    kmovq %rcx, %k1
1303; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
1304; AVX512DQ-BW-NEXT:    vextracti32x4 $2, %zmm1, 32(%rax)
1305; AVX512DQ-BW-NEXT:    vextracti32x4 $3, %zmm1, %xmm0
1306; AVX512DQ-BW-NEXT:    vmovq %xmm0, 48(%rax)
1307; AVX512DQ-BW-NEXT:    vmovdqa %ymm1, (%rax)
1308; AVX512DQ-BW-NEXT:    vzeroupper
1309; AVX512DQ-BW-NEXT:    retq
1310;
1311; AVX512DQ-BW-FCP-LABEL: store_i8_stride7_vf8:
1312; AVX512DQ-BW-FCP:       # %bb.0:
1313; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1314; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
1315; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
1316; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
1317; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
1318; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
1319; AVX512DQ-BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
1320; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
1321; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm4 = mem[0],zero
1322; AVX512DQ-BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
1323; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
1324; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
1325; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1326; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0]
1327; AVX512DQ-BW-FCP-NEXT:    vpermi2q %ymm3, %ymm0, %ymm1
1328; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[2,10,18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28]
1329; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
1330; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8],zero,zero,zero,zero,zero,ymm3[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[19,27],zero,zero,zero,zero,zero,ymm3[20,28],zero,zero
1331; AVX512DQ-BW-FCP-NEXT:    vpor %ymm0, %ymm3, %ymm0
1332; AVX512DQ-BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6]
1333; AVX512DQ-BW-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
1334; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm1, %ymm3, %ymm3
1335; AVX512DQ-BW-FCP-NEXT:    movl $236730480, %ecx # imm = 0xE1C3870
1336; AVX512DQ-BW-FCP-NEXT:    kmovd %ecx, %k1
1337; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm3[u,u,u,u,0,4,8,u,u,u,u,1,5,9,u,u,u,u,18,22,26,u,u,u,u,19,23,27,u,u,u,u]
1338; AVX512DQ-BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [1,3,5,7,1,3,5,7]
1339; AVX512DQ-BW-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
1340; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm2, %ymm3, %ymm2
1341; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm2[1,5,9,13],zero,zero,zero,ymm2[2,6,10,14],zero,zero,zero,ymm2[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1342; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,3,5,0,5,1,3,0]
1343; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm1, %ymm3, %ymm1
1344; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,4,8],zero,zero,zero,zero,ymm1[1,5,9],zero,zero,zero,zero,ymm1[2,6,18],zero,zero,zero,zero,ymm1[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero
1345; AVX512DQ-BW-FCP-NEXT:    vpor %ymm2, %ymm1, %ymm1
1346; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm2
1347; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm2, 48(%rax)
1348; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm1, 32(%rax)
1349; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm0, (%rax)
1350; AVX512DQ-BW-FCP-NEXT:    vzeroupper
1351; AVX512DQ-BW-FCP-NEXT:    retq
1352  %in.vec0 = load <8 x i8>, ptr %in.vecptr0, align 64
1353  %in.vec1 = load <8 x i8>, ptr %in.vecptr1, align 64
1354  %in.vec2 = load <8 x i8>, ptr %in.vecptr2, align 64
1355  %in.vec3 = load <8 x i8>, ptr %in.vecptr3, align 64
1356  %in.vec4 = load <8 x i8>, ptr %in.vecptr4, align 64
1357  %in.vec5 = load <8 x i8>, ptr %in.vecptr5, align 64
1358  %in.vec6 = load <8 x i8>, ptr %in.vecptr6, align 64
1359  %1 = shufflevector <8 x i8> %in.vec0, <8 x i8> %in.vec1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1360  %2 = shufflevector <8 x i8> %in.vec2, <8 x i8> %in.vec3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1361  %3 = shufflevector <8 x i8> %in.vec4, <8 x i8> %in.vec5, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1362  %4 = shufflevector <16 x i8> %1, <16 x i8> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1363  %5 = shufflevector <8 x i8> %in.vec6, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1364  %6 = shufflevector <16 x i8> %3, <16 x i8> %5, <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
1365  %7 = shufflevector <24 x i8> %6, <24 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1366  %8 = shufflevector <32 x i8> %4, <32 x i8> %7, <56 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
1367  %interleaved.vec = shufflevector <56 x i8> %8, <56 x i8> poison, <56 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 50, i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 51, i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55>
1368  store <56 x i8> %interleaved.vec, ptr %out.vec, align 64
1369  ret void
1370}
1371
1372define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind {
1373; SSE-LABEL: store_i8_stride7_vf16:
1374; SSE:       # %bb.0:
1375; SSE-NEXT:    subq $56, %rsp
1376; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1377; SSE-NEXT:    movdqa (%rdi), %xmm12
1378; SSE-NEXT:    movdqa (%rsi), %xmm4
1379; SSE-NEXT:    movdqa (%rdx), %xmm0
1380; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1381; SSE-NEXT:    movdqa (%rcx), %xmm5
1382; SSE-NEXT:    movdqa (%r8), %xmm7
1383; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1384; SSE-NEXT:    movdqa (%r9), %xmm8
1385; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
1386; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
1387; SSE-NEXT:    movdqa {{.*#+}} xmm13 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255]
1388; SSE-NEXT:    pand %xmm13, %xmm0
1389; SSE-NEXT:    movdqa %xmm5, %xmm1
1390; SSE-NEXT:    movdqa %xmm5, %xmm6
1391; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1392; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15]
1393; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1394; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7]
1395; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1396; SSE-NEXT:    movdqa %xmm13, %xmm2
1397; SSE-NEXT:    pandn %xmm1, %xmm2
1398; SSE-NEXT:    por %xmm0, %xmm2
1399; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255]
1400; SSE-NEXT:    movdqa %xmm0, %xmm1
1401; SSE-NEXT:    pandn %xmm2, %xmm1
1402; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,7,7,7,7]
1403; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,2,2,2]
1404; SSE-NEXT:    movdqa {{.*#+}} xmm10 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255]
1405; SSE-NEXT:    pand %xmm10, %xmm3
1406; SSE-NEXT:    movdqa %xmm4, %xmm9
1407; SSE-NEXT:    movdqa %xmm4, %xmm5
1408; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1409; SSE-NEXT:    punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm4[8],xmm9[9],xmm4[9],xmm9[10],xmm4[10],xmm9[11],xmm4[11],xmm9[12],xmm4[12],xmm9[13],xmm4[13],xmm9[14],xmm4[14],xmm9[15],xmm4[15]
1410; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm9[0,1,2,3,4,6,6,7]
1411; SSE-NEXT:    movdqa %xmm9, (%rsp) # 16-byte Spill
1412; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,1,3,3]
1413; SSE-NEXT:    pandn %xmm4, %xmm10
1414; SSE-NEXT:    por %xmm3, %xmm10
1415; SSE-NEXT:    pand %xmm0, %xmm10
1416; SSE-NEXT:    por %xmm1, %xmm10
1417; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,7,7,7,7]
1418; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
1419; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255]
1420; SSE-NEXT:    movdqa %xmm2, %xmm4
1421; SSE-NEXT:    pandn %xmm1, %xmm4
1422; SSE-NEXT:    movdqa %xmm8, %xmm1
1423; SSE-NEXT:    movdqa %xmm8, %xmm3
1424; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1425; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15]
1426; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1427; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,6,7]
1428; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,2,3]
1429; SSE-NEXT:    pand %xmm2, %xmm1
1430; SSE-NEXT:    por %xmm4, %xmm1
1431; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255]
1432; SSE-NEXT:    pand %xmm4, %xmm10
1433; SSE-NEXT:    pandn %xmm1, %xmm4
1434; SSE-NEXT:    movdqa (%rax), %xmm15
1435; SSE-NEXT:    por %xmm10, %xmm4
1436; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm15[0,1,2,3,6,7,7,7]
1437; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,3,2]
1438; SSE-NEXT:    movdqa {{.*#+}} xmm11 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0]
1439; SSE-NEXT:    movdqa %xmm11, %xmm7
1440; SSE-NEXT:    pandn %xmm1, %xmm7
1441; SSE-NEXT:    pand %xmm11, %xmm4
1442; SSE-NEXT:    por %xmm4, %xmm7
1443; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1444; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[2,1,2,3]
1445; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1446; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,1,0,3]
1447; SSE-NEXT:    movdqa {{.*#+}} xmm10 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255]
1448; SSE-NEXT:    movdqa %xmm10, %xmm4
1449; SSE-NEXT:    pandn %xmm1, %xmm4
1450; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm12[3,3,3,3,4,5,6,7]
1451; SSE-NEXT:    movdqa %xmm12, %xmm5
1452; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
1453; SSE-NEXT:    pand %xmm10, %xmm1
1454; SSE-NEXT:    por %xmm1, %xmm4
1455; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm6[2,1,2,3]
1456; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1457; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,3,0,3]
1458; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,3,3,3,4,5,6,7]
1459; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
1460; SSE-NEXT:    movdqa {{.*#+}} xmm12 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255]
1461; SSE-NEXT:    movdqa %xmm12, %xmm14
1462; SSE-NEXT:    pandn %xmm1, %xmm14
1463; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
1464; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm8[3,3,3,3,4,5,6,7]
1465; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
1466; SSE-NEXT:    pand %xmm12, %xmm1
1467; SSE-NEXT:    por %xmm1, %xmm14
1468; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255]
1469; SSE-NEXT:    pand %xmm1, %xmm4
1470; SSE-NEXT:    pandn %xmm14, %xmm1
1471; SSE-NEXT:    por %xmm4, %xmm1
1472; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[2,1,2,3]
1473; SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1474; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,3,2,0]
1475; SSE-NEXT:    movdqa %xmm2, %xmm14
1476; SSE-NEXT:    pandn %xmm4, %xmm14
1477; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
1478; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm6[3,3,3,3,4,5,6,7]
1479; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4]
1480; SSE-NEXT:    pand %xmm2, %xmm4
1481; SSE-NEXT:    por %xmm4, %xmm14
1482; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255]
1483; SSE-NEXT:    pand %xmm4, %xmm14
1484; SSE-NEXT:    pshuflw {{.*#+}} xmm7 = xmm15[3,3,3,3,4,5,6,7]
1485; SSE-NEXT:    movdqa %xmm15, %xmm3
1486; SSE-NEXT:    pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4]
1487; SSE-NEXT:    pandn %xmm7, %xmm4
1488; SSE-NEXT:    por %xmm14, %xmm4
1489; SSE-NEXT:    movdqa {{.*#+}} xmm7 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255]
1490; SSE-NEXT:    pand %xmm7, %xmm1
1491; SSE-NEXT:    pandn %xmm4, %xmm7
1492; SSE-NEXT:    por %xmm1, %xmm7
1493; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1494; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,4,5,5,7]
1495; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1496; SSE-NEXT:    movdqa %xmm13, %xmm4
1497; SSE-NEXT:    pandn %xmm1, %xmm4
1498; SSE-NEXT:    movdqa %xmm5, %xmm15
1499; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,6,6,6,6]
1500; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
1501; SSE-NEXT:    pand %xmm13, %xmm1
1502; SSE-NEXT:    por %xmm1, %xmm4
1503; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,6,6,6,6]
1504; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
1505; SSE-NEXT:    movdqa %xmm2, %xmm7
1506; SSE-NEXT:    pandn %xmm1, %xmm7
1507; SSE-NEXT:    pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
1508; SSE-NEXT:    # xmm1 = mem[2,1,2,3]
1509; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7]
1510; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
1511; SSE-NEXT:    pand %xmm2, %xmm1
1512; SSE-NEXT:    por %xmm7, %xmm1
1513; SSE-NEXT:    pand %xmm0, %xmm1
1514; SSE-NEXT:    pandn %xmm4, %xmm0
1515; SSE-NEXT:    por %xmm1, %xmm0
1516; SSE-NEXT:    pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
1517; SSE-NEXT:    # xmm1 = mem[1,1,2,3]
1518; SSE-NEXT:    movdqa %xmm10, %xmm4
1519; SSE-NEXT:    pandn %xmm1, %xmm4
1520; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,5,6,6,7]
1521; SSE-NEXT:    movdqa %xmm6, %xmm9
1522; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,3,2]
1523; SSE-NEXT:    pand %xmm10, %xmm1
1524; SSE-NEXT:    por %xmm4, %xmm1
1525; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,6,6]
1526; SSE-NEXT:    movdqa %xmm3, %xmm6
1527; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,1,3,3]
1528; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255]
1529; SSE-NEXT:    movdqa %xmm5, %xmm7
1530; SSE-NEXT:    pandn %xmm4, %xmm7
1531; SSE-NEXT:    pand %xmm5, %xmm1
1532; SSE-NEXT:    por %xmm1, %xmm7
1533; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0]
1534; SSE-NEXT:    pand %xmm1, %xmm0
1535; SSE-NEXT:    pandn %xmm7, %xmm1
1536; SSE-NEXT:    por %xmm0, %xmm1
1537; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1538; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1539; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1540; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1541; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
1542; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
1543; SSE-NEXT:    movdqa %xmm5, %xmm1
1544; SSE-NEXT:    pandn %xmm0, %xmm1
1545; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm8[0,0,0,0,4,5,6,7]
1546; SSE-NEXT:    movdqa %xmm8, %xmm3
1547; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1548; SSE-NEXT:    pand %xmm5, %xmm0
1549; SSE-NEXT:    por %xmm0, %xmm1
1550; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1551; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1552; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1553; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
1554; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
1555; SSE-NEXT:    movdqa %xmm11, %xmm7
1556; SSE-NEXT:    pandn %xmm0, %xmm7
1557; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm15[0,0,2,1,4,5,6,7]
1558; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
1559; SSE-NEXT:    pand %xmm11, %xmm0
1560; SSE-NEXT:    por %xmm0, %xmm7
1561; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255]
1562; SSE-NEXT:    pand %xmm0, %xmm7
1563; SSE-NEXT:    pandn %xmm1, %xmm0
1564; SSE-NEXT:    por %xmm7, %xmm0
1565; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1566; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1567; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1568; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
1569; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1]
1570; SSE-NEXT:    movdqa %xmm13, %xmm7
1571; SSE-NEXT:    pandn %xmm1, %xmm7
1572; SSE-NEXT:    movdqa %xmm9, %xmm5
1573; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm9[0,0,0,0,4,5,6,7]
1574; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
1575; SSE-NEXT:    pand %xmm13, %xmm1
1576; SSE-NEXT:    por %xmm1, %xmm7
1577; SSE-NEXT:    movdqa %xmm6, %xmm8
1578; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm6[0,0,0,0,4,5,6,7]
1579; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
1580; SSE-NEXT:    movdqa %xmm2, %xmm9
1581; SSE-NEXT:    pandn %xmm1, %xmm9
1582; SSE-NEXT:    pand %xmm2, %xmm7
1583; SSE-NEXT:    por %xmm7, %xmm9
1584; SSE-NEXT:    movdqa {{.*#+}} xmm14 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255]
1585; SSE-NEXT:    pand %xmm14, %xmm0
1586; SSE-NEXT:    pandn %xmm9, %xmm14
1587; SSE-NEXT:    por %xmm0, %xmm14
1588; SSE-NEXT:    pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1589; SSE-NEXT:    # xmm0 = mem[0,1,1,3]
1590; SSE-NEXT:    movdqa %xmm10, %xmm7
1591; SSE-NEXT:    pandn %xmm0, %xmm7
1592; SSE-NEXT:    movdqa %xmm3, %xmm1
1593; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,5,5,7]
1594; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm0[2,1,3,2]
1595; SSE-NEXT:    pand %xmm10, %xmm9
1596; SSE-NEXT:    por %xmm7, %xmm9
1597; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,5,5,5,5]
1598; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
1599; SSE-NEXT:    movdqa %xmm2, %xmm7
1600; SSE-NEXT:    pandn %xmm0, %xmm7
1601; SSE-NEXT:    pshuflw $233, (%rsp), %xmm0 # 16-byte Folded Reload
1602; SSE-NEXT:    # xmm0 = mem[1,2,2,3,4,5,6,7]
1603; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[0,0,2,1]
1604; SSE-NEXT:    pand %xmm2, %xmm6
1605; SSE-NEXT:    por %xmm7, %xmm6
1606; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0]
1607; SSE-NEXT:    pand %xmm0, %xmm6
1608; SSE-NEXT:    pandn %xmm9, %xmm0
1609; SSE-NEXT:    por %xmm6, %xmm0
1610; SSE-NEXT:    pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
1611; SSE-NEXT:    # xmm3 = mem[1,2,2,3,4,5,6,7]
1612; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
1613; SSE-NEXT:    movdqa %xmm12, %xmm6
1614; SSE-NEXT:    pandn %xmm3, %xmm6
1615; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,4,6,5]
1616; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3]
1617; SSE-NEXT:    pand %xmm12, %xmm3
1618; SSE-NEXT:    por %xmm3, %xmm6
1619; SSE-NEXT:    pand %xmm13, %xmm6
1620; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,5,5,7]
1621; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
1622; SSE-NEXT:    pandn %xmm3, %xmm13
1623; SSE-NEXT:    por %xmm6, %xmm13
1624; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255]
1625; SSE-NEXT:    pand %xmm3, %xmm0
1626; SSE-NEXT:    pandn %xmm13, %xmm3
1627; SSE-NEXT:    por %xmm0, %xmm3
1628; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1629; SSE-NEXT:    # xmm0 = mem[2,2,3,3]
1630; SSE-NEXT:    movdqa %xmm2, %xmm6
1631; SSE-NEXT:    pandn %xmm0, %xmm6
1632; SSE-NEXT:    movdqa %xmm1, %xmm13
1633; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[2,1,3,3,4,5,6,7]
1634; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
1635; SSE-NEXT:    pand %xmm2, %xmm0
1636; SSE-NEXT:    por %xmm6, %xmm0
1637; SSE-NEXT:    movdqa {{.*#+}} xmm6 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
1638; SSE-NEXT:    movdqa %xmm6, %xmm7
1639; SSE-NEXT:    pandn %xmm0, %xmm7
1640; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1641; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,5,6,6,7]
1642; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm0[2,2,2,2]
1643; SSE-NEXT:    movdqa %xmm12, %xmm0
1644; SSE-NEXT:    pandn %xmm9, %xmm0
1645; SSE-NEXT:    pshuflw {{.*#+}} xmm9 = xmm15[0,2,2,3,4,5,6,7]
1646; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,1,1,3]
1647; SSE-NEXT:    pand %xmm12, %xmm9
1648; SSE-NEXT:    por %xmm9, %xmm0
1649; SSE-NEXT:    pand %xmm6, %xmm0
1650; SSE-NEXT:    por %xmm7, %xmm0
1651; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
1652; SSE-NEXT:    pshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,6,5,7]
1653; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[2,1,3,2]
1654; SSE-NEXT:    movdqa %xmm11, %xmm9
1655; SSE-NEXT:    pandn %xmm7, %xmm9
1656; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
1657; SSE-NEXT:    pshuflw {{.*#+}} xmm7 = xmm4[2,2,2,3,4,5,6,7]
1658; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[0,0,2,1]
1659; SSE-NEXT:    pand %xmm11, %xmm7
1660; SSE-NEXT:    por %xmm7, %xmm9
1661; SSE-NEXT:    pand %xmm10, %xmm9
1662; SSE-NEXT:    pshuflw {{.*#+}} xmm7 = xmm8[2,2,2,2,4,5,6,7]
1663; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[0,0,0,0]
1664; SSE-NEXT:    pandn %xmm7, %xmm10
1665; SSE-NEXT:    por %xmm9, %xmm10
1666; SSE-NEXT:    movdqa {{.*#+}} xmm7 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255]
1667; SSE-NEXT:    pand %xmm7, %xmm10
1668; SSE-NEXT:    pandn %xmm0, %xmm7
1669; SSE-NEXT:    por %xmm7, %xmm10
1670; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,2,2]
1671; SSE-NEXT:    pshuflw {{.*#+}} xmm7 = xmm15[1,1,2,3,4,5,6,7]
1672; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[0,0,2,1]
1673; SSE-NEXT:    pand %xmm2, %xmm7
1674; SSE-NEXT:    pandn %xmm0, %xmm2
1675; SSE-NEXT:    por %xmm7, %xmm2
1676; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm13[1,1,2,2,4,5,6,7]
1677; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
1678; SSE-NEXT:    pand %xmm11, %xmm0
1679; SSE-NEXT:    pshufd $101, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
1680; SSE-NEXT:    # xmm7 = mem[1,1,2,1]
1681; SSE-NEXT:    pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,4]
1682; SSE-NEXT:    pandn %xmm7, %xmm11
1683; SSE-NEXT:    por %xmm0, %xmm11
1684; SSE-NEXT:    pand %xmm6, %xmm11
1685; SSE-NEXT:    pandn %xmm2, %xmm6
1686; SSE-NEXT:    por %xmm11, %xmm6
1687; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm4[1,1,1,1,4,5,6,7]
1688; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1689; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255]
1690; SSE-NEXT:    pand %xmm1, %xmm0
1691; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm5[0,2,2,3,4,5,6,7]
1692; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3]
1693; SSE-NEXT:    pandn %xmm2, %xmm1
1694; SSE-NEXT:    por %xmm0, %xmm1
1695; SSE-NEXT:    pand %xmm12, %xmm1
1696; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm8[1,1,1,1,4,5,6,7]
1697; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1698; SSE-NEXT:    pandn %xmm0, %xmm12
1699; SSE-NEXT:    por %xmm1, %xmm12
1700; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
1701; SSE-NEXT:    pand %xmm0, %xmm6
1702; SSE-NEXT:    pandn %xmm12, %xmm0
1703; SSE-NEXT:    por %xmm6, %xmm0
1704; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1705; SSE-NEXT:    movdqa %xmm0, 16(%rax)
1706; SSE-NEXT:    movdqa %xmm10, 32(%rax)
1707; SSE-NEXT:    movdqa %xmm3, 64(%rax)
1708; SSE-NEXT:    movdqa %xmm14, (%rax)
1709; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1710; SSE-NEXT:    movaps %xmm0, 80(%rax)
1711; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1712; SSE-NEXT:    movaps %xmm0, 48(%rax)
1713; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1714; SSE-NEXT:    movaps %xmm0, 96(%rax)
1715; SSE-NEXT:    addq $56, %rsp
1716; SSE-NEXT:    retq
1717;
1718; AVX-LABEL: store_i8_stride7_vf16:
1719; AVX:       # %bb.0:
1720; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1721; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %r10
1722; AVX-NEXT:    vmovdqa (%rdi), %xmm3
1723; AVX-NEXT:    vmovdqa (%rsi), %xmm5
1724; AVX-NEXT:    vmovdqa (%rdx), %xmm6
1725; AVX-NEXT:    vmovdqa (%rcx), %xmm7
1726; AVX-NEXT:    vmovdqa (%r8), %xmm0
1727; AVX-NEXT:    vmovdqa (%r9), %xmm2
1728; AVX-NEXT:    vmovdqa (%r10), %xmm1
1729; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
1730; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,xmm10[6,7],zero,zero,zero,zero,zero,xmm10[8,9],zero,zero
1731; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
1732; AVX-NEXT:    vpshufb {{.*#+}} xmm8 = xmm11[4,5],zero,zero,zero,zero,zero,xmm11[6,7],zero,zero,zero,zero,zero,xmm11[8,9]
1733; AVX-NEXT:    vpor %xmm4, %xmm8, %xmm4
1734; AVX-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,xmm11[0,1],zero,zero,zero,zero,zero,xmm11[2,3],zero,zero,zero,zero,zero
1735; AVX-NEXT:    vpshufb {{.*#+}} xmm9 = xmm10[0,1],zero,zero,zero,zero,zero,xmm10[2,3],zero,zero,zero,zero,zero,xmm10[4,5]
1736; AVX-NEXT:    vpor %xmm8, %xmm9, %xmm8
1737; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm8, %ymm4
1738; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1739; AVX-NEXT:    vpshufb {{.*#+}} xmm8 = xmm9[u,u,4,5],zero,xmm9[u,u,u,u,6,7],zero,xmm9[u,u,u,u]
1740; AVX-NEXT:    vpshufb {{.*#+}} xmm12 = xmm1[u,u],zero,zero,xmm1[2,u,u,u,u],zero,zero,xmm1[3,u,u,u,u]
1741; AVX-NEXT:    vpor %xmm12, %xmm8, %xmm8
1742; AVX-NEXT:    vpshufb {{.*#+}} xmm12 = xmm9[u,u,u,u,0,1,u,u,u,u,u,2,3,u,u,u]
1743; AVX-NEXT:    vpalignr {{.*#+}} xmm12 = xmm12[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
1744; AVX-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,0,1,12,u,u,u,u,7,8,13,u,u]
1745; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm12, %ymm8
1746; AVX-NEXT:    vmovaps {{.*#+}} ymm12 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
1747; AVX-NEXT:    vandnps %ymm8, %ymm12, %ymm8
1748; AVX-NEXT:    vorps %ymm4, %ymm8, %ymm4
1749; AVX-NEXT:    vpshufb {{.*#+}} xmm8 = xmm5[u],zero,xmm5[7,u,u,u,u,u],zero,xmm5[8,u,u,u,u,u],zero
1750; AVX-NEXT:    vpshufb {{.*#+}} xmm12 = xmm3[u,7],zero,xmm3[u,u,u,u,u,8],zero,xmm3[u,u,u,u,u,9]
1751; AVX-NEXT:    vpor %xmm8, %xmm12, %xmm12
1752; AVX-NEXT:    vpshufb {{.*#+}} xmm8 = xmm7[u,u,u],zero,xmm7[7,u,u,u,u,u],zero,xmm7[8,u,u,u,u]
1753; AVX-NEXT:    vpshufb {{.*#+}} xmm13 = xmm6[u,u,u,7],zero,xmm6[u,u,u,u,u,8],zero,xmm6[u,u,u,u]
1754; AVX-NEXT:    vpor %xmm8, %xmm13, %xmm13
1755; AVX-NEXT:    vmovdqa {{.*#+}} ymm8 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0]
1756; AVX-NEXT:    vpblendvb %xmm8, %xmm12, %xmm13, %xmm12
1757; AVX-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,zero,xmm11[10,11],zero,zero,zero,zero,zero,xmm11[12,13],zero,zero
1758; AVX-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[10,11],zero,zero,zero,zero,zero,xmm10[12,13],zero,zero,zero,zero
1759; AVX-NEXT:    vpor %xmm11, %xmm10, %xmm10
1760; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm10, %ymm10
1761; AVX-NEXT:    vmovaps {{.*#+}} ymm11 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255]
1762; AVX-NEXT:    vandps %ymm11, %ymm10, %ymm10
1763; AVX-NEXT:    vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u],zero,xmm2[7,u,u,u,u,u],zero,xmm2[8,u,u]
1764; AVX-NEXT:    vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,7],zero,xmm0[u,u,u,u,u,8],zero,xmm0[u,u]
1765; AVX-NEXT:    vpor %xmm12, %xmm13, %xmm12
1766; AVX-NEXT:    vpshufb {{.*#+}} xmm12 = zero,xmm12[u,u,u,u,5,6],zero,xmm12[u,u,u,u,12,13],zero,xmm12[u]
1767; AVX-NEXT:    vpshufb {{.*#+}} xmm13 = xmm1[6,u,u,u,u],zero,zero,xmm1[7,u,u,u,u],zero,zero,xmm1[8,u]
1768; AVX-NEXT:    vpor %xmm13, %xmm12, %xmm12
1769; AVX-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[8,9],zero,xmm9[u,u,u,u,10,11],zero,xmm9[u,u,u,u,12,13]
1770; AVX-NEXT:    vpshufb {{.*#+}} xmm13 = zero,zero,xmm1[4,u,u,u,u],zero,zero,xmm1[5,u,u,u,u],zero,zero
1771; AVX-NEXT:    vpor %xmm13, %xmm9, %xmm9
1772; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm9, %ymm9
1773; AVX-NEXT:    vandnps %ymm9, %ymm11, %ymm9
1774; AVX-NEXT:    vorps %ymm9, %ymm10, %ymm9
1775; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
1776; AVX-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,xmm10[8,9],zero,zero,zero,zero,zero,xmm10[10,11],zero,zero,zero
1777; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm12 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15]
1778; AVX-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[6],zero,zero,zero,zero,zero,xmm12[9,8],zero,zero,zero,zero,zero,xmm12[11,10],zero
1779; AVX-NEXT:    vpor %xmm11, %xmm12, %xmm11
1780; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15]
1781; AVX-NEXT:    vpshufb {{.*#+}} xmm7 = zero,xmm6[2,3],zero,zero,zero,zero,zero,xmm6[4,5],zero,zero,zero,zero,zero,xmm6[6]
1782; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
1783; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[2],zero,zero,zero,zero,zero,xmm3[5,4],zero,zero,zero,zero,zero,xmm3[7,6],zero
1784; AVX-NEXT:    vpor %xmm7, %xmm3, %xmm3
1785; AVX-NEXT:    vinsertf128 $1, %xmm11, %ymm3, %ymm3
1786; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
1787; AVX-NEXT:    vpshufb {{.*#+}} xmm7 = xmm5[u,6,7],zero,xmm5[u,u,u,u,8,9],zero,xmm5[u,u,u,u,10]
1788; AVX-NEXT:    vpshufb {{.*#+}} xmm11 = xmm1[u],zero,zero,xmm1[11,u,u,u,u],zero,zero,xmm1[12,u,u,u,u],zero
1789; AVX-NEXT:    vpor %xmm7, %xmm11, %xmm7
1790; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,2,3],zero,xmm5[u,u,u,u,4,5],zero,xmm5[u,u,u]
1791; AVX-NEXT:    vpshufb {{.*#+}} xmm11 = xmm1[u,u,u],zero,zero,xmm1[9,u,u,u,u],zero,zero,xmm1[10,u,u,u]
1792; AVX-NEXT:    vpor %xmm5, %xmm11, %xmm5
1793; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm5, %ymm5
1794; AVX-NEXT:    vandnps %ymm5, %ymm8, %ymm5
1795; AVX-NEXT:    vorps %ymm5, %ymm3, %ymm3
1796; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,xmm6[12,13],zero,zero,zero,zero,zero,xmm6[14,15],zero,zero,zero
1797; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,xmm10[12,13],zero,zero,zero,zero,zero,xmm10[14,15],zero,zero,zero,zero,zero
1798; AVX-NEXT:    vpor %xmm5, %xmm6, %xmm5
1799; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
1800; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[10],zero,xmm0[u,u,u,u,13,12],zero,xmm0[u,u,u,u,15,14],zero
1801; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = zero,xmm1[13,u,u,u,u],zero,zero,xmm1[14,u,u,u,u],zero,zero,xmm1[15]
1802; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
1803; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0]
1804; AVX-NEXT:    vpblendvb %xmm1, %xmm5, %xmm0, %xmm0
1805; AVX-NEXT:    vmovdqa %xmm0, 96(%rax)
1806; AVX-NEXT:    vmovaps %ymm3, 64(%rax)
1807; AVX-NEXT:    vmovaps %ymm9, 32(%rax)
1808; AVX-NEXT:    vmovaps %ymm4, (%rax)
1809; AVX-NEXT:    vzeroupper
1810; AVX-NEXT:    retq
1811;
1812; AVX2-LABEL: store_i8_stride7_vf16:
1813; AVX2:       # %bb.0:
1814; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1815; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
1816; AVX2-NEXT:    vmovdqa (%rdi), %xmm3
1817; AVX2-NEXT:    vmovdqa (%rdx), %xmm4
1818; AVX2-NEXT:    vmovdqa (%r8), %xmm1
1819; AVX2-NEXT:    vmovdqa (%r9), %xmm2
1820; AVX2-NEXT:    vmovdqa (%r10), %xmm0
1821; AVX2-NEXT:    vinserti128 $1, (%rsi), %ymm3, %ymm3
1822; AVX2-NEXT:    vinserti128 $1, (%rcx), %ymm4, %ymm4
1823; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm6
1824; AVX2-NEXT:    vpshufb {{.*#+}} ymm5 = zero,zero,zero,ymm3[5],zero,zero,zero,zero,zero,zero,ymm3[6],zero,zero,zero,zero,zero,zero,zero,ymm3[23],zero,zero,zero,zero,zero,zero,ymm3[24],zero,zero,zero,zero,zero,zero
1825; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1]
1826; AVX2-NEXT:    vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero,zero,ymm7[25]
1827; AVX2-NEXT:    vpor %ymm5, %ymm7, %ymm5
1828; AVX2-NEXT:    vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,ymm4[5],zero,zero,zero,zero,zero,zero,ymm4[6],zero,zero,zero,zero,zero,zero,zero,ymm4[23],zero,zero,zero,zero,zero,zero,ymm4[24],zero,zero,zero,zero
1829; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm4[2,3,0,1]
1830; AVX2-NEXT:    vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,ymm8[5],zero,zero,zero,zero,zero,zero,ymm8[6],zero,zero,zero,zero,zero,ymm8[23],zero,zero,zero,zero,zero,zero,ymm8[24],zero,zero,zero,zero,zero
1831; AVX2-NEXT:    vpor %ymm7, %ymm8, %ymm7
1832; AVX2-NEXT:    vmovdqa {{.*#+}} ymm8 = [u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255]
1833; AVX2-NEXT:    vpblendvb %ymm8, %ymm5, %ymm7, %ymm5
1834; AVX2-NEXT:    vpshufb {{.*#+}} ymm7 = ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero
1835; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1]
1836; AVX2-NEXT:    vpshufb {{.*#+}} ymm8 = zero,ymm8[4],zero,zero,zero,zero,zero,zero,ymm8[5],zero,zero,zero,zero,zero,zero,ymm8[6],zero,zero,zero,zero,zero,ymm8[23],zero,zero,zero,zero,zero,zero,ymm8[24],zero,zero,zero
1837; AVX2-NEXT:    vpor %ymm7, %ymm8, %ymm7
1838; AVX2-NEXT:    vpshufb {{.*#+}} xmm8 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
1839; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,1,0]
1840; AVX2-NEXT:    vmovdqa {{.*#+}} ymm9 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u]
1841; AVX2-NEXT:    vpblendvb %ymm9, %ymm7, %ymm8, %ymm7
1842; AVX2-NEXT:    vmovdqa {{.*#+}} ymm8 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255]
1843; AVX2-NEXT:    vpblendvb %ymm8, %ymm5, %ymm7, %ymm5
1844; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm6[0,2,0,2]
1845; AVX2-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u]
1846; AVX2-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm0[1,1,0,0,4,5,6,7]
1847; AVX2-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,0]
1848; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,1,0]
1849; AVX2-NEXT:    vmovdqa {{.*#+}} ymm9 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u]
1850; AVX2-NEXT:    vpblendvb %ymm9, %ymm7, %ymm8, %ymm7
1851; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm4[0,2,0,2]
1852; AVX2-NEXT:    vpshufb {{.*#+}} ymm8 = zero,zero,ymm8[0,8],zero,zero,zero,zero,zero,ymm8[1,9],zero,zero,zero,zero,zero,ymm8[18,26],zero,zero,zero,zero,zero,ymm8[19,27],zero,zero,zero,zero,zero,ymm8[20,28]
1853; AVX2-NEXT:    vpermq {{.*#+}} ymm9 = ymm3[0,2,0,2]
1854; AVX2-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[0,8],zero,zero,zero,zero,zero,ymm9[1,9],zero,zero,zero,zero,zero,ymm9[2,10],zero,zero,zero,zero,zero,ymm9[19,27],zero,zero,zero,zero,zero,ymm9[20,28],zero,zero
1855; AVX2-NEXT:    vpor %ymm8, %ymm9, %ymm8
1856; AVX2-NEXT:    vmovdqa {{.*#+}} ymm9 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
1857; AVX2-NEXT:    vpblendvb %ymm9, %ymm8, %ymm7, %ymm7
1858; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[1,3,1,3]
1859; AVX2-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21]
1860; AVX2-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,5,6]
1861; AVX2-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3]
1862; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1]
1863; AVX2-NEXT:    vmovdqa {{.*#+}} ymm9 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255]
1864; AVX2-NEXT:    vpblendvb %ymm9, %ymm6, %ymm8, %ymm6
1865; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm3[3,1,1,3]
1866; AVX2-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[1],zero,zero,zero,zero,zero,ymm8[10,2],zero,zero,zero,zero,zero,ymm8[11,3],zero,zero,zero,zero,zero,ymm8[20,28],zero,zero,zero,zero,zero,ymm8[21,29],zero,zero,zero
1867; AVX2-NEXT:    vpermq {{.*#+}} ymm9 = ymm4[1,3,3,1]
1868; AVX2-NEXT:    vpshufb {{.*#+}} ymm9 = zero,ymm9[1,9],zero,zero,zero,zero,zero,ymm9[2,10],zero,zero,zero,zero,zero,ymm9[3,19],zero,zero,zero,zero,zero,ymm9[28,20],zero,zero,zero,zero,zero,ymm9[29,21],zero
1869; AVX2-NEXT:    vpor %ymm8, %ymm9, %ymm8
1870; AVX2-NEXT:    vmovdqa {{.*#+}} ymm9 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0]
1871; AVX2-NEXT:    vpblendvb %ymm9, %ymm8, %ymm6, %ymm6
1872; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3]
1873; AVX2-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm4[6,14],zero,zero,zero,zero,zero,xmm4[7,15],zero,zero,zero
1874; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3]
1875; AVX2-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[6,14],zero,zero,zero,zero,zero,xmm3[7,15],zero,zero,zero,zero,zero
1876; AVX2-NEXT:    vpor %xmm4, %xmm3, %xmm3
1877; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
1878; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u]
1879; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,7,7]
1880; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,2]
1881; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0]
1882; AVX2-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1883; AVX2-NEXT:    vmovdqa %ymm6, 64(%rax)
1884; AVX2-NEXT:    vmovdqa %ymm7, (%rax)
1885; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0]
1886; AVX2-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
1887; AVX2-NEXT:    vmovdqa %xmm0, 96(%rax)
1888; AVX2-NEXT:    vmovdqa %ymm5, 32(%rax)
1889; AVX2-NEXT:    vzeroupper
1890; AVX2-NEXT:    retq
1891;
1892; AVX2-FP-LABEL: store_i8_stride7_vf16:
1893; AVX2-FP:       # %bb.0:
1894; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1895; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
1896; AVX2-FP-NEXT:    vmovdqa (%rdi), %xmm2
1897; AVX2-FP-NEXT:    vmovdqa (%rdx), %xmm3
1898; AVX2-FP-NEXT:    vmovdqa (%r8), %xmm0
1899; AVX2-FP-NEXT:    vmovdqa (%r9), %xmm1
1900; AVX2-FP-NEXT:    vinserti128 $1, (%rsi), %ymm2, %ymm2
1901; AVX2-FP-NEXT:    vinserti128 $1, (%rcx), %ymm3, %ymm3
1902; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm6
1903; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm4 = zero,zero,zero,ymm2[5],zero,zero,zero,zero,zero,zero,ymm2[6],zero,zero,zero,zero,zero,zero,zero,ymm2[23],zero,zero,zero,zero,zero,zero,ymm2[24],zero,zero,zero,zero,zero,zero
1904; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm5 = ymm2[2,3,0,1]
1905; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[5],zero,zero,zero,zero,zero,zero,ymm5[6],zero,zero,zero,zero,zero,ymm5[23],zero,zero,zero,zero,zero,zero,ymm5[24],zero,zero,zero,zero,zero,zero,ymm5[25]
1906; AVX2-FP-NEXT:    vpor %ymm4, %ymm5, %ymm4
1907; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,ymm3[5],zero,zero,zero,zero,zero,zero,ymm3[6],zero,zero,zero,zero,zero,zero,zero,ymm3[23],zero,zero,zero,zero,zero,zero,ymm3[24],zero,zero,zero,zero
1908; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1]
1909; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero
1910; AVX2-FP-NEXT:    vpor %ymm7, %ymm5, %ymm5
1911; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm7 = [u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255]
1912; AVX2-FP-NEXT:    vpblendvb %ymm7, %ymm4, %ymm5, %ymm5
1913; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero
1914; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
1915; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm7 = zero,ymm7[4],zero,zero,zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero
1916; AVX2-FP-NEXT:    vpor %ymm7, %ymm4, %ymm7
1917; AVX2-FP-NEXT:    vmovdqa (%r10), %xmm4
1918; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm4[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
1919; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,1,0]
1920; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm9 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u]
1921; AVX2-FP-NEXT:    vpblendvb %ymm9, %ymm7, %ymm8, %ymm7
1922; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm8 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255]
1923; AVX2-FP-NEXT:    vpblendvb %ymm8, %ymm5, %ymm7, %ymm5
1924; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm4[2,3,2,3,0,1,0,1,8,9,10,11,2,3,2,3]
1925; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,0,1,0]
1926; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm8 = ymm6[0,2,0,2]
1927; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u]
1928; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm9 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u]
1929; AVX2-FP-NEXT:    vpblendvb %ymm9, %ymm8, %ymm7, %ymm7
1930; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm8 = ymm3[0,2,0,2]
1931; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm8 = zero,zero,ymm8[0,8],zero,zero,zero,zero,zero,ymm8[1,9],zero,zero,zero,zero,zero,ymm8[18,26],zero,zero,zero,zero,zero,ymm8[19,27],zero,zero,zero,zero,zero,ymm8[20,28]
1932; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm9 = ymm2[0,2,0,2]
1933; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[0,8],zero,zero,zero,zero,zero,ymm9[1,9],zero,zero,zero,zero,zero,ymm9[2,10],zero,zero,zero,zero,zero,ymm9[19,27],zero,zero,zero,zero,zero,ymm9[20,28],zero,zero
1934; AVX2-FP-NEXT:    vpor %ymm8, %ymm9, %ymm8
1935; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm9 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
1936; AVX2-FP-NEXT:    vpblendvb %ymm9, %ymm8, %ymm7, %ymm7
1937; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm4[8,9,10,11,8,9,10,11,10,11,12,13,10,11,12,13]
1938; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1]
1939; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[1,3,1,3]
1940; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21]
1941; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm9 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255]
1942; AVX2-FP-NEXT:    vpblendvb %ymm9, %ymm6, %ymm8, %ymm6
1943; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm8 = ymm2[3,1,1,3]
1944; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[1],zero,zero,zero,zero,zero,ymm8[10,2],zero,zero,zero,zero,zero,ymm8[11,3],zero,zero,zero,zero,zero,ymm8[20,28],zero,zero,zero,zero,zero,ymm8[21,29],zero,zero,zero
1945; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm9 = ymm3[1,3,3,1]
1946; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm9 = zero,ymm9[1,9],zero,zero,zero,zero,zero,ymm9[2,10],zero,zero,zero,zero,zero,ymm9[3,19],zero,zero,zero,zero,zero,ymm9[28,20],zero,zero,zero,zero,zero,ymm9[29,21],zero
1947; AVX2-FP-NEXT:    vpor %ymm8, %ymm9, %ymm8
1948; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm9 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0]
1949; AVX2-FP-NEXT:    vpblendvb %ymm9, %ymm8, %ymm6, %ymm6
1950; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3]
1951; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[6,14],zero,zero,zero,zero,zero,xmm3[7,15],zero,zero,zero
1952; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3]
1953; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[6,14],zero,zero,zero,zero,zero,xmm2[7,15],zero,zero,zero,zero,zero
1954; AVX2-FP-NEXT:    vpor %xmm3, %xmm2, %xmm2
1955; AVX2-FP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1956; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[10],zero,zero,zero,zero,zero,xmm0[13,12],zero,zero,zero,zero,zero,xmm0[15,14],zero
1957; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm1 = zero,xmm4[13,14,15,4,5],zero,zero,xmm4[14,15,14,15,12],zero,zero,xmm4[15]
1958; AVX2-FP-NEXT:    vpor %xmm1, %xmm0, %xmm0
1959; AVX2-FP-NEXT:    vmovdqa %ymm6, 64(%rax)
1960; AVX2-FP-NEXT:    vmovdqa %ymm7, (%rax)
1961; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0]
1962; AVX2-FP-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
1963; AVX2-FP-NEXT:    vmovdqa %xmm0, 96(%rax)
1964; AVX2-FP-NEXT:    vmovdqa %ymm5, 32(%rax)
1965; AVX2-FP-NEXT:    vzeroupper
1966; AVX2-FP-NEXT:    retq
1967;
1968; AVX2-FCP-LABEL: store_i8_stride7_vf16:
1969; AVX2-FCP:       # %bb.0:
1970; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1971; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
1972; AVX2-FCP-NEXT:    vmovdqa (%rdi), %xmm3
1973; AVX2-FCP-NEXT:    vmovdqa (%rdx), %xmm4
1974; AVX2-FCP-NEXT:    vmovdqa (%r8), %xmm1
1975; AVX2-FCP-NEXT:    vmovdqa (%r9), %xmm2
1976; AVX2-FCP-NEXT:    vmovdqa (%r10), %xmm0
1977; AVX2-FCP-NEXT:    vinserti128 $1, (%rsi), %ymm3, %ymm3
1978; AVX2-FCP-NEXT:    vinserti128 $1, (%rcx), %ymm4, %ymm4
1979; AVX2-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm6
1980; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm0[1,1,0,0,4,5,6,7]
1981; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [0,1,0,1,2,0,0,1]
1982; AVX2-FCP-NEXT:    vpermd %ymm5, %ymm7, %ymm5
1983; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm6[0,2,0,2]
1984; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u]
1985; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u]
1986; AVX2-FCP-NEXT:    vpblendvb %ymm8, %ymm7, %ymm5, %ymm5
1987; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm4[0,2,0,2]
1988; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[0,8],zero,zero,zero,zero,zero,ymm7[1,9],zero,zero,zero,zero,zero,ymm7[18,26],zero,zero,zero,zero,zero,ymm7[19,27],zero,zero,zero,zero,zero,ymm7[20,28]
1989; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm3[0,2,0,2]
1990; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[0,8],zero,zero,zero,zero,zero,ymm8[1,9],zero,zero,zero,zero,zero,ymm8[2,10],zero,zero,zero,zero,zero,ymm8[19,27],zero,zero,zero,zero,zero,ymm8[20,28],zero,zero
1991; AVX2-FCP-NEXT:    vpor %ymm7, %ymm8, %ymm7
1992; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
1993; AVX2-FCP-NEXT:    vpblendvb %ymm8, %ymm7, %ymm5, %ymm5
1994; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
1995; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,0,1,0]
1996; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [1,5,2,6,1,5,2,6]
1997; AVX2-FCP-NEXT:    # ymm8 = mem[0,1,0,1]
1998; AVX2-FCP-NEXT:    vpermd %ymm6, %ymm8, %ymm9
1999; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[0,4,u,u,u,u,u,1,5,u,u,u,u,u,2,6,u,u,u,u,u,19,23,u,u,u,u,u,24,28,u,u]
2000; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u]
2001; AVX2-FCP-NEXT:    vpblendvb %ymm10, %ymm9, %ymm7, %ymm7
2002; AVX2-FCP-NEXT:    vpermd %ymm4, %ymm8, %ymm9
2003; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,ymm9[1,5],zero,zero,zero,zero,zero,ymm9[2,6],zero,zero,zero,zero,zero,ymm9[19,23],zero,zero,zero,zero,zero,ymm9[24,28],zero,zero,zero,zero
2004; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm8, %ymm8
2005; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = zero,zero,zero,ymm8[1,5],zero,zero,zero,zero,zero,ymm8[2,6],zero,zero,zero,zero,zero,ymm8[19,23],zero,zero,zero,zero,zero,ymm8[24,28],zero,zero,zero,zero,zero,ymm8[25]
2006; AVX2-FCP-NEXT:    vpor %ymm9, %ymm8, %ymm8
2007; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255]
2008; AVX2-FCP-NEXT:    vpblendvb %ymm9, %ymm8, %ymm7, %ymm7
2009; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,5,6]
2010; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [2,2,3,3,2,2,3,3]
2011; AVX2-FCP-NEXT:    # ymm9 = mem[0,1,0,1]
2012; AVX2-FCP-NEXT:    vpermd %ymm8, %ymm9, %ymm8
2013; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[1,3,1,3]
2014; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21]
2015; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255]
2016; AVX2-FCP-NEXT:    vpblendvb %ymm9, %ymm6, %ymm8, %ymm6
2017; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm3[3,1,1,3]
2018; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[1],zero,zero,zero,zero,zero,ymm8[10,2],zero,zero,zero,zero,zero,ymm8[11,3],zero,zero,zero,zero,zero,ymm8[20,28],zero,zero,zero,zero,zero,ymm8[21,29],zero,zero,zero
2019; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm4[1,3,3,1]
2020; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = zero,ymm9[1,9],zero,zero,zero,zero,zero,ymm9[2,10],zero,zero,zero,zero,zero,ymm9[3,19],zero,zero,zero,zero,zero,ymm9[28,20],zero,zero,zero,zero,zero,ymm9[29,21],zero
2021; AVX2-FCP-NEXT:    vpor %ymm8, %ymm9, %ymm8
2022; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0]
2023; AVX2-FCP-NEXT:    vpblendvb %ymm9, %ymm8, %ymm6, %ymm6
2024; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3]
2025; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm4[6,14],zero,zero,zero,zero,zero,xmm4[7,15],zero,zero,zero
2026; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3]
2027; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[6,14],zero,zero,zero,zero,zero,xmm3[7,15],zero,zero,zero,zero,zero
2028; AVX2-FCP-NEXT:    vpor %xmm4, %xmm3, %xmm3
2029; AVX2-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
2030; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[10],zero,zero,zero,zero,zero,xmm1[13,12],zero,zero,zero,zero,zero,xmm1[15,14],zero
2031; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = zero,xmm0[13,14,15,4,5],zero,zero,xmm0[14,15,14,15,12],zero,zero,xmm0[15]
2032; AVX2-FCP-NEXT:    vpor %xmm0, %xmm1, %xmm0
2033; AVX2-FCP-NEXT:    vmovdqa %ymm6, 64(%rax)
2034; AVX2-FCP-NEXT:    vmovdqa %ymm7, 32(%rax)
2035; AVX2-FCP-NEXT:    vmovdqa %ymm5, (%rax)
2036; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0]
2037; AVX2-FCP-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
2038; AVX2-FCP-NEXT:    vmovdqa %xmm0, 96(%rax)
2039; AVX2-FCP-NEXT:    vzeroupper
2040; AVX2-FCP-NEXT:    retq
2041;
2042; AVX512-LABEL: store_i8_stride7_vf16:
2043; AVX512:       # %bb.0:
2044; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2045; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
2046; AVX512-NEXT:    vmovdqa (%rdi), %xmm3
2047; AVX512-NEXT:    vmovdqa (%rdx), %xmm4
2048; AVX512-NEXT:    vmovdqa (%r8), %xmm1
2049; AVX512-NEXT:    vmovdqa (%r9), %xmm2
2050; AVX512-NEXT:    vmovdqa (%r10), %xmm0
2051; AVX512-NEXT:    vinserti128 $1, (%rsi), %ymm3, %ymm3
2052; AVX512-NEXT:    vinserti128 $1, (%rcx), %ymm4, %ymm4
2053; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm5
2054; AVX512-NEXT:    vpshufb {{.*#+}} ymm6 = ymm4[u,u,u,u,u,5],zero,ymm4[u,u,u,u,u,6],zero,ymm4[u,u,u,u,u],zero,ymm4[23,u,u,u,u,u],zero,ymm4[24,u,u,u,u]
2055; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1]
2056; AVX512-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u],zero,ymm7[5,u,u,u,u,u],zero,ymm7[6,u,u,u,u,u,23],zero,ymm7[u,u,u,u,u,24],zero,ymm7[u,u,u,u]
2057; AVX512-NEXT:    vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255]
2058; AVX512-NEXT:    vpternlogq {{.*#+}} ymm7 = ~ymm8 & (ymm7 | ymm6)
2059; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm4[0,2,0,2]
2060; AVX512-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,ymm6[0,8,u,u,u],zero,zero,ymm6[1,9,u,u,u],zero,zero,ymm6[18,26,u,u,u],zero,zero,ymm6[19,27,u,u,u],zero,zero,ymm6[20,28]
2061; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm6
2062; AVX512-NEXT:    vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,5],zero,ymm3[u,u,u,u,u,6],zero,ymm3[u,u,u,u,u],zero,ymm3[23,u,u,u,u,u],zero,ymm3[24,u,u,u,u,u],zero
2063; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm3[2,3,0,1]
2064; AVX512-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[u,u,u],zero,ymm9[5,u,u,u,u,u],zero,ymm9[6,u,u,u,u,u,23],zero,ymm9[u,u,u,u,u,24],zero,ymm9[u,u,u,u,u,25]
2065; AVX512-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm8 & (ymm9 | ymm7)
2066; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm3[0,2,0,2]
2067; AVX512-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[0,8],zero,zero,ymm7[u,u,u,1,9],zero,zero,ymm7[u,u,u,2,10],zero,zero,ymm7[u,u,u,19,27],zero,zero,ymm7[u,u,u,20,28],zero,zero
2068; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm7, %zmm7
2069; AVX512-NEXT:    vporq %zmm6, %zmm7, %zmm6
2070; AVX512-NEXT:    vpshufb {{.*#+}} ymm7 = ymm5[4],zero,ymm5[u,u,u,u,u,5],zero,ymm5[u,u,u,u,u,6],zero,ymm5[u,u,u,u,u],zero,ymm5[23,u,u,u,u,u],zero,ymm5[24,u,u]
2071; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm5[2,3,0,1]
2072; AVX512-NEXT:    vpshufb {{.*#+}} ymm8 = zero,ymm8[4,u,u,u,u,u],zero,ymm8[5,u,u,u,u,u],zero,ymm8[6,u,u,u,u,u,23],zero,ymm8[u,u,u,u,u,24],zero,ymm8[u,u]
2073; AVX512-NEXT:    vmovdqa {{.*#+}} ymm9 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255]
2074; AVX512-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm9 & (ymm8 | ymm7)
2075; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm5[0,2,0,2]
2076; AVX512-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,0,8],zero,ymm7[u,u,u,u,1,9],zero,ymm7[u,u,u,u,18,26],zero,ymm7[u,u,u,u,19,27],zero,ymm7[u,u,u,u]
2077; AVX512-NEXT:    vinserti64x4 $1, %ymm8, %zmm7, %zmm7
2078; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
2079; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,1,0]
2080; AVX512-NEXT:    vpandn %ymm8, %ymm9, %ymm8
2081; AVX512-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm0[1,1,0,0,4,5,6,7]
2082; AVX512-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,0]
2083; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,1,0]
2084; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9
2085; AVX512-NEXT:    vinserti64x4 $1, %ymm8, %zmm9, %zmm8
2086; AVX512-NEXT:    vporq %zmm8, %zmm7, %zmm7
2087; AVX512-NEXT:    vpternlogd {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm6))
2088; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm3[3,1,1,3]
2089; AVX512-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[1],zero,zero,ymm6[u,u,u,10,2],zero,zero,ymm6[u,u,u,11,3],zero,zero,ymm6[u,u,u,20,28],zero,zero,ymm6[u,u,u,21,29],zero,zero,ymm6[u]
2090; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm4[1,3,3,1]
2091; AVX512-NEXT:    vpshufb {{.*#+}} ymm8 = zero,ymm8[1,9,u,u,u],zero,zero,ymm8[2,10,u,u,u],zero,zero,ymm8[3,19,u,u,u],zero,zero,ymm8[28,20,u,u,u],zero,zero,ymm8[29,21,u]
2092; AVX512-NEXT:    vpor %ymm6, %ymm8, %ymm6
2093; AVX512-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,5,6]
2094; AVX512-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3]
2095; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1]
2096; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[1,3,1,3]
2097; AVX512-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,1,9],zero,ymm5[u,u,u,u,2,10],zero,ymm5[u,u,u,u,19,27],zero,ymm5[u,u,u,u,20,28],zero,ymm5[u,u,u,u,21]
2098; AVX512-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm8 & ~mem)
2099; AVX512-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm6))
2100; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3]
2101; AVX512-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[6,14,u,u,u],zero,zero,xmm4[7,15,u,u,u]
2102; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3]
2103; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,6,14],zero,zero,xmm3[u,u,u,7,15],zero,zero,xmm3[u,u,u]
2104; AVX512-NEXT:    vpor %xmm4, %xmm3, %xmm3
2105; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
2106; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero
2107; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15]
2108; AVX512-NEXT:    vpor %xmm0, %xmm1, %xmm0
2109; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm3))
2110; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm5, %zmm1
2111; AVX512-NEXT:    vmovdqa %xmm0, 96(%rax)
2112; AVX512-NEXT:    vmovdqa %ymm1, 64(%rax)
2113; AVX512-NEXT:    vmovdqa64 %zmm7, (%rax)
2114; AVX512-NEXT:    vzeroupper
2115; AVX512-NEXT:    retq
2116;
2117; AVX512-FCP-LABEL: store_i8_stride7_vf16:
2118; AVX512-FCP:       # %bb.0:
2119; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2120; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
2121; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm3
2122; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm4
2123; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm1
2124; AVX512-FCP-NEXT:    vmovdqa (%r9), %xmm2
2125; AVX512-FCP-NEXT:    vmovdqa (%r10), %xmm0
2126; AVX512-FCP-NEXT:    vinserti128 $1, (%rsi), %ymm3, %ymm3
2127; AVX512-FCP-NEXT:    vinserti128 $1, (%rcx), %ymm4, %ymm4
2128; AVX512-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm5
2129; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm4[0,2,0,2]
2130; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,ymm6[0,8,u,u,u],zero,zero,ymm6[1,9,u,u,u],zero,zero,ymm6[18,26,u,u,u],zero,zero,ymm6[19,27,u,u,u],zero,zero,ymm6[20,28]
2131; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [1,5,2,6,1,5,2,6]
2132; AVX512-FCP-NEXT:    # ymm7 = mem[0,1,0,1]
2133; AVX512-FCP-NEXT:    vpermd %ymm4, %ymm7, %ymm8
2134; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[u,u,u],zero,zero,ymm8[1,5,u,u,u],zero,zero,ymm8[2,6,u,u,u],zero,zero,ymm8[19,23,u,u,u],zero,zero,ymm8[24,28,u,u,u],zero
2135; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm6, %zmm6
2136; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm3[0,2,0,2]
2137; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[0,8],zero,zero,ymm8[u,u,u,1,9],zero,zero,ymm8[u,u,u,2,10],zero,zero,ymm8[u,u,u,19,27],zero,zero,ymm8[u,u,u,20,28],zero,zero
2138; AVX512-FCP-NEXT:    vpermd %ymm3, %ymm7, %ymm9
2139; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,1,5],zero,zero,ymm9[u,u,u,2,6],zero,zero,ymm9[u,u,u,19,23],zero,zero,ymm9[u,u,u,24,28],zero,zero,ymm9[u,u,u,25]
2140; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm8, %zmm8
2141; AVX512-FCP-NEXT:    vporq %zmm6, %zmm8, %zmm6
2142; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm0[1,1,0,0,4,5,6,7]
2143; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [0,1,0,1,0,0,0,0]
2144; AVX512-FCP-NEXT:    vpermd %ymm8, %ymm9, %ymm8
2145; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
2146; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,1,0]
2147; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm8, %zmm8
2148; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm5[0,2,0,2]
2149; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,0,8],zero,ymm9[u,u,u,u,1,9],zero,ymm9[u,u,u,u,18,26],zero,ymm9[u,u,u,u,19,27],zero,ymm9[u,u,u,u]
2150; AVX512-FCP-NEXT:    vpermd %ymm5, %ymm7, %ymm7
2151; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[0,4],zero,ymm7[u,u,u,u,1,5],zero,ymm7[u,u,u,u,2,6],zero,ymm7[u,u,u,u,19,23],zero,ymm7[u,u,u,u,24,28],zero,ymm7[u]
2152; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm9, %zmm7
2153; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 | (zmm8 & mem)
2154; AVX512-FCP-NEXT:    vpternlogd {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm6))
2155; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm3[3,1,1,3]
2156; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[1],zero,zero,ymm6[u,u,u,10,2],zero,zero,ymm6[u,u,u,11,3],zero,zero,ymm6[u,u,u,20,28],zero,zero,ymm6[u,u,u,21,29],zero,zero,ymm6[u]
2157; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm4[1,3,3,1]
2158; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = zero,ymm8[1,9,u,u,u],zero,zero,ymm8[2,10,u,u,u],zero,zero,ymm8[3,19,u,u,u],zero,zero,ymm8[28,20,u,u,u],zero,zero,ymm8[29,21,u]
2159; AVX512-FCP-NEXT:    vpor %ymm6, %ymm8, %ymm6
2160; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,5,6]
2161; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [2,2,3,3,2,2,3,3]
2162; AVX512-FCP-NEXT:    # ymm9 = mem[0,1,0,1]
2163; AVX512-FCP-NEXT:    vpermd %ymm8, %ymm9, %ymm8
2164; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[1,3,1,3]
2165; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,1,9],zero,ymm5[u,u,u,u,2,10],zero,ymm5[u,u,u,u,19,27],zero,ymm5[u,u,u,u,20,28],zero,ymm5[u,u,u,u,21]
2166; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm8 & ~mem)
2167; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm6))
2168; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3]
2169; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[6,14,u,u,u],zero,zero,xmm4[7,15,u,u,u]
2170; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3]
2171; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,6,14],zero,zero,xmm3[u,u,u,7,15],zero,zero,xmm3[u,u,u]
2172; AVX512-FCP-NEXT:    vpor %xmm4, %xmm3, %xmm3
2173; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
2174; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero
2175; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15]
2176; AVX512-FCP-NEXT:    vpor %xmm0, %xmm1, %xmm0
2177; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm3))
2178; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm5, %zmm1
2179; AVX512-FCP-NEXT:    vmovdqa %xmm0, 96(%rax)
2180; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, (%rax)
2181; AVX512-FCP-NEXT:    vmovdqa %ymm1, 64(%rax)
2182; AVX512-FCP-NEXT:    vzeroupper
2183; AVX512-FCP-NEXT:    retq
2184;
2185; AVX512DQ-LABEL: store_i8_stride7_vf16:
2186; AVX512DQ:       # %bb.0:
2187; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2188; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %r10
2189; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm3
2190; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm4
2191; AVX512DQ-NEXT:    vmovdqa (%r8), %xmm1
2192; AVX512DQ-NEXT:    vmovdqa (%r9), %xmm2
2193; AVX512DQ-NEXT:    vmovdqa (%r10), %xmm0
2194; AVX512DQ-NEXT:    vinserti128 $1, (%rsi), %ymm3, %ymm3
2195; AVX512DQ-NEXT:    vinserti128 $1, (%rcx), %ymm4, %ymm4
2196; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm5
2197; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm6 = ymm4[u,u,u,u,u,5],zero,ymm4[u,u,u,u,u,6],zero,ymm4[u,u,u,u,u],zero,ymm4[23,u,u,u,u,u],zero,ymm4[24,u,u,u,u]
2198; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1]
2199; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u],zero,ymm7[5,u,u,u,u,u],zero,ymm7[6,u,u,u,u,u,23],zero,ymm7[u,u,u,u,u,24],zero,ymm7[u,u,u,u]
2200; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255]
2201; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm7 = ~ymm8 & (ymm7 | ymm6)
2202; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm6 = ymm4[0,2,0,2]
2203; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,ymm6[0,8,u,u,u],zero,zero,ymm6[1,9,u,u,u],zero,zero,ymm6[18,26,u,u,u],zero,zero,ymm6[19,27,u,u,u],zero,zero,ymm6[20,28]
2204; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm6
2205; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,5],zero,ymm3[u,u,u,u,u,6],zero,ymm3[u,u,u,u,u],zero,ymm3[23,u,u,u,u,u],zero,ymm3[24,u,u,u,u,u],zero
2206; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm3[2,3,0,1]
2207; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[u,u,u],zero,ymm9[5,u,u,u,u,u],zero,ymm9[6,u,u,u,u,u,23],zero,ymm9[u,u,u,u,u,24],zero,ymm9[u,u,u,u,u,25]
2208; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm8 & (ymm9 | ymm7)
2209; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm3[0,2,0,2]
2210; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[0,8],zero,zero,ymm7[u,u,u,1,9],zero,zero,ymm7[u,u,u,2,10],zero,zero,ymm7[u,u,u,19,27],zero,zero,ymm7[u,u,u,20,28],zero,zero
2211; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm7, %zmm7
2212; AVX512DQ-NEXT:    vporq %zmm6, %zmm7, %zmm6
2213; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm7 = ymm5[4],zero,ymm5[u,u,u,u,u,5],zero,ymm5[u,u,u,u,u,6],zero,ymm5[u,u,u,u,u],zero,ymm5[23,u,u,u,u,u],zero,ymm5[24,u,u]
2214; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm5[2,3,0,1]
2215; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm8 = zero,ymm8[4,u,u,u,u,u],zero,ymm8[5,u,u,u,u,u],zero,ymm8[6,u,u,u,u,u,23],zero,ymm8[u,u,u,u,u,24],zero,ymm8[u,u]
2216; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm9 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255]
2217; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm9 & (ymm8 | ymm7)
2218; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm5[0,2,0,2]
2219; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,0,8],zero,ymm7[u,u,u,u,1,9],zero,ymm7[u,u,u,u,18,26],zero,ymm7[u,u,u,u,19,27],zero,ymm7[u,u,u,u]
2220; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm8, %zmm7, %zmm7
2221; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm8 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
2222; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,1,0]
2223; AVX512DQ-NEXT:    vpandn %ymm8, %ymm9, %ymm8
2224; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm0[1,1,0,0,4,5,6,7]
2225; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,0]
2226; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,1,0]
2227; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9
2228; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm8, %zmm9, %zmm8
2229; AVX512DQ-NEXT:    vporq %zmm8, %zmm7, %zmm7
2230; AVX512DQ-NEXT:    vpternlogd {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm6))
2231; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm6 = ymm3[3,1,1,3]
2232; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[1],zero,zero,ymm6[u,u,u,10,2],zero,zero,ymm6[u,u,u,11,3],zero,zero,ymm6[u,u,u,20,28],zero,zero,ymm6[u,u,u,21,29],zero,zero,ymm6[u]
2233; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm4[1,3,3,1]
2234; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm8 = zero,ymm8[1,9,u,u,u],zero,zero,ymm8[2,10,u,u,u],zero,zero,ymm8[3,19,u,u,u],zero,zero,ymm8[28,20,u,u,u],zero,zero,ymm8[29,21,u]
2235; AVX512DQ-NEXT:    vpor %ymm6, %ymm8, %ymm6
2236; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,5,6]
2237; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3]
2238; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1]
2239; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[1,3,1,3]
2240; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,1,9],zero,ymm5[u,u,u,u,2,10],zero,ymm5[u,u,u,u,19,27],zero,ymm5[u,u,u,u,20,28],zero,ymm5[u,u,u,u,21]
2241; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm8 & ~mem)
2242; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm6))
2243; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3]
2244; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[6,14,u,u,u],zero,zero,xmm4[7,15,u,u,u]
2245; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3]
2246; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,6,14],zero,zero,xmm3[u,u,u,7,15],zero,zero,xmm3[u,u,u]
2247; AVX512DQ-NEXT:    vpor %xmm4, %xmm3, %xmm3
2248; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
2249; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero
2250; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15]
2251; AVX512DQ-NEXT:    vpor %xmm0, %xmm1, %xmm0
2252; AVX512DQ-NEXT:    vpternlogq {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm3))
2253; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm5, %zmm1
2254; AVX512DQ-NEXT:    vmovdqa %xmm0, 96(%rax)
2255; AVX512DQ-NEXT:    vmovdqa %ymm1, 64(%rax)
2256; AVX512DQ-NEXT:    vmovdqa64 %zmm7, (%rax)
2257; AVX512DQ-NEXT:    vzeroupper
2258; AVX512DQ-NEXT:    retq
2259;
2260; AVX512DQ-FCP-LABEL: store_i8_stride7_vf16:
2261; AVX512DQ-FCP:       # %bb.0:
2262; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2263; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
2264; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm3
2265; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm4
2266; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm1
2267; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %xmm2
2268; AVX512DQ-FCP-NEXT:    vmovdqa (%r10), %xmm0
2269; AVX512DQ-FCP-NEXT:    vinserti128 $1, (%rsi), %ymm3, %ymm3
2270; AVX512DQ-FCP-NEXT:    vinserti128 $1, (%rcx), %ymm4, %ymm4
2271; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm5
2272; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm4[0,2,0,2]
2273; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,ymm6[0,8,u,u,u],zero,zero,ymm6[1,9,u,u,u],zero,zero,ymm6[18,26,u,u,u],zero,zero,ymm6[19,27,u,u,u],zero,zero,ymm6[20,28]
2274; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [1,5,2,6,1,5,2,6]
2275; AVX512DQ-FCP-NEXT:    # ymm7 = mem[0,1,0,1]
2276; AVX512DQ-FCP-NEXT:    vpermd %ymm4, %ymm7, %ymm8
2277; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[u,u,u],zero,zero,ymm8[1,5,u,u,u],zero,zero,ymm8[2,6,u,u,u],zero,zero,ymm8[19,23,u,u,u],zero,zero,ymm8[24,28,u,u,u],zero
2278; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm6, %zmm6
2279; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm3[0,2,0,2]
2280; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[0,8],zero,zero,ymm8[u,u,u,1,9],zero,zero,ymm8[u,u,u,2,10],zero,zero,ymm8[u,u,u,19,27],zero,zero,ymm8[u,u,u,20,28],zero,zero
2281; AVX512DQ-FCP-NEXT:    vpermd %ymm3, %ymm7, %ymm9
2282; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,1,5],zero,zero,ymm9[u,u,u,2,6],zero,zero,ymm9[u,u,u,19,23],zero,zero,ymm9[u,u,u,24,28],zero,zero,ymm9[u,u,u,25]
2283; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm8, %zmm8
2284; AVX512DQ-FCP-NEXT:    vporq %zmm6, %zmm8, %zmm6
2285; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm0[1,1,0,0,4,5,6,7]
2286; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [0,1,0,1,0,0,0,0]
2287; AVX512DQ-FCP-NEXT:    vpermd %ymm8, %ymm9, %ymm8
2288; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
2289; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,1,0]
2290; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm8, %zmm8
2291; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm5[0,2,0,2]
2292; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,0,8],zero,ymm9[u,u,u,u,1,9],zero,ymm9[u,u,u,u,18,26],zero,ymm9[u,u,u,u,19,27],zero,ymm9[u,u,u,u]
2293; AVX512DQ-FCP-NEXT:    vpermd %ymm5, %ymm7, %ymm7
2294; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[0,4],zero,ymm7[u,u,u,u,1,5],zero,ymm7[u,u,u,u,2,6],zero,ymm7[u,u,u,u,19,23],zero,ymm7[u,u,u,u,24,28],zero,ymm7[u]
2295; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm9, %zmm7
2296; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 | (zmm8 & mem)
2297; AVX512DQ-FCP-NEXT:    vpternlogd {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm6))
2298; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm3[3,1,1,3]
2299; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[1],zero,zero,ymm6[u,u,u,10,2],zero,zero,ymm6[u,u,u,11,3],zero,zero,ymm6[u,u,u,20,28],zero,zero,ymm6[u,u,u,21,29],zero,zero,ymm6[u]
2300; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm4[1,3,3,1]
2301; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = zero,ymm8[1,9,u,u,u],zero,zero,ymm8[2,10,u,u,u],zero,zero,ymm8[3,19,u,u,u],zero,zero,ymm8[28,20,u,u,u],zero,zero,ymm8[29,21,u]
2302; AVX512DQ-FCP-NEXT:    vpor %ymm6, %ymm8, %ymm6
2303; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,5,6]
2304; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [2,2,3,3,2,2,3,3]
2305; AVX512DQ-FCP-NEXT:    # ymm9 = mem[0,1,0,1]
2306; AVX512DQ-FCP-NEXT:    vpermd %ymm8, %ymm9, %ymm8
2307; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[1,3,1,3]
2308; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,1,9],zero,ymm5[u,u,u,u,2,10],zero,ymm5[u,u,u,u,19,27],zero,ymm5[u,u,u,u,20,28],zero,ymm5[u,u,u,u,21]
2309; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm8 & ~mem)
2310; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm6))
2311; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3]
2312; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[6,14,u,u,u],zero,zero,xmm4[7,15,u,u,u]
2313; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3]
2314; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,6,14],zero,zero,xmm3[u,u,u,7,15],zero,zero,xmm3[u,u,u]
2315; AVX512DQ-FCP-NEXT:    vpor %xmm4, %xmm3, %xmm3
2316; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
2317; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero
2318; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15]
2319; AVX512DQ-FCP-NEXT:    vpor %xmm0, %xmm1, %xmm0
2320; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm3))
2321; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm5, %zmm1
2322; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, 96(%rax)
2323; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, (%rax)
2324; AVX512DQ-FCP-NEXT:    vmovdqa %ymm1, 64(%rax)
2325; AVX512DQ-FCP-NEXT:    vzeroupper
2326; AVX512DQ-FCP-NEXT:    retq
2327;
2328; AVX512BW-LABEL: store_i8_stride7_vf16:
2329; AVX512BW:       # %bb.0:
2330; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2331; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
2332; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm1
2333; AVX512BW-NEXT:    vmovdqa (%rdx), %xmm0
2334; AVX512BW-NEXT:    vmovdqa (%r8), %xmm2
2335; AVX512BW-NEXT:    vinserti128 $1, (%rcx), %ymm0, %ymm0
2336; AVX512BW-NEXT:    vinserti128 $1, (%rsi), %ymm1, %ymm1
2337; AVX512BW-NEXT:    vinserti128 $1, (%r9), %ymm2, %ymm2
2338; AVX512BW-NEXT:    vinserti32x4 $2, (%r10), %zmm2, %zmm2
2339; AVX512BW-NEXT:    vextracti128 $1, %ymm2, %xmm3
2340; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
2341; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u]
2342; AVX512BW-NEXT:    vextracti64x4 $1, %zmm2, %ymm4
2343; AVX512BW-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,6,7,7,7]
2344; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[2,1,3,2]
2345; AVX512BW-NEXT:    movw $-32510, %cx # imm = 0x8102
2346; AVX512BW-NEXT:    kmovd %ecx, %k1
2347; AVX512BW-NEXT:    vmovdqu8 %xmm5, %xmm3 {%k1}
2348; AVX512BW-NEXT:    vpermq {{.*#+}} ymm5 = ymm1[1,3,2,3]
2349; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[6,14],zero,zero,zero,zero,zero,xmm5[7,15],zero,zero,zero,zero,zero
2350; AVX512BW-NEXT:    vpermq {{.*#+}} ymm6 = ymm0[1,3,2,3]
2351; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[6,14],zero,zero,zero,zero,zero,xmm6[7,15],zero,zero,zero
2352; AVX512BW-NEXT:    vpor %xmm5, %xmm6, %xmm5
2353; AVX512BW-NEXT:    movw $-7741, %cx # imm = 0xE1C3
2354; AVX512BW-NEXT:    kmovd %ecx, %k1
2355; AVX512BW-NEXT:    vmovdqu8 %xmm3, %xmm5 {%k1}
2356; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6]
2357; AVX512BW-NEXT:    # ymm3 = mem[0,1,0,1]
2358; AVX512BW-NEXT:    vpermw %ymm4, %ymm3, %ymm3
2359; AVX512BW-NEXT:    vpermq {{.*#+}} ymm6 = ymm2[1,3,1,3]
2360; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21]
2361; AVX512BW-NEXT:    movl $67637280, %ecx # imm = 0x4081020
2362; AVX512BW-NEXT:    kmovd %ecx, %k1
2363; AVX512BW-NEXT:    vmovdqu8 %ymm3, %ymm6 {%k1}
2364; AVX512BW-NEXT:    vpermq {{.*#+}} ymm3 = ymm0[1,3,3,1]
2365; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm3 = zero,ymm3[1,9],zero,zero,zero,zero,zero,ymm3[2,10],zero,zero,zero,zero,zero,ymm3[3,19],zero,zero,zero,zero,zero,ymm3[28,20],zero,zero,zero,zero,zero,ymm3[29,21],zero
2366; AVX512BW-NEXT:    vpermq {{.*#+}} ymm7 = ymm1[3,1,1,3]
2367; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[1],zero,zero,zero,zero,zero,ymm7[10,2],zero,zero,zero,zero,zero,ymm7[11,3],zero,zero,zero,zero,zero,ymm7[20,28],zero,zero,zero,zero,zero,ymm7[21,29],zero,zero,zero
2368; AVX512BW-NEXT:    vpor %ymm3, %ymm7, %ymm3
2369; AVX512BW-NEXT:    movl $-2029118408, %ecx # imm = 0x870E1C38
2370; AVX512BW-NEXT:    kmovd %ecx, %k1
2371; AVX512BW-NEXT:    vmovdqu8 %ymm6, %ymm3 {%k1}
2372; AVX512BW-NEXT:    vinserti32x4 $2, %xmm5, %zmm3, %zmm3
2373; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm6 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,50,50,50,52,50,50,50,52,51,51,51,51,50,50,50,52]
2374; AVX512BW-NEXT:    vpermi2w %zmm2, %zmm4, %zmm6
2375; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm4 = ymm2[4],zero,zero,zero,zero,zero,zero,ymm2[5],zero,zero,zero,zero,zero,zero,ymm2[6],zero,zero,zero,zero,zero,zero,zero,ymm2[23],zero,zero,zero,zero,zero,zero,ymm2[24],zero,zero
2376; AVX512BW-NEXT:    vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1]
2377; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm7 = zero,ymm7[4],zero,zero,zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero
2378; AVX512BW-NEXT:    vpor %ymm4, %ymm7, %ymm4
2379; AVX512BW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2]
2380; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u]
2381; AVX512BW-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
2382; AVX512BW-NEXT:    movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040
2383; AVX512BW-NEXT:    kmovq %rcx, %k1
2384; AVX512BW-NEXT:    vmovdqu8 %zmm6, %zmm2 {%k1}
2385; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,ymm0[6],zero,zero,zero,zero,zero,zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,ymm0[24],zero,zero,zero,zero
2386; AVX512BW-NEXT:    vpermq {{.*#+}} ymm6 = ymm0[2,3,0,1]
2387; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero,zero,zero,zero
2388; AVX512BW-NEXT:    vpor %ymm4, %ymm6, %ymm4
2389; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm1[5],zero,zero,zero,zero,zero,zero,ymm1[6],zero,zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,ymm1[24],zero,zero,zero,zero,zero,zero
2390; AVX512BW-NEXT:    vpermq {{.*#+}} ymm7 = ymm1[2,3,0,1]
2391; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero,zero,ymm7[25]
2392; AVX512BW-NEXT:    vpor %ymm7, %ymm6, %ymm6
2393; AVX512BW-NEXT:    movl $202911840, %ecx # imm = 0xC183060
2394; AVX512BW-NEXT:    kmovd %ecx, %k1
2395; AVX512BW-NEXT:    vmovdqu8 %ymm4, %ymm6 {%k1}
2396; AVX512BW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,0,2]
2397; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,8],zero,zero,zero,zero,zero,ymm1[1,9],zero,zero,zero,zero,zero,ymm1[2,10],zero,zero,zero,zero,zero,ymm1[19,27],zero,zero,zero,zero,zero,ymm1[20,28],zero,zero
2398; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2]
2399; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0,8],zero,zero,zero,zero,zero,ymm0[1,9],zero,zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,zero,ymm0[20,28]
2400; AVX512BW-NEXT:    vpor %ymm1, %ymm0, %ymm0
2401; AVX512BW-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm0
2402; AVX512BW-NEXT:    movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870
2403; AVX512BW-NEXT:    kmovq %rcx, %k1
2404; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
2405; AVX512BW-NEXT:    vmovdqa %xmm5, 96(%rax)
2406; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rax)
2407; AVX512BW-NEXT:    vmovdqa %ymm3, 64(%rax)
2408; AVX512BW-NEXT:    vzeroupper
2409; AVX512BW-NEXT:    retq
2410;
2411; AVX512BW-FCP-LABEL: store_i8_stride7_vf16:
2412; AVX512BW-FCP:       # %bb.0:
2413; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2414; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
2415; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %xmm1
2416; AVX512BW-FCP-NEXT:    vmovdqa (%rdx), %xmm0
2417; AVX512BW-FCP-NEXT:    vmovdqa (%r8), %xmm2
2418; AVX512BW-FCP-NEXT:    vinserti128 $1, (%rcx), %ymm0, %ymm0
2419; AVX512BW-FCP-NEXT:    vinserti128 $1, (%rsi), %ymm1, %ymm1
2420; AVX512BW-FCP-NEXT:    vinserti128 $1, (%r9), %ymm2, %ymm2
2421; AVX512BW-FCP-NEXT:    vinserti32x4 $2, (%r10), %zmm2, %zmm2
2422; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm1[1,3,2,3]
2423; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[6,14],zero,zero,zero,zero,zero,xmm3[7,15],zero,zero,zero,zero,zero
2424; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm0[1,3,2,3]
2425; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm4[6,14],zero,zero,zero,zero,zero,xmm4[7,15],zero,zero,zero
2426; AVX512BW-FCP-NEXT:    vpor %xmm3, %xmm4, %xmm3
2427; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm4
2428; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
2429; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[10],zero,zero,zero,zero,zero,xmm4[13,12],zero,zero,zero,zero,zero,xmm4[15,14],zero
2430; AVX512BW-FCP-NEXT:    vextracti64x4 $1, %zmm2, %ymm5
2431; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,xmm5[13],zero,zero,zero,zero,zero,zero,xmm5[14],zero,zero,zero,zero,zero,zero,xmm5[15]
2432; AVX512BW-FCP-NEXT:    vpor %xmm4, %xmm6, %xmm4
2433; AVX512BW-FCP-NEXT:    movw $-7741, %cx # imm = 0xE1C3
2434; AVX512BW-FCP-NEXT:    kmovd %ecx, %k1
2435; AVX512BW-FCP-NEXT:    vmovdqu8 %xmm4, %xmm3 {%k1}
2436; AVX512BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6]
2437; AVX512BW-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
2438; AVX512BW-FCP-NEXT:    vpermw %ymm5, %ymm4, %ymm4
2439; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm2[1,3,1,3]
2440; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21]
2441; AVX512BW-FCP-NEXT:    movl $67637280, %ecx # imm = 0x4081020
2442; AVX512BW-FCP-NEXT:    kmovd %ecx, %k1
2443; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm4, %ymm6 {%k1}
2444; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm0[1,3,3,1]
2445; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = zero,ymm4[1,9],zero,zero,zero,zero,zero,ymm4[2,10],zero,zero,zero,zero,zero,ymm4[3,19],zero,zero,zero,zero,zero,ymm4[28,20],zero,zero,zero,zero,zero,ymm4[29,21],zero
2446; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm1[3,1,1,3]
2447; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[1],zero,zero,zero,zero,zero,ymm7[10,2],zero,zero,zero,zero,zero,ymm7[11,3],zero,zero,zero,zero,zero,ymm7[20,28],zero,zero,zero,zero,zero,ymm7[21,29],zero,zero,zero
2448; AVX512BW-FCP-NEXT:    vpor %ymm4, %ymm7, %ymm4
2449; AVX512BW-FCP-NEXT:    movl $-2029118408, %ecx # imm = 0x870E1C38
2450; AVX512BW-FCP-NEXT:    kmovd %ecx, %k1
2451; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm6, %ymm4 {%k1}
2452; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm3, %zmm4, %zmm4
2453; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm6 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,50,50,50,52,50,50,50,52,51,51,51,51,50,50,50,52]
2454; AVX512BW-FCP-NEXT:    vpermi2w %zmm2, %zmm5, %zmm6
2455; AVX512BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [1,5,2,6,1,5,2,6]
2456; AVX512BW-FCP-NEXT:    # ymm5 = mem[0,1,0,1]
2457; AVX512BW-FCP-NEXT:    vpermd %ymm2, %ymm5, %ymm7
2458; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2]
2459; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm2, %zmm2
2460; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u,32,36,u,u,u,u,u,33,37,u,u,u,u,u,34,38,u,u,u,u,u,51,55,u,u,u,u,u,56,60,u,u]
2461; AVX512BW-FCP-NEXT:    movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040
2462; AVX512BW-FCP-NEXT:    kmovq %rcx, %k1
2463; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm6, %zmm2 {%k1}
2464; AVX512BW-FCP-NEXT:    vpermd %ymm1, %ymm5, %ymm6
2465; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,0,2]
2466; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm1, %zmm1
2467; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm1 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zero,zero,zmm1[33,37],zero,zero,zero,zero,zero,zmm1[34,38],zero,zero,zero,zero,zero,zmm1[51,55],zero,zero,zero,zero,zero,zmm1[56,60],zero,zero,zero,zero,zero,zmm1[57]
2468; AVX512BW-FCP-NEXT:    vpermd %ymm0, %ymm5, %ymm5
2469; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2]
2470; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm0
2471; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm0 = zero,zero,zmm0[0,8],zero,zero,zero,zero,zero,zmm0[1,9],zero,zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zmm0[20,28],zero,zero,zero,zero,zero,zmm0[33,37],zero,zero,zero,zero,zero,zmm0[34,38],zero,zero,zero,zero,zero,zmm0[51,55],zero,zero,zero,zero,zero,zmm0[56,60],zero,zero,zero,zero
2472; AVX512BW-FCP-NEXT:    vporq %zmm1, %zmm0, %zmm0
2473; AVX512BW-FCP-NEXT:    movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870
2474; AVX512BW-FCP-NEXT:    kmovq %rcx, %k1
2475; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
2476; AVX512BW-FCP-NEXT:    vmovdqa %xmm3, 96(%rax)
2477; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
2478; AVX512BW-FCP-NEXT:    vmovdqa %ymm4, 64(%rax)
2479; AVX512BW-FCP-NEXT:    vzeroupper
2480; AVX512BW-FCP-NEXT:    retq
2481;
2482; AVX512DQ-BW-LABEL: store_i8_stride7_vf16:
2483; AVX512DQ-BW:       # %bb.0:
2484; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2485; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
2486; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %xmm1
2487; AVX512DQ-BW-NEXT:    vmovdqa (%rdx), %xmm0
2488; AVX512DQ-BW-NEXT:    vmovdqa (%r8), %xmm2
2489; AVX512DQ-BW-NEXT:    vinserti128 $1, (%rcx), %ymm0, %ymm0
2490; AVX512DQ-BW-NEXT:    vinserti128 $1, (%rsi), %ymm1, %ymm1
2491; AVX512DQ-BW-NEXT:    vinserti128 $1, (%r9), %ymm2, %ymm2
2492; AVX512DQ-BW-NEXT:    vinserti32x4 $2, (%r10), %zmm2, %zmm2
2493; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm2, %xmm3
2494; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
2495; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u]
2496; AVX512DQ-BW-NEXT:    vextracti64x4 $1, %zmm2, %ymm4
2497; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,6,7,7,7]
2498; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[2,1,3,2]
2499; AVX512DQ-BW-NEXT:    movw $-32510, %cx # imm = 0x8102
2500; AVX512DQ-BW-NEXT:    kmovd %ecx, %k1
2501; AVX512DQ-BW-NEXT:    vmovdqu8 %xmm5, %xmm3 {%k1}
2502; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm5 = ymm1[1,3,2,3]
2503; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[6,14],zero,zero,zero,zero,zero,xmm5[7,15],zero,zero,zero,zero,zero
2504; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm6 = ymm0[1,3,2,3]
2505; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[6,14],zero,zero,zero,zero,zero,xmm6[7,15],zero,zero,zero
2506; AVX512DQ-BW-NEXT:    vpor %xmm5, %xmm6, %xmm5
2507; AVX512DQ-BW-NEXT:    movw $-7741, %cx # imm = 0xE1C3
2508; AVX512DQ-BW-NEXT:    kmovd %ecx, %k1
2509; AVX512DQ-BW-NEXT:    vmovdqu8 %xmm3, %xmm5 {%k1}
2510; AVX512DQ-BW-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6]
2511; AVX512DQ-BW-NEXT:    # ymm3 = mem[0,1,0,1]
2512; AVX512DQ-BW-NEXT:    vpermw %ymm4, %ymm3, %ymm3
2513; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm6 = ymm2[1,3,1,3]
2514; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21]
2515; AVX512DQ-BW-NEXT:    movl $67637280, %ecx # imm = 0x4081020
2516; AVX512DQ-BW-NEXT:    kmovd %ecx, %k1
2517; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm3, %ymm6 {%k1}
2518; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm3 = ymm0[1,3,3,1]
2519; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm3 = zero,ymm3[1,9],zero,zero,zero,zero,zero,ymm3[2,10],zero,zero,zero,zero,zero,ymm3[3,19],zero,zero,zero,zero,zero,ymm3[28,20],zero,zero,zero,zero,zero,ymm3[29,21],zero
2520; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm7 = ymm1[3,1,1,3]
2521; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[1],zero,zero,zero,zero,zero,ymm7[10,2],zero,zero,zero,zero,zero,ymm7[11,3],zero,zero,zero,zero,zero,ymm7[20,28],zero,zero,zero,zero,zero,ymm7[21,29],zero,zero,zero
2522; AVX512DQ-BW-NEXT:    vpor %ymm3, %ymm7, %ymm3
2523; AVX512DQ-BW-NEXT:    movl $-2029118408, %ecx # imm = 0x870E1C38
2524; AVX512DQ-BW-NEXT:    kmovd %ecx, %k1
2525; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm6, %ymm3 {%k1}
2526; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm5, %zmm3, %zmm3
2527; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm6 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,50,50,50,52,50,50,50,52,51,51,51,51,50,50,50,52]
2528; AVX512DQ-BW-NEXT:    vpermi2w %zmm2, %zmm4, %zmm6
2529; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm4 = ymm2[4],zero,zero,zero,zero,zero,zero,ymm2[5],zero,zero,zero,zero,zero,zero,ymm2[6],zero,zero,zero,zero,zero,zero,zero,ymm2[23],zero,zero,zero,zero,zero,zero,ymm2[24],zero,zero
2530; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1]
2531; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm7 = zero,ymm7[4],zero,zero,zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero
2532; AVX512DQ-BW-NEXT:    vpor %ymm4, %ymm7, %ymm4
2533; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2]
2534; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u]
2535; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
2536; AVX512DQ-BW-NEXT:    movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040
2537; AVX512DQ-BW-NEXT:    kmovq %rcx, %k1
2538; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm6, %zmm2 {%k1}
2539; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,ymm0[6],zero,zero,zero,zero,zero,zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,ymm0[24],zero,zero,zero,zero
2540; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm6 = ymm0[2,3,0,1]
2541; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero,zero,zero,zero
2542; AVX512DQ-BW-NEXT:    vpor %ymm4, %ymm6, %ymm4
2543; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm1[5],zero,zero,zero,zero,zero,zero,ymm1[6],zero,zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,ymm1[24],zero,zero,zero,zero,zero,zero
2544; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm7 = ymm1[2,3,0,1]
2545; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero,zero,ymm7[25]
2546; AVX512DQ-BW-NEXT:    vpor %ymm7, %ymm6, %ymm6
2547; AVX512DQ-BW-NEXT:    movl $202911840, %ecx # imm = 0xC183060
2548; AVX512DQ-BW-NEXT:    kmovd %ecx, %k1
2549; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm4, %ymm6 {%k1}
2550; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,0,2]
2551; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,8],zero,zero,zero,zero,zero,ymm1[1,9],zero,zero,zero,zero,zero,ymm1[2,10],zero,zero,zero,zero,zero,ymm1[19,27],zero,zero,zero,zero,zero,ymm1[20,28],zero,zero
2552; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2]
2553; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0,8],zero,zero,zero,zero,zero,ymm0[1,9],zero,zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,zero,ymm0[20,28]
2554; AVX512DQ-BW-NEXT:    vpor %ymm1, %ymm0, %ymm0
2555; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm0
2556; AVX512DQ-BW-NEXT:    movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870
2557; AVX512DQ-BW-NEXT:    kmovq %rcx, %k1
2558; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
2559; AVX512DQ-BW-NEXT:    vmovdqa %xmm5, 96(%rax)
2560; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, (%rax)
2561; AVX512DQ-BW-NEXT:    vmovdqa %ymm3, 64(%rax)
2562; AVX512DQ-BW-NEXT:    vzeroupper
2563; AVX512DQ-BW-NEXT:    retq
2564;
2565; AVX512DQ-BW-FCP-LABEL: store_i8_stride7_vf16:
2566; AVX512DQ-BW-FCP:       # %bb.0:
2567; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2568; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
2569; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %xmm1
2570; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdx), %xmm0
2571; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%r8), %xmm2
2572; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, (%rcx), %ymm0, %ymm0
2573; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, (%rsi), %ymm1, %ymm1
2574; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, (%r9), %ymm2, %ymm2
2575; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, (%r10), %zmm2, %zmm2
2576; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm1[1,3,2,3]
2577; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[6,14],zero,zero,zero,zero,zero,xmm3[7,15],zero,zero,zero,zero,zero
2578; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm0[1,3,2,3]
2579; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm4[6,14],zero,zero,zero,zero,zero,xmm4[7,15],zero,zero,zero
2580; AVX512DQ-BW-FCP-NEXT:    vpor %xmm3, %xmm4, %xmm3
2581; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm4
2582; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
2583; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[10],zero,zero,zero,zero,zero,xmm4[13,12],zero,zero,zero,zero,zero,xmm4[15,14],zero
2584; AVX512DQ-BW-FCP-NEXT:    vextracti64x4 $1, %zmm2, %ymm5
2585; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,xmm5[13],zero,zero,zero,zero,zero,zero,xmm5[14],zero,zero,zero,zero,zero,zero,xmm5[15]
2586; AVX512DQ-BW-FCP-NEXT:    vpor %xmm4, %xmm6, %xmm4
2587; AVX512DQ-BW-FCP-NEXT:    movw $-7741, %cx # imm = 0xE1C3
2588; AVX512DQ-BW-FCP-NEXT:    kmovd %ecx, %k1
2589; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %xmm4, %xmm3 {%k1}
2590; AVX512DQ-BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6]
2591; AVX512DQ-BW-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
2592; AVX512DQ-BW-FCP-NEXT:    vpermw %ymm5, %ymm4, %ymm4
2593; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm2[1,3,1,3]
2594; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21]
2595; AVX512DQ-BW-FCP-NEXT:    movl $67637280, %ecx # imm = 0x4081020
2596; AVX512DQ-BW-FCP-NEXT:    kmovd %ecx, %k1
2597; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm4, %ymm6 {%k1}
2598; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm0[1,3,3,1]
2599; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = zero,ymm4[1,9],zero,zero,zero,zero,zero,ymm4[2,10],zero,zero,zero,zero,zero,ymm4[3,19],zero,zero,zero,zero,zero,ymm4[28,20],zero,zero,zero,zero,zero,ymm4[29,21],zero
2600; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm1[3,1,1,3]
2601; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[1],zero,zero,zero,zero,zero,ymm7[10,2],zero,zero,zero,zero,zero,ymm7[11,3],zero,zero,zero,zero,zero,ymm7[20,28],zero,zero,zero,zero,zero,ymm7[21,29],zero,zero,zero
2602; AVX512DQ-BW-FCP-NEXT:    vpor %ymm4, %ymm7, %ymm4
2603; AVX512DQ-BW-FCP-NEXT:    movl $-2029118408, %ecx # imm = 0x870E1C38
2604; AVX512DQ-BW-FCP-NEXT:    kmovd %ecx, %k1
2605; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm6, %ymm4 {%k1}
2606; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm3, %zmm4, %zmm4
2607; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm6 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,50,50,50,52,50,50,50,52,51,51,51,51,50,50,50,52]
2608; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm2, %zmm5, %zmm6
2609; AVX512DQ-BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [1,5,2,6,1,5,2,6]
2610; AVX512DQ-BW-FCP-NEXT:    # ymm5 = mem[0,1,0,1]
2611; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm2, %ymm5, %ymm7
2612; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2]
2613; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm2, %zmm2
2614; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u,32,36,u,u,u,u,u,33,37,u,u,u,u,u,34,38,u,u,u,u,u,51,55,u,u,u,u,u,56,60,u,u]
2615; AVX512DQ-BW-FCP-NEXT:    movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040
2616; AVX512DQ-BW-FCP-NEXT:    kmovq %rcx, %k1
2617; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm6, %zmm2 {%k1}
2618; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm1, %ymm5, %ymm6
2619; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,0,2]
2620; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm1, %zmm1
2621; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm1 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zero,zero,zmm1[33,37],zero,zero,zero,zero,zero,zmm1[34,38],zero,zero,zero,zero,zero,zmm1[51,55],zero,zero,zero,zero,zero,zmm1[56,60],zero,zero,zero,zero,zero,zmm1[57]
2622; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm0, %ymm5, %ymm5
2623; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2]
2624; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm0
2625; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm0 = zero,zero,zmm0[0,8],zero,zero,zero,zero,zero,zmm0[1,9],zero,zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zmm0[20,28],zero,zero,zero,zero,zero,zmm0[33,37],zero,zero,zero,zero,zero,zmm0[34,38],zero,zero,zero,zero,zero,zmm0[51,55],zero,zero,zero,zero,zero,zmm0[56,60],zero,zero,zero,zero
2626; AVX512DQ-BW-FCP-NEXT:    vporq %zmm1, %zmm0, %zmm0
2627; AVX512DQ-BW-FCP-NEXT:    movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870
2628; AVX512DQ-BW-FCP-NEXT:    kmovq %rcx, %k1
2629; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
2630; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm3, 96(%rax)
2631; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
2632; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm4, 64(%rax)
2633; AVX512DQ-BW-FCP-NEXT:    vzeroupper
2634; AVX512DQ-BW-FCP-NEXT:    retq
2635  %in.vec0 = load <16 x i8>, ptr %in.vecptr0, align 64
2636  %in.vec1 = load <16 x i8>, ptr %in.vecptr1, align 64
2637  %in.vec2 = load <16 x i8>, ptr %in.vecptr2, align 64
2638  %in.vec3 = load <16 x i8>, ptr %in.vecptr3, align 64
2639  %in.vec4 = load <16 x i8>, ptr %in.vecptr4, align 64
2640  %in.vec5 = load <16 x i8>, ptr %in.vecptr5, align 64
2641  %in.vec6 = load <16 x i8>, ptr %in.vecptr6, align 64
2642  %1 = shufflevector <16 x i8> %in.vec0, <16 x i8> %in.vec1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
2643  %2 = shufflevector <16 x i8> %in.vec2, <16 x i8> %in.vec3, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
2644  %3 = shufflevector <16 x i8> %in.vec4, <16 x i8> %in.vec5, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
2645  %4 = shufflevector <32 x i8> %1, <32 x i8> %2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
2646  %5 = shufflevector <16 x i8> %in.vec6, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2647  %6 = shufflevector <32 x i8> %3, <32 x i8> %5, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
2648  %7 = shufflevector <48 x i8> %6, <48 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2649  %8 = shufflevector <64 x i8> %4, <64 x i8> %7, <112 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111>
2650  %interleaved.vec = shufflevector <112 x i8> %8, <112 x i8> poison, <112 x i32> <i32 0, i32 16, i32 32, i32 48, i32 64, i32 80, i32 96, i32 1, i32 17, i32 33, i32 49, i32 65, i32 81, i32 97, i32 2, i32 18, i32 34, i32 50, i32 66, i32 82, i32 98, i32 3, i32 19, i32 35, i32 51, i32 67, i32 83, i32 99, i32 4, i32 20, i32 36, i32 52, i32 68, i32 84, i32 100, i32 5, i32 21, i32 37, i32 53, i32 69, i32 85, i32 101, i32 6, i32 22, i32 38, i32 54, i32 70, i32 86, i32 102, i32 7, i32 23, i32 39, i32 55, i32 71, i32 87, i32 103, i32 8, i32 24, i32 40, i32 56, i32 72, i32 88, i32 104, i32 9, i32 25, i32 41, i32 57, i32 73, i32 89, i32 105, i32 10, i32 26, i32 42, i32 58, i32 74, i32 90, i32 106, i32 11, i32 27, i32 43, i32 59, i32 75, i32 91, i32 107, i32 12, i32 28, i32 44, i32 60, i32 76, i32 92, i32 108, i32 13, i32 29, i32 45, i32 61, i32 77, i32 93, i32 109, i32 14, i32 30, i32 46, i32 62, i32 78, i32 94, i32 110, i32 15, i32 31, i32 47, i32 63, i32 79, i32 95, i32 111>
2651  store <112 x i8> %interleaved.vec, ptr %out.vec, align 64
2652  ret void
2653}
2654
2655define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind {
2656; SSE-LABEL: store_i8_stride7_vf32:
2657; SSE:       # %bb.0:
2658; SSE-NEXT:    subq $360, %rsp # imm = 0x168
2659; SSE-NEXT:    movdqa 16(%rdi), %xmm15
2660; SSE-NEXT:    movdqa 16(%rsi), %xmm4
2661; SSE-NEXT:    movdqa 16(%rdx), %xmm3
2662; SSE-NEXT:    movdqa 16(%rcx), %xmm7
2663; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2664; SSE-NEXT:    movdqa 16(%r8), %xmm6
2665; SSE-NEXT:    movdqa 16(%r9), %xmm5
2666; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2667; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,6,6,6,6]
2668; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
2669; SSE-NEXT:    movdqa {{.*#+}} xmm10 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255]
2670; SSE-NEXT:    pand %xmm10, %xmm0
2671; SSE-NEXT:    movdqa %xmm4, %xmm8
2672; SSE-NEXT:    movdqa %xmm4, %xmm13
2673; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2674; SSE-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15]
2675; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,4,5,5,7]
2676; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2677; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2678; SSE-NEXT:    movdqa %xmm10, %xmm2
2679; SSE-NEXT:    pandn %xmm1, %xmm2
2680; SSE-NEXT:    por %xmm0, %xmm2
2681; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255]
2682; SSE-NEXT:    movdqa %xmm1, %xmm0
2683; SSE-NEXT:    movdqa %xmm1, %xmm11
2684; SSE-NEXT:    pandn %xmm2, %xmm0
2685; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,6,6,6]
2686; SSE-NEXT:    movdqa %xmm3, %xmm4
2687; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2688; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
2689; SSE-NEXT:    movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255]
2690; SSE-NEXT:    movdqa %xmm9, %xmm3
2691; SSE-NEXT:    pandn %xmm1, %xmm3
2692; SSE-NEXT:    movdqa %xmm7, %xmm2
2693; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15]
2694; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,1,2,3]
2695; SSE-NEXT:    movdqa %xmm2, %xmm7
2696; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2697; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7]
2698; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
2699; SSE-NEXT:    pand %xmm9, %xmm1
2700; SSE-NEXT:    por %xmm3, %xmm1
2701; SSE-NEXT:    pand %xmm11, %xmm1
2702; SSE-NEXT:    por %xmm0, %xmm1
2703; SSE-NEXT:    movdqa {{.*#+}} xmm11 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0]
2704; SSE-NEXT:    pand %xmm11, %xmm1
2705; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,5,6,6,7]
2706; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2]
2707; SSE-NEXT:    movdqa %xmm11, %xmm3
2708; SSE-NEXT:    pandn %xmm0, %xmm3
2709; SSE-NEXT:    por %xmm1, %xmm3
2710; SSE-NEXT:    movdqa {{.*#+}} xmm12 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255]
2711; SSE-NEXT:    movdqa %xmm5, %xmm1
2712; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15]
2713; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
2714; SSE-NEXT:    movdqa %xmm1, %xmm5
2715; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2716; SSE-NEXT:    movdqa %xmm12, %xmm1
2717; SSE-NEXT:    pandn %xmm0, %xmm1
2718; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2719; SSE-NEXT:    pand %xmm12, %xmm3
2720; SSE-NEXT:    por %xmm3, %xmm1
2721; SSE-NEXT:    movdqa 16(%rax), %xmm14
2722; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,5,6,6]
2723; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
2724; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255]
2725; SSE-NEXT:    movdqa %xmm3, %xmm2
2726; SSE-NEXT:    pandn %xmm0, %xmm2
2727; SSE-NEXT:    pand %xmm3, %xmm1
2728; SSE-NEXT:    por %xmm1, %xmm2
2729; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2730; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,6,5,7,7]
2731; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2732; SSE-NEXT:    movdqa %xmm10, %xmm1
2733; SSE-NEXT:    pandn %xmm0, %xmm1
2734; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,7,7,7,7]
2735; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
2736; SSE-NEXT:    pand %xmm10, %xmm0
2737; SSE-NEXT:    por %xmm0, %xmm1
2738; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255]
2739; SSE-NEXT:    movdqa %xmm2, %xmm0
2740; SSE-NEXT:    pandn %xmm1, %xmm0
2741; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,4,6,6,7]
2742; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
2743; SSE-NEXT:    movdqa %xmm3, %xmm4
2744; SSE-NEXT:    pandn %xmm1, %xmm3
2745; SSE-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2746; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm15[0,1,2,3,7,7,7,7]
2747; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
2748; SSE-NEXT:    pand %xmm4, %xmm1
2749; SSE-NEXT:    por %xmm1, %xmm3
2750; SSE-NEXT:    pand %xmm2, %xmm3
2751; SSE-NEXT:    por %xmm0, %xmm3
2752; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,7,7,7,7]
2753; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2754; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
2755; SSE-NEXT:    movdqa %xmm9, %xmm1
2756; SSE-NEXT:    pandn %xmm0, %xmm1
2757; SSE-NEXT:    pand %xmm9, %xmm3
2758; SSE-NEXT:    por %xmm3, %xmm1
2759; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,5,6,6,7]
2760; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,2,2,3]
2761; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255]
2762; SSE-NEXT:    movdqa %xmm0, %xmm4
2763; SSE-NEXT:    pandn %xmm3, %xmm4
2764; SSE-NEXT:    pand %xmm0, %xmm1
2765; SSE-NEXT:    por %xmm1, %xmm4
2766; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2767; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,6,7,7,7]
2768; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,3,2]
2769; SSE-NEXT:    movdqa %xmm11, %xmm2
2770; SSE-NEXT:    pandn %xmm1, %xmm2
2771; SSE-NEXT:    pand %xmm11, %xmm4
2772; SSE-NEXT:    por %xmm4, %xmm2
2773; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2774; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm13[2,1,2,3]
2775; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2776; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,1,0,3]
2777; SSE-NEXT:    movdqa %xmm12, %xmm4
2778; SSE-NEXT:    pandn %xmm1, %xmm4
2779; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm15[3,3,3,3,4,5,6,7]
2780; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
2781; SSE-NEXT:    pand %xmm12, %xmm1
2782; SSE-NEXT:    por %xmm1, %xmm4
2783; SSE-NEXT:    pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2784; SSE-NEXT:    # xmm1 = mem[2,1,2,3]
2785; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2786; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,3,0,3]
2787; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,3,3,3,4,5,6,7]
2788; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
2789; SSE-NEXT:    movdqa {{.*#+}} xmm15 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255]
2790; SSE-NEXT:    movdqa %xmm15, %xmm7
2791; SSE-NEXT:    pandn %xmm1, %xmm7
2792; SSE-NEXT:    pshuflw $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2793; SSE-NEXT:    # xmm1 = mem[3,3,3,3,4,5,6,7]
2794; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
2795; SSE-NEXT:    pand %xmm15, %xmm1
2796; SSE-NEXT:    por %xmm1, %xmm7
2797; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255]
2798; SSE-NEXT:    movdqa %xmm1, %xmm3
2799; SSE-NEXT:    pandn %xmm7, %xmm3
2800; SSE-NEXT:    pand %xmm1, %xmm4
2801; SSE-NEXT:    por %xmm4, %xmm3
2802; SSE-NEXT:    pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
2803; SSE-NEXT:    # xmm4 = mem[2,1,2,3]
2804; SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2805; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,3,2,0]
2806; SSE-NEXT:    movdqa %xmm9, %xmm7
2807; SSE-NEXT:    pandn %xmm4, %xmm7
2808; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm6[3,3,3,3,4,5,6,7]
2809; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4]
2810; SSE-NEXT:    pand %xmm9, %xmm4
2811; SSE-NEXT:    por %xmm4, %xmm7
2812; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm14[3,3,3,3,4,5,6,7]
2813; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4]
2814; SSE-NEXT:    movdqa %xmm0, %xmm8
2815; SSE-NEXT:    pandn %xmm4, %xmm8
2816; SSE-NEXT:    pand %xmm0, %xmm7
2817; SSE-NEXT:    por %xmm7, %xmm8
2818; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255]
2819; SSE-NEXT:    movdqa %xmm5, %xmm2
2820; SSE-NEXT:    pandn %xmm8, %xmm2
2821; SSE-NEXT:    pand %xmm5, %xmm3
2822; SSE-NEXT:    por %xmm3, %xmm2
2823; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2824; SSE-NEXT:    movdqa (%rsi), %xmm6
2825; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[2,1,2,3]
2826; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2827; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2828; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[3,1,0,3]
2829; SSE-NEXT:    movdqa %xmm12, %xmm3
2830; SSE-NEXT:    pandn %xmm4, %xmm3
2831; SSE-NEXT:    movdqa (%rdi), %xmm13
2832; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm13[3,3,3,3,4,5,6,7]
2833; SSE-NEXT:    movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2834; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4]
2835; SSE-NEXT:    pand %xmm12, %xmm4
2836; SSE-NEXT:    por %xmm4, %xmm3
2837; SSE-NEXT:    movdqa (%rcx), %xmm14
2838; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm14[2,1,2,3]
2839; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2840; SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2841; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,3,0,3]
2842; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,3,3,3,4,5,6,7]
2843; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4]
2844; SSE-NEXT:    movdqa %xmm15, %xmm7
2845; SSE-NEXT:    pandn %xmm4, %xmm7
2846; SSE-NEXT:    movdqa (%rdx), %xmm8
2847; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm8[3,3,3,3,4,5,6,7]
2848; SSE-NEXT:    movdqa %xmm8, (%rsp) # 16-byte Spill
2849; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4]
2850; SSE-NEXT:    pand %xmm15, %xmm4
2851; SSE-NEXT:    por %xmm4, %xmm7
2852; SSE-NEXT:    pand %xmm1, %xmm3
2853; SSE-NEXT:    pandn %xmm7, %xmm1
2854; SSE-NEXT:    por %xmm3, %xmm1
2855; SSE-NEXT:    movdqa (%r9), %xmm11
2856; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm11[2,1,2,3]
2857; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2858; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2859; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,3,2,0]
2860; SSE-NEXT:    movdqa %xmm9, %xmm4
2861; SSE-NEXT:    pandn %xmm3, %xmm4
2862; SSE-NEXT:    movdqa (%r8), %xmm7
2863; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm7[3,3,3,3,4,5,6,7]
2864; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2865; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4]
2866; SSE-NEXT:    pand %xmm9, %xmm3
2867; SSE-NEXT:    por %xmm3, %xmm4
2868; SSE-NEXT:    pand %xmm0, %xmm4
2869; SSE-NEXT:    movdqa (%rax), %xmm10
2870; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm10[3,3,3,3,4,5,6,7]
2871; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2872; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4]
2873; SSE-NEXT:    pandn %xmm3, %xmm0
2874; SSE-NEXT:    por %xmm4, %xmm0
2875; SSE-NEXT:    pand %xmm5, %xmm1
2876; SSE-NEXT:    pandn %xmm0, %xmm5
2877; SSE-NEXT:    por %xmm1, %xmm5
2878; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2879; SSE-NEXT:    movdqa %xmm6, %xmm0
2880; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15]
2881; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2882; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,7]
2883; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2884; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255]
2885; SSE-NEXT:    movdqa %xmm2, %xmm1
2886; SSE-NEXT:    pandn %xmm0, %xmm1
2887; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,6,6,6,6]
2888; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
2889; SSE-NEXT:    pand %xmm2, %xmm0
2890; SSE-NEXT:    por %xmm0, %xmm1
2891; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255]
2892; SSE-NEXT:    movdqa %xmm2, %xmm3
2893; SSE-NEXT:    pandn %xmm1, %xmm3
2894; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,6,6,6,6]
2895; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
2896; SSE-NEXT:    movdqa %xmm9, %xmm1
2897; SSE-NEXT:    pandn %xmm0, %xmm1
2898; SSE-NEXT:    movdqa %xmm14, %xmm0
2899; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm14[8],xmm0[9],xmm14[9],xmm0[10],xmm14[10],xmm0[11],xmm14[11],xmm0[12],xmm14[12],xmm0[13],xmm14[13],xmm0[14],xmm14[14],xmm0[15],xmm14[15]
2900; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2901; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
2902; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
2903; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
2904; SSE-NEXT:    pand %xmm9, %xmm0
2905; SSE-NEXT:    por %xmm1, %xmm0
2906; SSE-NEXT:    pand %xmm2, %xmm0
2907; SSE-NEXT:    por %xmm3, %xmm0
2908; SSE-NEXT:    punpckhbw {{.*#+}} xmm11 = xmm11[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2909; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2910; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm11[1,1,2,3]
2911; SSE-NEXT:    movdqa %xmm12, %xmm3
2912; SSE-NEXT:    pandn %xmm1, %xmm3
2913; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,5,6,6,7]
2914; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,3,2]
2915; SSE-NEXT:    pand %xmm12, %xmm1
2916; SSE-NEXT:    por %xmm3, %xmm1
2917; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,4,5,6,6]
2918; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3]
2919; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255]
2920; SSE-NEXT:    movdqa %xmm2, %xmm4
2921; SSE-NEXT:    pandn %xmm3, %xmm4
2922; SSE-NEXT:    pand %xmm2, %xmm1
2923; SSE-NEXT:    por %xmm1, %xmm4
2924; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0]
2925; SSE-NEXT:    pand %xmm1, %xmm0
2926; SSE-NEXT:    pandn %xmm4, %xmm1
2927; SSE-NEXT:    por %xmm0, %xmm1
2928; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2929; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
2930; SSE-NEXT:    punpcklbw {{.*#+}} xmm11 = xmm11[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2931; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3]
2932; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2933; SSE-NEXT:    movdqa %xmm9, %xmm1
2934; SSE-NEXT:    pandn %xmm0, %xmm1
2935; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
2936; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm7[2,1,3,3,4,5,6,7]
2937; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
2938; SSE-NEXT:    pand %xmm9, %xmm0
2939; SSE-NEXT:    por %xmm1, %xmm0
2940; SSE-NEXT:    movdqa {{.*#+}} xmm13 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
2941; SSE-NEXT:    movdqa %xmm13, %xmm1
2942; SSE-NEXT:    pandn %xmm0, %xmm1
2943; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
2944; SSE-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2945; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,5,6,6,7]
2946; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2947; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
2948; SSE-NEXT:    movdqa %xmm15, %xmm3
2949; SSE-NEXT:    pandn %xmm0, %xmm3
2950; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2951; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm4[0,2,2,3,4,5,6,7]
2952; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
2953; SSE-NEXT:    pand %xmm15, %xmm0
2954; SSE-NEXT:    por %xmm0, %xmm3
2955; SSE-NEXT:    pand %xmm13, %xmm3
2956; SSE-NEXT:    por %xmm1, %xmm3
2957; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255]
2958; SSE-NEXT:    pandn %xmm3, %xmm0
2959; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
2960; SSE-NEXT:    punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2961; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,4,6,5,7]
2962; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2963; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,3,2]
2964; SSE-NEXT:    movdqa {{.*#+}} xmm12 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0]
2965; SSE-NEXT:    movdqa %xmm12, %xmm3
2966; SSE-NEXT:    pandn %xmm1, %xmm3
2967; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
2968; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm10[2,2,2,3,4,5,6,7]
2969; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1]
2970; SSE-NEXT:    pand %xmm12, %xmm1
2971; SSE-NEXT:    por %xmm1, %xmm3
2972; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
2973; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm5[2,2,2,2,4,5,6,7]
2974; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
2975; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255]
2976; SSE-NEXT:    movdqa %xmm2, %xmm14
2977; SSE-NEXT:    pandn %xmm1, %xmm14
2978; SSE-NEXT:    pand %xmm2, %xmm3
2979; SSE-NEXT:    por %xmm3, %xmm14
2980; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm14
2981; SSE-NEXT:    por %xmm0, %xmm14
2982; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2983; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,1,2,2]
2984; SSE-NEXT:    movdqa %xmm9, %xmm1
2985; SSE-NEXT:    pandn %xmm0, %xmm1
2986; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm4[1,1,2,3,4,5,6,7]
2987; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
2988; SSE-NEXT:    pand %xmm9, %xmm0
2989; SSE-NEXT:    por %xmm1, %xmm0
2990; SSE-NEXT:    movdqa %xmm13, %xmm1
2991; SSE-NEXT:    pandn %xmm0, %xmm1
2992; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm11[1,1,2,1]
2993; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,7,5,6,4]
2994; SSE-NEXT:    movdqa %xmm12, %xmm0
2995; SSE-NEXT:    pandn %xmm3, %xmm0
2996; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm7[1,1,2,2,4,5,6,7]
2997; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,2,1]
2998; SSE-NEXT:    pand %xmm12, %xmm3
2999; SSE-NEXT:    movdqa %xmm12, %xmm11
3000; SSE-NEXT:    por %xmm3, %xmm0
3001; SSE-NEXT:    pand %xmm13, %xmm0
3002; SSE-NEXT:    por %xmm1, %xmm0
3003; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm8[0,2,2,3,4,5,6,7]
3004; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
3005; SSE-NEXT:    movdqa {{.*#+}} xmm13 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255]
3006; SSE-NEXT:    movdqa %xmm13, %xmm3
3007; SSE-NEXT:    pandn %xmm1, %xmm3
3008; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm10[1,1,1,1,4,5,6,7]
3009; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
3010; SSE-NEXT:    pand %xmm13, %xmm1
3011; SSE-NEXT:    por %xmm1, %xmm3
3012; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm5[1,1,1,1,4,5,6,7]
3013; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
3014; SSE-NEXT:    movdqa %xmm15, %xmm4
3015; SSE-NEXT:    pandn %xmm1, %xmm4
3016; SSE-NEXT:    pand %xmm15, %xmm3
3017; SSE-NEXT:    por %xmm3, %xmm4
3018; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
3019; SSE-NEXT:    movdqa %xmm2, %xmm1
3020; SSE-NEXT:    pandn %xmm4, %xmm1
3021; SSE-NEXT:    pand %xmm2, %xmm0
3022; SSE-NEXT:    por %xmm0, %xmm1
3023; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3024; SSE-NEXT:    pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3025; SSE-NEXT:    # xmm0 = mem[0,1,1,3]
3026; SSE-NEXT:    movdqa {{.*#+}} xmm12 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255]
3027; SSE-NEXT:    movdqa %xmm12, %xmm1
3028; SSE-NEXT:    pandn %xmm0, %xmm1
3029; SSE-NEXT:    movdqa (%rsp), %xmm8 # 16-byte Reload
3030; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,5,5,7]
3031; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2]
3032; SSE-NEXT:    pand %xmm12, %xmm0
3033; SSE-NEXT:    por %xmm1, %xmm0
3034; SSE-NEXT:    movdqa {{.*#+}} xmm6 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0]
3035; SSE-NEXT:    movdqa %xmm6, %xmm3
3036; SSE-NEXT:    pandn %xmm0, %xmm3
3037; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
3038; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,5,5,5,5]
3039; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
3040; SSE-NEXT:    movdqa %xmm9, %xmm4
3041; SSE-NEXT:    pandn %xmm0, %xmm4
3042; SSE-NEXT:    pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3043; SSE-NEXT:    # xmm0 = mem[1,2,2,3,4,5,6,7]
3044; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
3045; SSE-NEXT:    pand %xmm9, %xmm0
3046; SSE-NEXT:    por %xmm4, %xmm0
3047; SSE-NEXT:    pand %xmm6, %xmm0
3048; SSE-NEXT:    por %xmm3, %xmm0
3049; SSE-NEXT:    pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
3050; SSE-NEXT:    # xmm3 = mem[1,2,2,3,4,5,6,7]
3051; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
3052; SSE-NEXT:    movdqa %xmm15, %xmm4
3053; SSE-NEXT:    pandn %xmm3, %xmm4
3054; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
3055; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,4,4,6,5]
3056; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3]
3057; SSE-NEXT:    pand %xmm15, %xmm3
3058; SSE-NEXT:    por %xmm3, %xmm4
3059; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
3060; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,5,5,7]
3061; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
3062; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255]
3063; SSE-NEXT:    movdqa %xmm1, %xmm10
3064; SSE-NEXT:    pandn %xmm3, %xmm10
3065; SSE-NEXT:    pand %xmm1, %xmm4
3066; SSE-NEXT:    por %xmm4, %xmm10
3067; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255]
3068; SSE-NEXT:    movdqa %xmm2, %xmm1
3069; SSE-NEXT:    pandn %xmm10, %xmm1
3070; SSE-NEXT:    pand %xmm2, %xmm0
3071; SSE-NEXT:    por %xmm0, %xmm1
3072; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3073; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3074; SSE-NEXT:    punpcklbw {{.*#+}} xmm14 = xmm14[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3075; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm14[0,1,2,2]
3076; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3077; SSE-NEXT:    movdqa %xmm9, %xmm3
3078; SSE-NEXT:    pandn %xmm0, %xmm3
3079; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm5[1,1,2,3,4,5,6,7]
3080; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
3081; SSE-NEXT:    pand %xmm9, %xmm0
3082; SSE-NEXT:    por %xmm3, %xmm0
3083; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
3084; SSE-NEXT:    movdqa %xmm5, %xmm3
3085; SSE-NEXT:    pandn %xmm0, %xmm3
3086; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
3087; SSE-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3088; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,1,2,1]
3089; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3090; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,7,5,6,4]
3091; SSE-NEXT:    movdqa %xmm11, %xmm0
3092; SSE-NEXT:    pandn %xmm4, %xmm0
3093; SSE-NEXT:    movdqa %xmm8, %xmm1
3094; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm8[1,1,2,2,4,5,6,7]
3095; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,0,2,1]
3096; SSE-NEXT:    pand %xmm11, %xmm4
3097; SSE-NEXT:    por %xmm4, %xmm0
3098; SSE-NEXT:    pand %xmm5, %xmm0
3099; SSE-NEXT:    por %xmm3, %xmm0
3100; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3101; SSE-NEXT:    punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3102; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm8[0,2,2,3,4,5,6,7]
3103; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3104; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,1,1,3]
3105; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255]
3106; SSE-NEXT:    movdqa %xmm2, %xmm4
3107; SSE-NEXT:    pandn %xmm3, %xmm4
3108; SSE-NEXT:    movdqa %xmm13, %xmm5
3109; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm13[1,1,1,1,4,5,6,7]
3110; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
3111; SSE-NEXT:    pand %xmm2, %xmm3
3112; SSE-NEXT:    por %xmm3, %xmm4
3113; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm12[1,1,1,1,4,5,6,7]
3114; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
3115; SSE-NEXT:    movdqa %xmm15, %xmm10
3116; SSE-NEXT:    pandn %xmm3, %xmm10
3117; SSE-NEXT:    pand %xmm15, %xmm4
3118; SSE-NEXT:    por %xmm4, %xmm10
3119; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
3120; SSE-NEXT:    pand %xmm3, %xmm0
3121; SSE-NEXT:    pandn %xmm10, %xmm3
3122; SSE-NEXT:    por %xmm0, %xmm3
3123; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3124; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm7[0,0,2,1,4,5,6,7]
3125; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
3126; SSE-NEXT:    movdqa %xmm2, %xmm3
3127; SSE-NEXT:    pandn %xmm0, %xmm3
3128; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7]
3129; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
3130; SSE-NEXT:    pand %xmm2, %xmm0
3131; SSE-NEXT:    por %xmm0, %xmm3
3132; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255]
3133; SSE-NEXT:    movdqa %xmm0, %xmm4
3134; SSE-NEXT:    pandn %xmm3, %xmm4
3135; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm14[0,2,1,3,4,5,6,7]
3136; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm3[0,1,1,0]
3137; SSE-NEXT:    movdqa %xmm11, %xmm3
3138; SSE-NEXT:    pandn %xmm10, %xmm3
3139; SSE-NEXT:    pshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
3140; SSE-NEXT:    # xmm10 = mem[0,0,2,1,4,5,6,7]
3141; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[0,0,2,1]
3142; SSE-NEXT:    pand %xmm11, %xmm10
3143; SSE-NEXT:    por %xmm10, %xmm3
3144; SSE-NEXT:    pand %xmm0, %xmm3
3145; SSE-NEXT:    por %xmm4, %xmm3
3146; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm8[0,1,1,3,4,5,6,7]
3147; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,0,2,1]
3148; SSE-NEXT:    movdqa {{.*#+}} xmm13 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255]
3149; SSE-NEXT:    movdqa %xmm13, %xmm10
3150; SSE-NEXT:    pandn %xmm4, %xmm10
3151; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm5[0,0,0,0,4,5,6,7]
3152; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
3153; SSE-NEXT:    pand %xmm13, %xmm4
3154; SSE-NEXT:    por %xmm4, %xmm10
3155; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm12[0,0,0,0,4,5,6,7]
3156; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
3157; SSE-NEXT:    movdqa %xmm9, %xmm2
3158; SSE-NEXT:    pandn %xmm4, %xmm2
3159; SSE-NEXT:    pand %xmm9, %xmm10
3160; SSE-NEXT:    por %xmm10, %xmm2
3161; SSE-NEXT:    movdqa {{.*#+}} xmm7 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255]
3162; SSE-NEXT:    movdqa %xmm7, %xmm1
3163; SSE-NEXT:    pandn %xmm2, %xmm1
3164; SSE-NEXT:    pand %xmm7, %xmm3
3165; SSE-NEXT:    por %xmm3, %xmm1
3166; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3167; SSE-NEXT:    pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
3168; SSE-NEXT:    # xmm2 = mem[0,1,1,3]
3169; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255]
3170; SSE-NEXT:    movdqa %xmm1, %xmm3
3171; SSE-NEXT:    pandn %xmm2, %xmm3
3172; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
3173; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,5,5,7]
3174; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,1,3,2]
3175; SSE-NEXT:    pand %xmm1, %xmm2
3176; SSE-NEXT:    por %xmm3, %xmm2
3177; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
3178; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,5,5,5,5]
3179; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2]
3180; SSE-NEXT:    movdqa %xmm9, %xmm5
3181; SSE-NEXT:    pandn %xmm3, %xmm5
3182; SSE-NEXT:    pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
3183; SSE-NEXT:    # xmm3 = mem[1,2,2,3,4,5,6,7]
3184; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,2,1]
3185; SSE-NEXT:    pand %xmm9, %xmm3
3186; SSE-NEXT:    por %xmm5, %xmm3
3187; SSE-NEXT:    pand %xmm6, %xmm3
3188; SSE-NEXT:    pandn %xmm2, %xmm6
3189; SSE-NEXT:    por %xmm3, %xmm6
3190; SSE-NEXT:    pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
3191; SSE-NEXT:    # xmm2 = mem[1,2,2,3,4,5,6,7]
3192; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
3193; SSE-NEXT:    movdqa %xmm15, %xmm3
3194; SSE-NEXT:    pandn %xmm2, %xmm3
3195; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3196; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,4,6,5]
3197; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3]
3198; SSE-NEXT:    pand %xmm15, %xmm2
3199; SSE-NEXT:    por %xmm2, %xmm3
3200; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
3201; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,4,5,5,7]
3202; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
3203; SSE-NEXT:    movdqa %xmm13, %xmm5
3204; SSE-NEXT:    pandn %xmm2, %xmm5
3205; SSE-NEXT:    pand %xmm13, %xmm3
3206; SSE-NEXT:    por %xmm3, %xmm5
3207; SSE-NEXT:    movdqa {{.*#+}} xmm14 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255]
3208; SSE-NEXT:    pand %xmm14, %xmm6
3209; SSE-NEXT:    pandn %xmm5, %xmm14
3210; SSE-NEXT:    por %xmm6, %xmm14
3211; SSE-NEXT:    pshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3212; SSE-NEXT:    # xmm1 = mem[0,0,2,1,4,5,6,7]
3213; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
3214; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255]
3215; SSE-NEXT:    movdqa %xmm3, %xmm2
3216; SSE-NEXT:    pandn %xmm1, %xmm2
3217; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm4[0,0,0,0,4,5,6,7]
3218; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
3219; SSE-NEXT:    pand %xmm3, %xmm1
3220; SSE-NEXT:    movdqa %xmm3, %xmm4
3221; SSE-NEXT:    por %xmm1, %xmm2
3222; SSE-NEXT:    pshuflw $216, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3223; SSE-NEXT:    # xmm1 = mem[0,2,1,3,4,5,6,7]
3224; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,1,0]
3225; SSE-NEXT:    movdqa %xmm11, %xmm3
3226; SSE-NEXT:    pandn %xmm1, %xmm3
3227; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm12[0,0,2,1,4,5,6,7]
3228; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1]
3229; SSE-NEXT:    pand %xmm11, %xmm1
3230; SSE-NEXT:    por %xmm1, %xmm3
3231; SSE-NEXT:    pand %xmm0, %xmm3
3232; SSE-NEXT:    pandn %xmm2, %xmm0
3233; SSE-NEXT:    por %xmm3, %xmm0
3234; SSE-NEXT:    pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3235; SSE-NEXT:    # xmm1 = mem[0,1,1,3,4,5,6,7]
3236; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1]
3237; SSE-NEXT:    movdqa %xmm13, %xmm2
3238; SSE-NEXT:    pandn %xmm1, %xmm2
3239; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm8[0,0,0,0,4,5,6,7]
3240; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
3241; SSE-NEXT:    pand %xmm13, %xmm1
3242; SSE-NEXT:    por %xmm1, %xmm2
3243; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm10[0,0,0,0,4,5,6,7]
3244; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
3245; SSE-NEXT:    movdqa %xmm9, %xmm3
3246; SSE-NEXT:    pandn %xmm1, %xmm3
3247; SSE-NEXT:    pand %xmm9, %xmm2
3248; SSE-NEXT:    por %xmm2, %xmm3
3249; SSE-NEXT:    pand %xmm7, %xmm0
3250; SSE-NEXT:    pandn %xmm3, %xmm7
3251; SSE-NEXT:    por %xmm0, %xmm7
3252; SSE-NEXT:    movdqa (%rsp), %xmm5 # 16-byte Reload
3253; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,7,7,7,7]
3254; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
3255; SSE-NEXT:    pand %xmm13, %xmm0
3256; SSE-NEXT:    pshufhw $246, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3257; SSE-NEXT:    # xmm1 = mem[0,1,2,3,6,5,7,7]
3258; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
3259; SSE-NEXT:    pandn %xmm1, %xmm13
3260; SSE-NEXT:    por %xmm0, %xmm13
3261; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
3262; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,7,7,7,7]
3263; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
3264; SSE-NEXT:    movdqa %xmm4, %xmm3
3265; SSE-NEXT:    pand %xmm4, %xmm0
3266; SSE-NEXT:    pshufhw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3267; SSE-NEXT:    # xmm1 = mem[0,1,2,3,4,6,6,7]
3268; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
3269; SSE-NEXT:    pandn %xmm1, %xmm3
3270; SSE-NEXT:    por %xmm0, %xmm3
3271; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255]
3272; SSE-NEXT:    pand %xmm0, %xmm3
3273; SSE-NEXT:    pandn %xmm13, %xmm0
3274; SSE-NEXT:    por %xmm3, %xmm0
3275; SSE-NEXT:    movdqa %xmm0, %xmm3
3276; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
3277; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,7,7,7,7]
3278; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
3279; SSE-NEXT:    movdqa %xmm9, %xmm1
3280; SSE-NEXT:    pandn %xmm0, %xmm1
3281; SSE-NEXT:    pshufhw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3282; SSE-NEXT:    # xmm0 = mem[0,1,2,3,5,6,6,7]
3283; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,3]
3284; SSE-NEXT:    pand %xmm9, %xmm0
3285; SSE-NEXT:    por %xmm1, %xmm0
3286; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
3287; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,6,7,7,7]
3288; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,3,2]
3289; SSE-NEXT:    movdqa %xmm11, %xmm2
3290; SSE-NEXT:    pandn %xmm1, %xmm2
3291; SSE-NEXT:    pand %xmm11, %xmm0
3292; SSE-NEXT:    por %xmm0, %xmm2
3293; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0]
3294; SSE-NEXT:    pand %xmm0, %xmm3
3295; SSE-NEXT:    pandn %xmm2, %xmm0
3296; SSE-NEXT:    por %xmm3, %xmm0
3297; SSE-NEXT:    pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3298; SSE-NEXT:    # xmm1 = mem[2,2,3,3]
3299; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm5[2,1,3,3,4,5,6,7]
3300; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1]
3301; SSE-NEXT:    pand %xmm9, %xmm2
3302; SSE-NEXT:    pandn %xmm1, %xmm9
3303; SSE-NEXT:    por %xmm2, %xmm9
3304; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm6[0,2,2,3,4,5,6,7]
3305; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
3306; SSE-NEXT:    pand %xmm15, %xmm1
3307; SSE-NEXT:    pshufhw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
3308; SSE-NEXT:    # xmm2 = mem[0,1,2,3,5,6,6,7]
3309; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2]
3310; SSE-NEXT:    pandn %xmm2, %xmm15
3311; SSE-NEXT:    por %xmm1, %xmm15
3312; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
3313; SSE-NEXT:    pand %xmm1, %xmm15
3314; SSE-NEXT:    pandn %xmm9, %xmm1
3315; SSE-NEXT:    por %xmm15, %xmm1
3316; SSE-NEXT:    movdqa %xmm1, %xmm3
3317; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm4[2,2,2,3,4,5,6,7]
3318; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1]
3319; SSE-NEXT:    pand %xmm11, %xmm1
3320; SSE-NEXT:    pshufhw $216, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
3321; SSE-NEXT:    # xmm2 = mem[0,1,2,3,4,6,5,7]
3322; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,1,3,2]
3323; SSE-NEXT:    pandn %xmm2, %xmm11
3324; SSE-NEXT:    por %xmm1, %xmm11
3325; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255]
3326; SSE-NEXT:    pand %xmm4, %xmm11
3327; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm13[2,2,2,2,4,5,6,7]
3328; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
3329; SSE-NEXT:    pandn %xmm1, %xmm4
3330; SSE-NEXT:    por %xmm11, %xmm4
3331; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255]
3332; SSE-NEXT:    pand %xmm1, %xmm4
3333; SSE-NEXT:    pandn %xmm3, %xmm1
3334; SSE-NEXT:    por %xmm1, %xmm4
3335; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3336; SSE-NEXT:    movdqa %xmm4, 32(%rax)
3337; SSE-NEXT:    movdqa %xmm0, 96(%rax)
3338; SSE-NEXT:    movdqa %xmm7, 112(%rax)
3339; SSE-NEXT:    movdqa %xmm14, 176(%rax)
3340; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3341; SSE-NEXT:    movaps %xmm0, (%rax)
3342; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3343; SSE-NEXT:    movaps %xmm0, 16(%rax)
3344; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3345; SSE-NEXT:    movaps %xmm0, 64(%rax)
3346; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3347; SSE-NEXT:    movaps %xmm0, 128(%rax)
3348; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3349; SSE-NEXT:    movaps %xmm0, 144(%rax)
3350; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3351; SSE-NEXT:    movaps %xmm0, 80(%rax)
3352; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3353; SSE-NEXT:    movaps %xmm0, 48(%rax)
3354; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3355; SSE-NEXT:    movaps %xmm0, 160(%rax)
3356; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3357; SSE-NEXT:    movaps %xmm0, 208(%rax)
3358; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3359; SSE-NEXT:    movaps %xmm0, 192(%rax)
3360; SSE-NEXT:    addq $360, %rsp # imm = 0x168
3361; SSE-NEXT:    retq
3362;
3363; AVX-LABEL: store_i8_stride7_vf32:
3364; AVX:       # %bb.0:
3365; AVX-NEXT:    subq $216, %rsp
3366; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3367; AVX-NEXT:    vmovdqa 16(%rax), %xmm14
3368; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm14[u,u,u],zero,zero,xmm14[9,u,u,u,u],zero,zero,xmm14[10,u,u,u]
3369; AVX-NEXT:    vmovdqa 16(%r9), %xmm2
3370; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3371; AVX-NEXT:    vmovdqa 16(%r8), %xmm3
3372; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3373; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
3374; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3375; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,2,3],zero,xmm1[u,u,u,u,4,5],zero,xmm1[u,u,u]
3376; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
3377; AVX-NEXT:    vmovdqa {{.*#+}} xmm11 = [u,u,u,u,u,128,7,u,u,u,u,u,128,8,u,u]
3378; AVX-NEXT:    vpshufb %xmm11, %xmm2, %xmm1
3379; AVX-NEXT:    vmovdqa {{.*#+}} xmm12 = [u,u,u,u,u,7,128,u,u,u,u,u,8,128,u,u]
3380; AVX-NEXT:    vpshufb %xmm12, %xmm3, %xmm3
3381; AVX-NEXT:    vpor %xmm1, %xmm3, %xmm1
3382; AVX-NEXT:    vmovdqa {{.*#+}} xmm8 = [128,u,u,u,u,5,6,128,u,u,u,u,12,13,128,u]
3383; AVX-NEXT:    vpshufb %xmm8, %xmm1, %xmm1
3384; AVX-NEXT:    vmovdqa {{.*#+}} xmm5 = [6,u,u,u,u,128,128,7,u,u,u,u,128,128,8,u]
3385; AVX-NEXT:    vpshufb %xmm5, %xmm14, %xmm3
3386; AVX-NEXT:    vpor %xmm3, %xmm1, %xmm1
3387; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm7
3388; AVX-NEXT:    vmovdqa 16(%rcx), %xmm1
3389; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3390; AVX-NEXT:    vmovdqa {{.*#+}} xmm4 = [u,u,u,128,7,u,u,u,u,u,128,8,u,u,u,u]
3391; AVX-NEXT:    vpshufb %xmm4, %xmm1, %xmm0
3392; AVX-NEXT:    vmovdqa 16(%rdx), %xmm2
3393; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3394; AVX-NEXT:    vmovdqa {{.*#+}} xmm9 = [u,u,u,7,128,u,u,u,u,u,8,128,u,u,u,u]
3395; AVX-NEXT:    vpshufb %xmm9, %xmm2, %xmm3
3396; AVX-NEXT:    vpor %xmm0, %xmm3, %xmm0
3397; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
3398; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3399; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6]
3400; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm15
3401; AVX-NEXT:    vmovdqa 16(%rsi), %xmm10
3402; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,128,7,u,u,u,u,u,128,8,u,u,u,u,u,128]
3403; AVX-NEXT:    vpshufb %xmm1, %xmm10, %xmm0
3404; AVX-NEXT:    vmovdqa 16(%rdi), %xmm6
3405; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm6[u,7],zero,xmm6[u,u,u,u,u,8],zero,xmm6[u,u,u,u,u,9]
3406; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
3407; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm10[8],xmm6[8],xmm10[9],xmm6[9],xmm10[10],xmm6[10],xmm10[11],xmm6[11],xmm10[12],xmm6[12],xmm10[13],xmm6[13],xmm10[14],xmm6[14],xmm10[15],xmm6[15]
3408; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u]
3409; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
3410; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0]
3411; AVX-NEXT:    vandnps %ymm15, %ymm2, %ymm15
3412; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
3413; AVX-NEXT:    vorps %ymm0, %ymm15, %ymm0
3414; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255]
3415; AVX-NEXT:    vandnps %ymm7, %ymm2, %ymm7
3416; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
3417; AVX-NEXT:    vorps %ymm7, %ymm0, %ymm0
3418; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3419; AVX-NEXT:    vmovdqa (%r9), %xmm7
3420; AVX-NEXT:    vpshufb %xmm11, %xmm7, %xmm0
3421; AVX-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3422; AVX-NEXT:    vmovdqa (%r8), %xmm3
3423; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3424; AVX-NEXT:    vpshufb %xmm12, %xmm3, %xmm2
3425; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
3426; AVX-NEXT:    vpshufb %xmm8, %xmm0, %xmm0
3427; AVX-NEXT:    vmovdqa (%rax), %xmm8
3428; AVX-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3429; AVX-NEXT:    vpshufb %xmm5, %xmm8, %xmm2
3430; AVX-NEXT:    vpor %xmm2, %xmm0, %xmm0
3431; AVX-NEXT:    vmovdqa {{.*#+}} xmm5 = [128,128,4,u,u,u,u,128,128,5,u,u,u,u,128,128]
3432; AVX-NEXT:    vpshufb %xmm5, %xmm8, %xmm2
3433; AVX-NEXT:    vmovdqa %xmm5, %xmm8
3434; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
3435; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3436; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm3[8,9],zero,xmm3[u,u,u,u,10,11],zero,xmm3[u,u,u,u,12,13]
3437; AVX-NEXT:    vpor %xmm2, %xmm5, %xmm2
3438; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm15
3439; AVX-NEXT:    vmovdqa (%rcx), %xmm3
3440; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3441; AVX-NEXT:    vpshufb %xmm4, %xmm3, %xmm0
3442; AVX-NEXT:    vmovdqa (%rdx), %xmm13
3443; AVX-NEXT:    vpshufb %xmm9, %xmm13, %xmm2
3444; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
3445; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3],xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7]
3446; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3447; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u]
3448; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
3449; AVX-NEXT:    vmovdqa (%rsi), %xmm5
3450; AVX-NEXT:    vpshufb %xmm1, %xmm5, %xmm2
3451; AVX-NEXT:    vmovdqa (%rdi), %xmm3
3452; AVX-NEXT:    vpshufb {{.*#+}} xmm11 = xmm3[u,7],zero,xmm3[u,u,u,u,u,8],zero,xmm3[u,u,u,u,u,9]
3453; AVX-NEXT:    vpor %xmm2, %xmm11, %xmm2
3454; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
3455; AVX-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3456; AVX-NEXT:    vmovdqa {{.*#+}} xmm12 = [u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u]
3457; AVX-NEXT:    vpshufb %xmm12, %xmm9, %xmm11
3458; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm11, %ymm11
3459; AVX-NEXT:    vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255]
3460; AVX-NEXT:    vandnps %ymm0, %ymm4, %ymm0
3461; AVX-NEXT:    vandps %ymm4, %ymm11, %ymm11
3462; AVX-NEXT:    vorps %ymm0, %ymm11, %ymm0
3463; AVX-NEXT:    vmovaps {{.*#+}} ymm11 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255]
3464; AVX-NEXT:    vandnps %ymm15, %ymm11, %ymm15
3465; AVX-NEXT:    vandps %ymm0, %ymm11, %ymm0
3466; AVX-NEXT:    vorps %ymm0, %ymm15, %ymm0
3467; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3468; AVX-NEXT:    vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3469; AVX-NEXT:    vpshufb %xmm8, %xmm14, %xmm0
3470; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3471; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
3472; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
3473; AVX-NEXT:    vmovdqa %xmm1, (%rsp) # 16-byte Spill
3474; AVX-NEXT:    vpshufb {{.*#+}} xmm11 = xmm1[8,9],zero,xmm1[u,u,u,u,10,11],zero,xmm1[u,u,u,u,12,13]
3475; AVX-NEXT:    vpor %xmm0, %xmm11, %xmm0
3476; AVX-NEXT:    vpshufb {{.*#+}} xmm11 = xmm14[u,u],zero,zero,xmm14[2,u,u,u,u],zero,zero,xmm14[3,u,u,u,u]
3477; AVX-NEXT:    vpshufb {{.*#+}} xmm15 = xmm1[u,u,4,5],zero,xmm1[u,u,u,u,6,7],zero,xmm1[u,u,u,u]
3478; AVX-NEXT:    vpor %xmm11, %xmm15, %xmm11
3479; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm11, %ymm15
3480; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7]
3481; AVX-NEXT:    vpshufb %xmm12, %xmm11, %xmm0
3482; AVX-NEXT:    vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u]
3483; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm12, %ymm12
3484; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3485; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3486; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
3487; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm9[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u]
3488; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = xmm9[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9]
3489; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm14, %ymm0
3490; AVX-NEXT:    vandnps %ymm12, %ymm4, %ymm12
3491; AVX-NEXT:    vandps %ymm4, %ymm0, %ymm0
3492; AVX-NEXT:    vorps %ymm0, %ymm12, %ymm0
3493; AVX-NEXT:    vmovaps {{.*#+}} ymm4 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0]
3494; AVX-NEXT:    vandnps %ymm15, %ymm4, %ymm12
3495; AVX-NEXT:    vandps %ymm4, %ymm0, %ymm0
3496; AVX-NEXT:    vorps %ymm0, %ymm12, %ymm0
3497; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3498; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm10[8],xmm6[9],xmm10[9],xmm6[10],xmm10[10],xmm6[11],xmm10[11],xmm6[12],xmm10[12],xmm6[13],xmm10[13],xmm6[14],xmm10[14],xmm6[15],xmm10[15]
3499; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u]
3500; AVX-NEXT:    vmovdqa {{.*#+}} xmm10 = [u,u,u,u,8,9,u,u,u,u,u,10,11,u,u,u]
3501; AVX-NEXT:    vpshufb %xmm10, %xmm0, %xmm0
3502; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
3503; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
3504; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3505; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u]
3506; AVX-NEXT:    vmovdqa {{.*#+}} xmm15 = [6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u]
3507; AVX-NEXT:    vpshufb %xmm15, %xmm4, %xmm4
3508; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm4, %ymm6
3509; AVX-NEXT:    vmovaps {{.*#+}} ymm4 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255]
3510; AVX-NEXT:    vandnps %ymm0, %ymm4, %ymm0
3511; AVX-NEXT:    vandps %ymm4, %ymm6, %ymm6
3512; AVX-NEXT:    vorps %ymm0, %ymm6, %ymm0
3513; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15]
3514; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
3515; AVX-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,6,7,u,u,u,u,u,8,9,u,u,u,u,u,10]
3516; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u]
3517; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm12, %ymm6
3518; AVX-NEXT:    vmovaps {{.*#+}} ymm12 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255]
3519; AVX-NEXT:    vandps %ymm0, %ymm12, %ymm0
3520; AVX-NEXT:    vandnps %ymm6, %ymm12, %ymm6
3521; AVX-NEXT:    vorps %ymm6, %ymm0, %ymm0
3522; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm6
3523; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[0],zero,xmm6[2,3,4,5,6,7],zero,xmm6[9,10,11,12,13,14],zero
3524; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3525; AVX-NEXT:    vpshufb {{.*#+}} xmm12 = zero,xmm14[13],zero,zero,zero,zero,zero,zero,xmm14[14],zero,zero,zero,zero,zero,zero,xmm14[15]
3526; AVX-NEXT:    vpor %xmm6, %xmm12, %xmm1
3527; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3528; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2],zero,xmm0[4,5,6,7,8,9],zero,xmm0[11,12,13,14,15]
3529; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm14[11],zero,zero,zero,zero,zero,zero,xmm14[12],zero,zero,zero,zero,zero
3530; AVX-NEXT:    vpor %xmm6, %xmm0, %xmm0
3531; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm12 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
3532; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
3533; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u]
3534; AVX-NEXT:    vpshufb %xmm10, %xmm12, %xmm2
3535; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
3536; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3537; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm13[8],xmm3[9],xmm13[9],xmm3[10],xmm13[10],xmm3[11],xmm13[11],xmm3[12],xmm13[12],xmm3[13],xmm13[13],xmm3[14],xmm13[14],xmm3[15],xmm13[15]
3538; AVX-NEXT:    vpshufb %xmm15, %xmm2, %xmm2
3539; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm13[8],xmm3[8],xmm13[9],xmm3[9],xmm13[10],xmm3[10],xmm13[11],xmm3[11],xmm13[12],xmm3[12],xmm13[13],xmm3[13],xmm13[14],xmm3[14],xmm13[15],xmm3[15]
3540; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm3[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6]
3541; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm5, %ymm2
3542; AVX-NEXT:    vmovaps {{.*#+}} ymm5 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255]
3543; AVX-NEXT:    vandnps %ymm1, %ymm5, %ymm1
3544; AVX-NEXT:    vandps %ymm5, %ymm2, %ymm2
3545; AVX-NEXT:    vorps %ymm1, %ymm2, %ymm1
3546; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3547; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm8[u,u,u],zero,zero,xmm8[9,u,u,u,u],zero,zero,xmm8[10,u,u,u]
3548; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
3549; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
3550; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm15[8],xmm10[8],xmm15[9],xmm10[9],xmm15[10],xmm10[10],xmm15[11],xmm10[11],xmm15[12],xmm10[12],xmm15[13],xmm10[13],xmm15[14],xmm10[14],xmm15[15],xmm10[15]
3551; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,2,3],zero,xmm5[u,u,u,u,4,5],zero,xmm5[u,u,u]
3552; AVX-NEXT:    vpor %xmm2, %xmm6, %xmm2
3553; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,6,7],zero,xmm5[u,u,u,u,8,9],zero,xmm5[u,u,u,u,10]
3554; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm8[u],zero,zero,xmm8[11,u,u,u,u],zero,zero,xmm8[12,u,u,u,u],zero
3555; AVX-NEXT:    vpor %xmm6, %xmm5, %xmm5
3556; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm2, %ymm2
3557; AVX-NEXT:    vmovaps {{.*#+}} ymm5 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0]
3558; AVX-NEXT:    vandps %ymm5, %ymm1, %ymm1
3559; AVX-NEXT:    vandnps %ymm2, %ymm5, %ymm2
3560; AVX-NEXT:    vorps %ymm2, %ymm1, %ymm6
3561; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm3[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u]
3562; AVX-NEXT:    vpmovsxdq {{.*#+}} xmm2 = [16777216,197120]
3563; AVX-NEXT:    vpshufb %xmm2, %xmm9, %xmm3
3564; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
3565; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm12[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u]
3566; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,u,u,u,u,u,2,3,u,u,u,u,u,4,5]
3567; AVX-NEXT:    vpshufb %xmm3, %xmm11, %xmm7
3568; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm5, %ymm5
3569; AVX-NEXT:    vandnps %ymm1, %ymm4, %ymm1
3570; AVX-NEXT:    vandps %ymm4, %ymm5, %ymm4
3571; AVX-NEXT:    vorps %ymm1, %ymm4, %ymm4
3572; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,u,u,u,0,1,u,u,u,u,u,2,3,u,u,u]
3573; AVX-NEXT:    vmovdqa (%rsp), %xmm5 # 16-byte Reload
3574; AVX-NEXT:    vpshufb %xmm1, %xmm5, %xmm5
3575; AVX-NEXT:    vpalignr {{.*#+}} xmm5 = xmm5[4,5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3]
3576; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm10[8],xmm15[8],xmm10[9],xmm15[9],xmm10[10],xmm15[10],xmm10[11],xmm15[11],xmm10[12],xmm15[12],xmm10[13],xmm15[13],xmm10[14],xmm15[14],xmm10[15],xmm15[15]
3577; AVX-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[10],zero,xmm7[u,u,u,u,13,12],zero,xmm7[u,u,u,u,15,14],zero
3578; AVX-NEXT:    vpshufb {{.*#+}} xmm9 = zero,xmm8[13,u,u,u,u],zero,zero,xmm8[14,u,u,u,u],zero,zero,xmm8[15]
3579; AVX-NEXT:    vpor %xmm7, %xmm9, %xmm7
3580; AVX-NEXT:    vmovdqa {{.*#+}} xmm9 = [u,u,u,u,0,1,12,u,u,u,u,7,8,13,u,u]
3581; AVX-NEXT:    vpshufb %xmm9, %xmm5, %xmm5
3582; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm7, %ymm5
3583; AVX-NEXT:    vmovaps {{.*#+}} ymm7 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255]
3584; AVX-NEXT:    vandps %ymm7, %ymm4, %ymm4
3585; AVX-NEXT:    vandnps %ymm5, %ymm7, %ymm5
3586; AVX-NEXT:    vorps %ymm5, %ymm4, %ymm5
3587; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
3588; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm7[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9]
3589; AVX-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
3590; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
3591; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
3592; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm7[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u]
3593; AVX-NEXT:    vpshufb %xmm3, %xmm7, %xmm3
3594; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
3595; AVX-NEXT:    vmovaps {{.*#+}} ymm4 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0]
3596; AVX-NEXT:    vandnps %ymm2, %ymm4, %ymm2
3597; AVX-NEXT:    vandps %ymm4, %ymm3, %ymm3
3598; AVX-NEXT:    vorps %ymm2, %ymm3, %ymm2
3599; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm8[u,u],zero,zero,xmm8[2,u,u,u,u],zero,zero,xmm8[3,u,u,u,u]
3600; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
3601; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm7[u,u,4,5],zero,xmm7[u,u,u,u,6,7],zero,xmm7[u,u,u,u]
3602; AVX-NEXT:    vpor %xmm3, %xmm4, %xmm3
3603; AVX-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
3604; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3]
3605; AVX-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
3606; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
3607; AVX-NEXT:    vmovaps {{.*#+}} ymm3 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
3608; AVX-NEXT:    vandps %ymm3, %ymm2, %ymm2
3609; AVX-NEXT:    vandnps %ymm1, %ymm3, %ymm1
3610; AVX-NEXT:    vorps %ymm1, %ymm2, %ymm1
3611; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3612; AVX-NEXT:    vmovaps %ymm1, (%rax)
3613; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3614; AVX-NEXT:    vmovaps %ymm1, 128(%rax)
3615; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3616; AVX-NEXT:    vmovaps %ymm1, 32(%rax)
3617; AVX-NEXT:    vmovaps %ymm5, 96(%rax)
3618; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3619; AVX-NEXT:    vmovaps %ymm1, 160(%rax)
3620; AVX-NEXT:    vmovaps %ymm6, 64(%rax)
3621; AVX-NEXT:    vmovdqa %xmm0, 192(%rax)
3622; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3623; AVX-NEXT:    vmovaps %xmm0, 208(%rax)
3624; AVX-NEXT:    addq $216, %rsp
3625; AVX-NEXT:    vzeroupper
3626; AVX-NEXT:    retq
3627;
3628; AVX2-LABEL: store_i8_stride7_vf32:
3629; AVX2:       # %bb.0:
3630; AVX2-NEXT:    pushq %rax
3631; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3632; AVX2-NEXT:    vmovdqa (%rdi), %ymm4
3633; AVX2-NEXT:    vmovdqa (%rsi), %ymm6
3634; AVX2-NEXT:    vmovdqa (%rdx), %ymm3
3635; AVX2-NEXT:    vmovdqa (%rcx), %ymm5
3636; AVX2-NEXT:    vmovdqa (%r8), %ymm7
3637; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3638; AVX2-NEXT:    vmovdqa (%r9), %ymm2
3639; AVX2-NEXT:    vmovdqa (%rax), %ymm1
3640; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3641; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u]
3642; AVX2-NEXT:    vpshufhw {{.*#+}} ymm8 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
3643; AVX2-NEXT:    vpshufd {{.*#+}} ymm8 = ymm8[0,2,3,3,4,6,7,7]
3644; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [255,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0]
3645; AVX2-NEXT:    # ymm9 = mem[0,1,0,1]
3646; AVX2-NEXT:    vpblendvb %ymm9, %ymm0, %ymm8, %ymm0
3647; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
3648; AVX2-NEXT:    vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u]
3649; AVX2-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm4[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14]
3650; AVX2-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,2,3,3,6,6,7,7]
3651; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0]
3652; AVX2-NEXT:    # ymm10 = mem[0,1,0,1]
3653; AVX2-NEXT:    vpblendvb %ymm10, %ymm9, %ymm8, %ymm8
3654; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
3655; AVX2-NEXT:    vmovdqa {{.*#+}} ymm9 = [255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u]
3656; AVX2-NEXT:    vpblendvb %ymm9, %ymm0, %ymm8, %ymm0
3657; AVX2-NEXT:    vpshufb {{.*#+}} ymm8 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[27],zero,ymm7[27,28,29,30],zero,ymm7[28],zero,ymm7[26,27,30,31],zero,ymm7[29]
3658; AVX2-NEXT:    vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm2[27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero
3659; AVX2-NEXT:    vpor %ymm8, %ymm9, %ymm8
3660; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
3661; AVX2-NEXT:    vmovdqa {{.*#+}} ymm9 = [255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u]
3662; AVX2-NEXT:    vpblendvb %ymm9, %ymm0, %ymm8, %ymm0
3663; AVX2-NEXT:    vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
3664; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
3665; AVX2-NEXT:    vmovdqa {{.*#+}} ymm9 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0]
3666; AVX2-NEXT:    vpblendvb %ymm9, %ymm0, %ymm8, %ymm0
3667; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3668; AVX2-NEXT:    vmovdqa (%rdx), %xmm10
3669; AVX2-NEXT:    vmovdqa (%rcx), %xmm11
3670; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15]
3671; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
3672; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm0[0,1,0,1]
3673; AVX2-NEXT:    vmovdqa (%rdi), %xmm14
3674; AVX2-NEXT:    vmovdqa (%rsi), %xmm0
3675; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm0[8],xmm14[8],xmm0[9],xmm14[9],xmm0[10],xmm14[10],xmm0[11],xmm14[11],xmm0[12],xmm14[12],xmm0[13],xmm14[13],xmm0[14],xmm14[14],xmm0[15],xmm14[15]
3676; AVX2-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
3677; AVX2-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1]
3678; AVX2-NEXT:    vmovdqa {{.*#+}} ymm12 = [0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u]
3679; AVX2-NEXT:    vpblendvb %ymm12, %ymm8, %ymm9, %ymm8
3680; AVX2-NEXT:    vmovdqa (%r9), %xmm12
3681; AVX2-NEXT:    vmovdqa (%r8), %xmm13
3682; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15]
3683; AVX2-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
3684; AVX2-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1]
3685; AVX2-NEXT:    vmovdqa (%rax), %xmm15
3686; AVX2-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm15[0,1,2,3,4,5,5,6]
3687; AVX2-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3]
3688; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1]
3689; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255]
3690; AVX2-NEXT:    vpblendvb %ymm1, %ymm9, %ymm7, %ymm1
3691; AVX2-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0]
3692; AVX2-NEXT:    vpblendvb %ymm7, %ymm8, %ymm1, %ymm1
3693; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3694; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[u],zero,xmm0[7],zero,xmm0[5,u,u,u],zero,xmm0[8],zero,xmm0[6,u,u,u],zero
3695; AVX2-NEXT:    vpshufb {{.*#+}} xmm7 = xmm14[u,7],zero,xmm14[5],zero,xmm14[u,u,u,8],zero,xmm14[6],zero,xmm14[u,u,u,9]
3696; AVX2-NEXT:    vpor %xmm1, %xmm7, %xmm1
3697; AVX2-NEXT:    vpshufb {{.*#+}} xmm7 = xmm11[u,u,u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6,u,u]
3698; AVX2-NEXT:    vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,7],zero,xmm10[5],zero,xmm10[u,u,u,8],zero,xmm10[6],zero,xmm10[u,u]
3699; AVX2-NEXT:    vpor %xmm7, %xmm9, %xmm7
3700; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
3701; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1]
3702; AVX2-NEXT:    vmovdqa {{.*#+}} ymm9 = [u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255]
3703; AVX2-NEXT:    vpblendvb %ymm9, %ymm1, %ymm7, %ymm1
3704; AVX2-NEXT:    vpshufb {{.*#+}} xmm7 = zero,xmm12[4,u,u,u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6]
3705; AVX2-NEXT:    vpshufb {{.*#+}} xmm9 = xmm13[4],zero,xmm13[u,u,u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero
3706; AVX2-NEXT:    vpor %xmm7, %xmm9, %xmm7
3707; AVX2-NEXT:    vpshufb {{.*#+}} xmm9 = xmm15[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
3708; AVX2-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,1,0]
3709; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1]
3710; AVX2-NEXT:    vmovdqa {{.*#+}} ymm8 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u]
3711; AVX2-NEXT:    vpblendvb %ymm8, %ymm7, %ymm9, %ymm7
3712; AVX2-NEXT:    vmovdqa {{.*#+}} ymm8 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255]
3713; AVX2-NEXT:    vpblendvb %ymm8, %ymm1, %ymm7, %ymm9
3714; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
3715; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
3716; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
3717; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7]
3718; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
3719; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
3720; AVX2-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0]
3721; AVX2-NEXT:    vpblendvb %ymm7, %ymm0, %ymm1, %ymm0
3722; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
3723; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
3724; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
3725; AVX2-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm15[1,1,0,0,4,5,6,7]
3726; AVX2-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,0]
3727; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,0,1,0]
3728; AVX2-NEXT:    vmovdqa {{.*#+}} ymm8 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u]
3729; AVX2-NEXT:    vpblendvb %ymm8, %ymm1, %ymm7, %ymm1
3730; AVX2-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
3731; AVX2-NEXT:    vpblendvb %ymm7, %ymm0, %ymm1, %ymm10
3732; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u]
3733; AVX2-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
3734; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5]
3735; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0]
3736; AVX2-NEXT:    # ymm7 = mem[0,1,0,1]
3737; AVX2-NEXT:    vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
3738; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
3739; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[18],zero,zero,zero,zero,ymm5[21],zero,ymm5[19],zero,zero,zero,zero,ymm5[22],zero,ymm5[20]
3740; AVX2-NEXT:    vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zero,zero,zero,ymm3[21],zero,ymm3[19],zero,zero,zero,zero,ymm3[22],zero,ymm3[20],zero
3741; AVX2-NEXT:    vpor %ymm1, %ymm7, %ymm1
3742; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
3743; AVX2-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u]
3744; AVX2-NEXT:    vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
3745; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[20],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22]
3746; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
3747; AVX2-NEXT:    vpshufb {{.*#+}} ymm7 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm13[18],zero,zero,zero,zero,ymm13[21],zero,ymm13[19],zero,zero,zero,zero,ymm13[22],zero
3748; AVX2-NEXT:    vpor %ymm1, %ymm7, %ymm1
3749; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
3750; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
3751; AVX2-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm12[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
3752; AVX2-NEXT:    vpshufd {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,5,7]
3753; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,3,2]
3754; AVX2-NEXT:    vmovdqa {{.*#+}} ymm8 = [u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255]
3755; AVX2-NEXT:    vpblendvb %ymm8, %ymm1, %ymm7, %ymm1
3756; AVX2-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0]
3757; AVX2-NEXT:    vpblendvb %ymm7, %ymm0, %ymm1, %ymm0
3758; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[23],zero,ymm4[27,20,21,26],zero,ymm4[24],zero,ymm4[26,27,26,27],zero,ymm4[25]
3759; AVX2-NEXT:    vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero,ymm6[27],zero
3760; AVX2-NEXT:    vpor %ymm1, %ymm7, %ymm1
3761; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
3762; AVX2-NEXT:    vpshufb {{.*#+}} ymm7 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm5[25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero,zero
3763; AVX2-NEXT:    vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27]
3764; AVX2-NEXT:    vpor %ymm7, %ymm8, %ymm7
3765; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3]
3766; AVX2-NEXT:    vmovdqa {{.*#+}} ymm8 = [u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0]
3767; AVX2-NEXT:    vpblendvb %ymm8, %ymm1, %ymm7, %ymm1
3768; AVX2-NEXT:    vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero
3769; AVX2-NEXT:    vpshufb {{.*#+}} ymm8 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm13[25],zero,ymm13[23],zero,zero,zero,zero,ymm13[26],zero,ymm13[24],zero,zero,zero
3770; AVX2-NEXT:    vpor %ymm7, %ymm8, %ymm7
3771; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3]
3772; AVX2-NEXT:    vpshufb {{.*#+}} ymm8 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
3773; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
3774; AVX2-NEXT:    vmovdqa {{.*#+}} ymm11 = [0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u]
3775; AVX2-NEXT:    vpblendvb %ymm11, %ymm7, %ymm8, %ymm7
3776; AVX2-NEXT:    vmovdqa {{.*#+}} ymm8 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255]
3777; AVX2-NEXT:    vpblendvb %ymm8, %ymm1, %ymm7, %ymm1
3778; AVX2-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero,zero,zero,ymm6[18]
3779; AVX2-NEXT:    vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[14],zero,zero,zero,zero,zero,zero,ymm4[15],zero,zero,zero,zero,zero,zero,ymm4[16],zero,zero,zero,zero,zero,zero,ymm4[17],zero,zero,zero,zero,zero,zero,ymm4[18],zero
3780; AVX2-NEXT:    vpor %ymm6, %ymm4, %ymm4
3781; AVX2-NEXT:    vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,ymm5[14],zero,zero,zero,zero,zero,zero,ymm5[15],zero,zero,zero,zero,zero,zero,ymm5[16],zero,zero,zero,zero,zero,zero,ymm5[17],zero,zero,zero,zero,zero
3782; AVX2-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero,zero
3783; AVX2-NEXT:    vpor %ymm5, %ymm3, %ymm3
3784; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255]
3785; AVX2-NEXT:    vpblendvb %ymm5, %ymm4, %ymm3, %ymm3
3786; AVX2-NEXT:    vpshufb {{.*#+}} ymm4 = zero,ymm13[1,2,3,0,1,14],zero,ymm13[0,1,0,1,14,15],zero,ymm13[15,16,17,18,19,16],zero,ymm13[30,31,16,17,16,17],zero,ymm13[31,30,31]
3787; AVX2-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[13],zero,zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero
3788; AVX2-NEXT:    vpor %ymm4, %ymm2, %ymm2
3789; AVX2-NEXT:    vpshufb {{.*#+}} ymm4 = ymm12[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31]
3790; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u]
3791; AVX2-NEXT:    vpblendvb %ymm5, %ymm2, %ymm4, %ymm2
3792; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255]
3793; AVX2-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
3794; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3795; AVX2-NEXT:    vmovdqa %ymm2, 96(%rax)
3796; AVX2-NEXT:    vmovdqa %ymm1, 160(%rax)
3797; AVX2-NEXT:    vmovdqa %ymm10, (%rax)
3798; AVX2-NEXT:    vmovdqa %ymm0, 128(%rax)
3799; AVX2-NEXT:    vmovdqa %ymm9, 32(%rax)
3800; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3801; AVX2-NEXT:    vmovaps %ymm0, 64(%rax)
3802; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3803; AVX2-NEXT:    vmovaps %ymm0, 192(%rax)
3804; AVX2-NEXT:    popq %rax
3805; AVX2-NEXT:    vzeroupper
3806; AVX2-NEXT:    retq
3807;
3808; AVX2-FP-LABEL: store_i8_stride7_vf32:
3809; AVX2-FP:       # %bb.0:
3810; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3811; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm1
3812; AVX2-FP-NEXT:    vmovdqa (%rsi), %ymm3
3813; AVX2-FP-NEXT:    vmovdqa (%rdx), %ymm0
3814; AVX2-FP-NEXT:    vmovdqa (%rcx), %ymm2
3815; AVX2-FP-NEXT:    vmovdqa (%rdx), %xmm9
3816; AVX2-FP-NEXT:    vmovdqa (%rcx), %xmm10
3817; AVX2-FP-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
3818; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
3819; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1]
3820; AVX2-FP-NEXT:    vmovdqa (%rdi), %xmm13
3821; AVX2-FP-NEXT:    vmovdqa (%rsi), %xmm15
3822; AVX2-FP-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15]
3823; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
3824; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1]
3825; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u]
3826; AVX2-FP-NEXT:    vpblendvb %ymm6, %ymm4, %ymm5, %ymm4
3827; AVX2-FP-NEXT:    vmovdqa (%rax), %xmm11
3828; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm11[8,9,10,11,8,9,10,11,10,11,12,13,10,11,12,13]
3829; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1]
3830; AVX2-FP-NEXT:    vmovdqa (%r9), %xmm12
3831; AVX2-FP-NEXT:    vmovdqa (%r8), %xmm14
3832; AVX2-FP-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15]
3833; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
3834; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1]
3835; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm7 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255]
3836; AVX2-FP-NEXT:    vpblendvb %ymm7, %ymm6, %ymm5, %ymm5
3837; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm6 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0]
3838; AVX2-FP-NEXT:    vpblendvb %ymm6, %ymm4, %ymm5, %ymm4
3839; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm15[u],zero,xmm15[7],zero,xmm15[5,u,u,u],zero,xmm15[8],zero,xmm15[6,u,u,u],zero
3840; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm13[u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero,xmm13[u,u,u,9]
3841; AVX2-FP-NEXT:    vpor %xmm5, %xmm6, %xmm5
3842; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1]
3843; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm10[u,u,u],zero,xmm10[7],zero,xmm10[5,u,u,u],zero,xmm10[8],zero,xmm10[6,u,u]
3844; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,7],zero,xmm9[5],zero,xmm9[u,u,u,8],zero,xmm9[6],zero,xmm9[u,u]
3845; AVX2-FP-NEXT:    vpor %xmm6, %xmm7, %xmm6
3846; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1]
3847; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm7 = [u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255]
3848; AVX2-FP-NEXT:    vpblendvb %ymm7, %ymm5, %ymm6, %ymm5
3849; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm11[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
3850; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,1,0]
3851; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,xmm12[4,u,u,u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6]
3852; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm14[4],zero,xmm14[u,u,u,7],zero,xmm14[5],zero,xmm14[u,u,u,8],zero,xmm14[6],zero
3853; AVX2-FP-NEXT:    vpor %xmm7, %xmm8, %xmm7
3854; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1]
3855; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm8 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u]
3856; AVX2-FP-NEXT:    vpblendvb %ymm8, %ymm7, %ymm6, %ymm7
3857; AVX2-FP-NEXT:    vmovdqa (%r8), %ymm6
3858; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm8 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255]
3859; AVX2-FP-NEXT:    vpblendvb %ymm8, %ymm5, %ymm7, %ymm5
3860; AVX2-FP-NEXT:    vmovdqa (%r9), %ymm8
3861; AVX2-FP-NEXT:    vmovdqa (%rax), %ymm7
3862; AVX2-FP-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3],xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7]
3863; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
3864; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1]
3865; AVX2-FP-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
3866; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
3867; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1]
3868; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm10 = [255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0]
3869; AVX2-FP-NEXT:    vpblendvb %ymm10, %ymm13, %ymm9, %ymm9
3870; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm11[2,3,2,3,0,1,0,1,8,9,10,11,2,3,2,3]
3871; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,1,0]
3872; AVX2-FP-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7]
3873; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
3874; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1]
3875; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm12 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u]
3876; AVX2-FP-NEXT:    vpblendvb %ymm12, %ymm11, %ymm10, %ymm10
3877; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
3878; AVX2-FP-NEXT:    vpblendvb %ymm11, %ymm9, %ymm10, %ymm9
3879; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[17,18,19,30],zero,ymm0[28],zero,ymm0[28,29,30,31],zero,ymm0[29],zero,ymm0[31]
3880; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero
3881; AVX2-FP-NEXT:    vpor %ymm10, %ymm11, %ymm10
3882; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3]
3883; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero,zero
3884; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero,zero,zero
3885; AVX2-FP-NEXT:    vpor %ymm11, %ymm12, %ymm11
3886; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3]
3887; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm12 = [255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u]
3888; AVX2-FP-NEXT:    vpblendvb %ymm12, %ymm10, %ymm11, %ymm10
3889; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[27],zero,ymm6[27,28,29,30],zero,ymm6[28],zero,ymm6[26,27,30,31],zero,ymm6[29]
3890; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm8[27],zero,zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero
3891; AVX2-FP-NEXT:    vpor %ymm11, %ymm12, %ymm11
3892; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3]
3893; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm12 = [255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u]
3894; AVX2-FP-NEXT:    vpblendvb %ymm12, %ymm10, %ymm11, %ymm10
3895; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
3896; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3]
3897; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm12 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0]
3898; AVX2-FP-NEXT:    vpblendvb %ymm12, %ymm10, %ymm11, %ymm10
3899; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[23],zero,ymm1[27,20,21,26],zero,ymm1[24],zero,ymm1[26,27,26,27],zero,ymm1[25]
3900; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27],zero
3901; AVX2-FP-NEXT:    vpor %ymm11, %ymm12, %ymm11
3902; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3]
3903; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero
3904; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm13 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27]
3905; AVX2-FP-NEXT:    vpor %ymm12, %ymm13, %ymm12
3906; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3]
3907; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm13 = [u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0]
3908; AVX2-FP-NEXT:    vpblendvb %ymm13, %ymm11, %ymm12, %ymm11
3909; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[25],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero
3910; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm13 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero
3911; AVX2-FP-NEXT:    vpor %ymm12, %ymm13, %ymm12
3912; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3]
3913; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm13 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
3914; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3]
3915; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm14 = [0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u]
3916; AVX2-FP-NEXT:    vpblendvb %ymm14, %ymm12, %ymm13, %ymm12
3917; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm13 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255]
3918; AVX2-FP-NEXT:    vpblendvb %ymm13, %ymm11, %ymm12, %ymm11
3919; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22],zero,ymm2[20]
3920; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm13 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22],zero,ymm0[20],zero
3921; AVX2-FP-NEXT:    vpor %ymm12, %ymm13, %ymm12
3922; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3]
3923; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm13 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[21],zero,ymm3[19],zero,zero,zero,zero,ymm3[22],zero,ymm3[20],zero,zero
3924; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm14 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20],zero,zero,zero
3925; AVX2-FP-NEXT:    vpor %ymm13, %ymm14, %ymm13
3926; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3]
3927; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm14 = [255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u]
3928; AVX2-FP-NEXT:    vpblendvb %ymm14, %ymm12, %ymm13, %ymm12
3929; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm13 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm8[20],zero,ymm8[18],zero,zero,zero,zero,ymm8[21],zero,ymm8[19],zero,zero,zero,zero,ymm8[22]
3930; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm14 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm6[18],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero
3931; AVX2-FP-NEXT:    vpor %ymm13, %ymm14, %ymm13
3932; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3]
3933; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm14 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31]
3934; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,2,3,2]
3935; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm15 = [u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255]
3936; AVX2-FP-NEXT:    vpblendvb %ymm15, %ymm13, %ymm14, %ymm13
3937; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm14 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0]
3938; AVX2-FP-NEXT:    vpblendvb %ymm14, %ymm12, %ymm13, %ymm12
3939; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero,zero,ymm3[18]
3940; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,zero,zero,ymm1[18],zero
3941; AVX2-FP-NEXT:    vpor %ymm3, %ymm1, %ymm1
3942; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,zero
3943; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,zero,zero
3944; AVX2-FP-NEXT:    vpor %ymm2, %ymm0, %ymm0
3945; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255]
3946; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
3947; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,ymm6[1,2,3,0,1,14],zero,ymm6[0,1,0,1,14,15],zero,ymm6[15,16,17,18,19,16],zero,ymm6[30,31,16,17,16,17],zero,ymm6[31,30,31]
3948; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm8[13],zero,zero,zero,zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero
3949; AVX2-FP-NEXT:    vpor %ymm1, %ymm2, %ymm1
3950; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm7[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31]
3951; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u]
3952; AVX2-FP-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
3953; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255]
3954; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
3955; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3956; AVX2-FP-NEXT:    vmovdqa %ymm0, 96(%rax)
3957; AVX2-FP-NEXT:    vmovdqa %ymm12, 128(%rax)
3958; AVX2-FP-NEXT:    vmovdqa %ymm11, 160(%rax)
3959; AVX2-FP-NEXT:    vmovdqa %ymm9, (%rax)
3960; AVX2-FP-NEXT:    vmovdqa %ymm10, 192(%rax)
3961; AVX2-FP-NEXT:    vmovdqa %ymm5, 32(%rax)
3962; AVX2-FP-NEXT:    vmovdqa %ymm4, 64(%rax)
3963; AVX2-FP-NEXT:    vzeroupper
3964; AVX2-FP-NEXT:    retq
3965;
3966; AVX2-FCP-LABEL: store_i8_stride7_vf32:
3967; AVX2-FCP:       # %bb.0:
3968; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3969; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm1
3970; AVX2-FCP-NEXT:    vmovdqa (%rsi), %ymm3
3971; AVX2-FCP-NEXT:    vmovdqa (%rdx), %ymm0
3972; AVX2-FCP-NEXT:    vmovdqa (%rcx), %ymm2
3973; AVX2-FCP-NEXT:    vmovdqa (%rdx), %xmm9
3974; AVX2-FCP-NEXT:    vmovdqa (%rcx), %xmm10
3975; AVX2-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
3976; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
3977; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1]
3978; AVX2-FCP-NEXT:    vmovdqa (%rdi), %xmm13
3979; AVX2-FCP-NEXT:    vmovdqa (%rsi), %xmm15
3980; AVX2-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15]
3981; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
3982; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1]
3983; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u]
3984; AVX2-FCP-NEXT:    vpblendvb %ymm6, %ymm4, %ymm5, %ymm4
3985; AVX2-FCP-NEXT:    vmovdqa (%rax), %xmm11
3986; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm11[0,1,2,3,4,5,5,6]
3987; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [2,2,3,3,2,2,3,3]
3988; AVX2-FCP-NEXT:    # ymm6 = mem[0,1,0,1]
3989; AVX2-FCP-NEXT:    vpermd %ymm5, %ymm6, %ymm5
3990; AVX2-FCP-NEXT:    vmovdqa (%r9), %xmm12
3991; AVX2-FCP-NEXT:    vmovdqa (%r8), %xmm14
3992; AVX2-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15]
3993; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
3994; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1]
3995; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255]
3996; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm6, %ymm5, %ymm5
3997; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0]
3998; AVX2-FCP-NEXT:    vpblendvb %ymm6, %ymm4, %ymm5, %ymm4
3999; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm15[u],zero,xmm15[7],zero,xmm15[5,u,u,u],zero,xmm15[8],zero,xmm15[6,u,u,u],zero
4000; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm13[u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero,xmm13[u,u,u,9]
4001; AVX2-FCP-NEXT:    vpor %xmm5, %xmm6, %xmm5
4002; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1]
4003; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm10[u,u,u],zero,xmm10[7],zero,xmm10[5,u,u,u],zero,xmm10[8],zero,xmm10[6,u,u]
4004; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,7],zero,xmm9[5],zero,xmm9[u,u,u,8],zero,xmm9[6],zero,xmm9[u,u]
4005; AVX2-FCP-NEXT:    vpor %xmm6, %xmm7, %xmm6
4006; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1]
4007; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255]
4008; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm5, %ymm6, %ymm5
4009; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm11[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
4010; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,1,0]
4011; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,xmm12[4,u,u,u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6]
4012; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm14[4],zero,xmm14[u,u,u,7],zero,xmm14[5],zero,xmm14[u,u,u,8],zero,xmm14[6],zero
4013; AVX2-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm7
4014; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1]
4015; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u]
4016; AVX2-FCP-NEXT:    vpblendvb %ymm8, %ymm7, %ymm6, %ymm7
4017; AVX2-FCP-NEXT:    vmovdqa (%r8), %ymm6
4018; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255]
4019; AVX2-FCP-NEXT:    vpblendvb %ymm8, %ymm5, %ymm7, %ymm5
4020; AVX2-FCP-NEXT:    vmovdqa (%r9), %ymm8
4021; AVX2-FCP-NEXT:    vmovdqa (%rax), %ymm7
4022; AVX2-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3],xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7]
4023; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
4024; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1]
4025; AVX2-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
4026; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
4027; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1]
4028; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0]
4029; AVX2-FCP-NEXT:    vpblendvb %ymm10, %ymm13, %ymm9, %ymm9
4030; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm11[1,1,0,0,4,5,6,7]
4031; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [0,1,0,1,2,0,0,1]
4032; AVX2-FCP-NEXT:    vpermd %ymm10, %ymm11, %ymm10
4033; AVX2-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7]
4034; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
4035; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1]
4036; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u]
4037; AVX2-FCP-NEXT:    vpblendvb %ymm12, %ymm11, %ymm10, %ymm10
4038; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
4039; AVX2-FCP-NEXT:    vpblendvb %ymm11, %ymm9, %ymm10, %ymm9
4040; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22],zero,ymm2[20]
4041; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22],zero,ymm0[20],zero
4042; AVX2-FCP-NEXT:    vpor %ymm10, %ymm11, %ymm10
4043; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3]
4044; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[21],zero,ymm3[19],zero,zero,zero,zero,ymm3[22],zero,ymm3[20],zero,zero
4045; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20],zero,zero,zero
4046; AVX2-FCP-NEXT:    vpor %ymm11, %ymm12, %ymm11
4047; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3]
4048; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u]
4049; AVX2-FCP-NEXT:    vpblendvb %ymm12, %ymm10, %ymm11, %ymm10
4050; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} ymm11 = ymm7[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
4051; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [4,5,4,5,5,7,4,5]
4052; AVX2-FCP-NEXT:    vpermd %ymm11, %ymm12, %ymm11
4053; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm8[20],zero,ymm8[18],zero,zero,zero,zero,ymm8[21],zero,ymm8[19],zero,zero,zero,zero,ymm8[22]
4054; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm6[18],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero
4055; AVX2-FCP-NEXT:    vpor %ymm12, %ymm13, %ymm12
4056; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3]
4057; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255]
4058; AVX2-FCP-NEXT:    vpblendvb %ymm13, %ymm12, %ymm11, %ymm11
4059; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0]
4060; AVX2-FCP-NEXT:    vpblendvb %ymm12, %ymm10, %ymm11, %ymm10
4061; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[17,18,19,30],zero,ymm0[28],zero,ymm0[28,29,30,31],zero,ymm0[29],zero,ymm0[31]
4062; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero
4063; AVX2-FCP-NEXT:    vpor %ymm11, %ymm12, %ymm11
4064; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3]
4065; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero,zero
4066; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero,zero,zero
4067; AVX2-FCP-NEXT:    vpor %ymm12, %ymm13, %ymm12
4068; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3]
4069; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u]
4070; AVX2-FCP-NEXT:    vpblendvb %ymm13, %ymm11, %ymm12, %ymm11
4071; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[27],zero,ymm6[27,28,29,30],zero,ymm6[28],zero,ymm6[26,27,30,31],zero,ymm6[29]
4072; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm8[27],zero,zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero
4073; AVX2-FCP-NEXT:    vpor %ymm12, %ymm13, %ymm12
4074; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3]
4075; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u]
4076; AVX2-FCP-NEXT:    vpblendvb %ymm13, %ymm11, %ymm12, %ymm11
4077; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
4078; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3]
4079; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0]
4080; AVX2-FCP-NEXT:    vpblendvb %ymm13, %ymm11, %ymm12, %ymm11
4081; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[23],zero,ymm1[27,20,21,26],zero,ymm1[24],zero,ymm1[26,27,26,27],zero,ymm1[25]
4082; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27],zero
4083; AVX2-FCP-NEXT:    vpor %ymm12, %ymm13, %ymm12
4084; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3]
4085; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero
4086; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27]
4087; AVX2-FCP-NEXT:    vpor %ymm13, %ymm14, %ymm13
4088; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3]
4089; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0]
4090; AVX2-FCP-NEXT:    vpblendvb %ymm14, %ymm12, %ymm13, %ymm12
4091; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[25],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero
4092; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero
4093; AVX2-FCP-NEXT:    vpor %ymm13, %ymm14, %ymm13
4094; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3]
4095; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
4096; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3]
4097; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm15 = [0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u]
4098; AVX2-FCP-NEXT:    vpblendvb %ymm15, %ymm13, %ymm14, %ymm13
4099; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255]
4100; AVX2-FCP-NEXT:    vpblendvb %ymm14, %ymm12, %ymm13, %ymm12
4101; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero,zero,ymm3[18]
4102; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,zero,zero,ymm1[18],zero
4103; AVX2-FCP-NEXT:    vpor %ymm3, %ymm1, %ymm1
4104; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,zero
4105; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,zero,zero
4106; AVX2-FCP-NEXT:    vpor %ymm2, %ymm0, %ymm0
4107; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255]
4108; AVX2-FCP-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
4109; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,ymm6[1,2,3,0,1,14],zero,ymm6[0,1,0,1,14,15],zero,ymm6[15,16,17,18,19,16],zero,ymm6[30,31,16,17,16,17],zero,ymm6[31,30,31]
4110; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm8[13],zero,zero,zero,zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero
4111; AVX2-FCP-NEXT:    vpor %ymm1, %ymm2, %ymm1
4112; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm7[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31]
4113; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u]
4114; AVX2-FCP-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
4115; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255]
4116; AVX2-FCP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
4117; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
4118; AVX2-FCP-NEXT:    vmovdqa %ymm0, 96(%rax)
4119; AVX2-FCP-NEXT:    vmovdqa %ymm10, 128(%rax)
4120; AVX2-FCP-NEXT:    vmovdqa %ymm12, 160(%rax)
4121; AVX2-FCP-NEXT:    vmovdqa %ymm9, (%rax)
4122; AVX2-FCP-NEXT:    vmovdqa %ymm11, 192(%rax)
4123; AVX2-FCP-NEXT:    vmovdqa %ymm5, 32(%rax)
4124; AVX2-FCP-NEXT:    vmovdqa %ymm4, 64(%rax)
4125; AVX2-FCP-NEXT:    vzeroupper
4126; AVX2-FCP-NEXT:    retq
4127;
4128; AVX512-LABEL: store_i8_stride7_vf32:
4129; AVX512:       # %bb.0:
4130; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
4131; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
4132; AVX512-NEXT:    vmovdqa (%rdi), %ymm3
4133; AVX512-NEXT:    vmovdqa (%rsi), %ymm4
4134; AVX512-NEXT:    vmovdqa (%rdx), %ymm5
4135; AVX512-NEXT:    vmovdqa (%rcx), %ymm6
4136; AVX512-NEXT:    vmovdqa (%r8), %ymm1
4137; AVX512-NEXT:    vmovdqa (%r9), %ymm2
4138; AVX512-NEXT:    vmovdqa (%r10), %ymm0
4139; AVX512-NEXT:    vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[18],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero,ymm6[20]
4140; AVX512-NEXT:    vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27],zero,ymm3[25]
4141; AVX512-NEXT:    vinserti64x4 $1, %ymm8, %zmm7, %zmm7
4142; AVX512-NEXT:    vpermq {{.*#+}} zmm7 = zmm7[2,3,2,3,6,7,6,7]
4143; AVX512-NEXT:    vpshufb {{.*#+}} ymm8 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm5[18,19,20,21],zero,ymm5[19],zero,ymm5[25,26,27,22],zero,ymm5[20],zero
4144; AVX512-NEXT:    vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm4[23,u,u,u],zero,ymm4[26],zero,ymm4[24,u,u,u],zero,ymm4[27],zero
4145; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm8, %zmm8
4146; AVX512-NEXT:    vpermq {{.*#+}} zmm8 = zmm8[2,3,2,3,6,7,6,7]
4147; AVX512-NEXT:    vporq %zmm7, %zmm8, %zmm7
4148; AVX512-NEXT:    vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[21],zero,ymm4[19],zero,zero,zero,zero,ymm4[22],zero,ymm4[20],zero,zero
4149; AVX512-NEXT:    vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero
4150; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm8, %zmm8
4151; AVX512-NEXT:    vpermq {{.*#+}} zmm8 = zmm8[2,3,2,3,6,7,6,7]
4152; AVX512-NEXT:    vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm3[19],zero,ymm3[21,20,21,22],zero,ymm3[20],zero,ymm3[22,23]
4153; AVX512-NEXT:    vpshufb {{.*#+}} ymm10 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm5[23],zero,ymm5[21,22,23,26],zero,ymm5[24],zero,ymm5[28,29,26,27]
4154; AVX512-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm9
4155; AVX512-NEXT:    vpermq {{.*#+}} zmm9 = zmm9[2,3,2,3,6,7,6,7]
4156; AVX512-NEXT:    vporq %zmm8, %zmm9, %zmm8
4157; AVX512-NEXT:    vpternlogd {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm7))
4158; AVX512-NEXT:    vpshufb {{.*#+}} ymm7 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,u,u,u,u,26,27,24,25]
4159; AVX512-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
4160; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm18
4161; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm10 = [0,5,4,0,5,0,4,0,20,21,0,23,0,21,0,23]
4162; AVX512-NEXT:    vpermi2d %zmm7, %zmm9, %zmm10
4163; AVX512-NEXT:    vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[20],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22]
4164; AVX512-NEXT:    vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero
4165; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm7, %zmm7
4166; AVX512-NEXT:    vpermq {{.*#+}} zmm7 = zmm7[2,3,2,3,6,7,6,7]
4167; AVX512-NEXT:    vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm1[18],zero,ymm1[20,21,20,21],zero,ymm1[19],zero,ymm1[19,20,21,22],zero
4168; AVX512-NEXT:    vpshufb {{.*#+}} ymm11 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm1[23],zero,ymm1[23,24,25,26],zero,ymm1[24],zero,ymm1[30,31]
4169; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm19
4170; AVX512-NEXT:    vinserti64x4 $1, %ymm11, %zmm9, %zmm9
4171; AVX512-NEXT:    vpermq {{.*#+}} zmm9 = zmm9[2,3,2,3,6,7,6,7]
4172; AVX512-NEXT:    vporq %zmm7, %zmm9, %zmm7
4173; AVX512-NEXT:    vpternlogd {{.*#+}} zmm7 = zmm10 ^ (mem & (zmm7 ^ zmm10))
4174; AVX512-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm8))
4175; AVX512-NEXT:    vmovdqa (%rsi), %xmm9
4176; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = xmm9[u],zero,xmm9[7],zero,xmm9[5,u,u,u],zero,xmm9[8],zero,xmm9[6,u,u,u],zero
4177; AVX512-NEXT:    vmovdqa (%rdi), %xmm10
4178; AVX512-NEXT:    vpshufb {{.*#+}} xmm11 = xmm10[u,7],zero,xmm10[5],zero,xmm10[u,u,u,8],zero,xmm10[6],zero,xmm10[u,u,u,9]
4179; AVX512-NEXT:    vpor %xmm8, %xmm11, %xmm8
4180; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
4181; AVX512-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
4182; AVX512-NEXT:    vinserti32x4 $2, %xmm8, %zmm11, %zmm8
4183; AVX512-NEXT:    vpermq {{.*#+}} zmm8 = zmm8[0,1,0,1,4,5,4,5]
4184; AVX512-NEXT:    vmovdqa (%rcx), %xmm14
4185; AVX512-NEXT:    vpshufb {{.*#+}} xmm11 = xmm14[u,u,u],zero,xmm14[7],zero,xmm14[5,u,u,u],zero,xmm14[8],zero,xmm14[6,u,u]
4186; AVX512-NEXT:    vmovdqa (%rdx), %xmm15
4187; AVX512-NEXT:    vpshufb {{.*#+}} xmm12 = xmm15[u,u,u,7],zero,xmm15[5],zero,xmm15[u,u,u,8],zero,xmm15[6],zero,xmm15[u,u]
4188; AVX512-NEXT:    vpor %xmm11, %xmm12, %xmm11
4189; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
4190; AVX512-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
4191; AVX512-NEXT:    vinserti32x4 $2, %xmm11, %zmm12, %zmm11
4192; AVX512-NEXT:    vpermq {{.*#+}} zmm16 = zmm11[0,1,0,1,4,5,4,5]
4193; AVX512-NEXT:    vpternlogd {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm8))
4194; AVX512-NEXT:    vmovdqa (%r9), %xmm11
4195; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = zero,xmm11[4,u,u,u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6]
4196; AVX512-NEXT:    vmovdqa (%r8), %xmm12
4197; AVX512-NEXT:    vpshufb {{.*#+}} xmm13 = xmm12[4],zero,xmm12[u,u,u,7],zero,xmm12[5],zero,xmm12[u,u,u,8],zero,xmm12[6],zero
4198; AVX512-NEXT:    vpor %xmm8, %xmm13, %xmm8
4199; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
4200; AVX512-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
4201; AVX512-NEXT:    vinserti32x4 $2, %xmm8, %zmm13, %zmm8
4202; AVX512-NEXT:    vpermq {{.*#+}} zmm17 = zmm8[0,1,0,1,4,5,4,5]
4203; AVX512-NEXT:    vmovdqa (%r10), %xmm13
4204; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = xmm13[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
4205; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm13[1,1,0,0,4,5,6,7]
4206; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
4207; AVX512-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm0
4208; AVX512-NEXT:    vpermq {{.*#+}} zmm8 = zmm0[0,0,1,0,4,4,5,4]
4209; AVX512-NEXT:    vpternlogd {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm17))
4210; AVX512-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm16))
4211; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm4[14,u,u],zero,zero,zero,zero,ymm4[15,u,u],zero,zero,zero,zero,ymm4[16,u,u],zero,zero,zero,zero,ymm4[17,u,u],zero,zero,zero,zero,ymm4[18]
4212; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm3[0,1,14],zero,ymm3[u,u,0,1,14,15],zero,ymm3[u,u,13,2,3,16],zero,ymm3[u,u,28,29,16,17],zero,ymm3[u,u,19,28,29,18],zero
4213; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
4214; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm14[8],xmm15[8],xmm14[9],xmm15[9],xmm14[10],xmm15[10],xmm14[11],xmm15[11],xmm14[12],xmm15[12],xmm14[13],xmm15[13],xmm14[14],xmm15[14],xmm14[15],xmm15[15]
4215; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
4216; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
4217; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
4218; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u],zero,ymm6[14,u,u,u,u,u],zero,ymm6[15,u,u,u,u,u],zero,ymm6[16,u,u,u,u,u],zero,ymm6[17,u,u,u,u,u]
4219; AVX512-NEXT:    vpshufb {{.*#+}} ymm14 = ymm5[u,u,u,u,14],zero,ymm5[u,u,u,u,u,15],zero,ymm5[u,u,u,u,u,16],zero,ymm5[u,u,u,u,u,17],zero,ymm5[u,u,u,u,u]
4220; AVX512-NEXT:    vpor %ymm1, %ymm14, %ymm1
4221; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15]
4222; AVX512-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
4223; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1]
4224; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm9, %zmm1
4225; AVX512-NEXT:    vpternlogd {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm0))
4226; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm14
4227; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = zero,ymm14[u],zero,zero,zero,zero,ymm14[14],zero,ymm14[u],zero,zero,zero,zero,ymm14[15],zero,ymm14[u],zero,zero,zero,zero,ymm14[16],zero,ymm14[u],zero,zero,zero,zero,ymm14[17],zero,ymm14[u],zero,zero
4228; AVX512-NEXT:    vpshufb {{.*#+}} ymm9 = ymm2[13,u,u,u,u,u],zero,ymm2[14,u,u,u,u,u],zero,ymm2[15,u,u,u,u,u],zero,ymm2[16,u,u,u,u,u],zero,ymm2[17,u,u,u]
4229; AVX512-NEXT:    vpor %ymm0, %ymm9, %ymm0
4230; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15]
4231; AVX512-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
4232; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1]
4233; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm9, %zmm0
4234; AVX512-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm13[0,1,2,3,4,5,5,6]
4235; AVX512-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3]
4236; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1]
4237; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9
4238; AVX512-NEXT:    vmovdqa64 %ymm18, %ymm11
4239; AVX512-NEXT:    vpshufb {{.*#+}} ymm10 = zero,ymm11[13,u,u,u,u],zero,zero,ymm11[14,u,u,u,u],zero,zero,ymm11[15,u,u,u,u],zero,zero,ymm11[16,u,u,u,u],zero,zero,ymm11[17,u,u]
4240; AVX512-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm9
4241; AVX512-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 | (zmm0 & mem)
4242; AVX512-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm1))
4243; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29],zero,zero
4244; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u],zero,ymm6[30],zero,ymm6[28,u,u,u],zero,ymm6[31],zero,ymm6[29,u]
4245; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
4246; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
4247; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm4[30],zero,ymm4[28],zero,zero,zero,zero,ymm4[31],zero,ymm4[29],zero,zero,zero
4248; AVX512-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm3[28],zero,ymm3[30,31,30,31],zero,ymm3[29],zero,ymm3[31,28,29]
4249; AVX512-NEXT:    vpor %ymm1, %ymm3, %ymm1
4250; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
4251; AVX512-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
4252; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm14[27],zero,zero,zero,zero,ymm14[30],zero,ymm14[28],zero,zero,zero,zero,ymm14[31],zero,ymm14[29]
4253; AVX512-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm2[27,u,u,u],zero,ymm2[30],zero,ymm2[28,u,u,u],zero,ymm2[31],zero
4254; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
4255; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
4256; AVX512-NEXT:    vpshufb {{.*#+}} ymm2 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
4257; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3]
4258; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0))
4259; AVX512-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1))
4260; AVX512-NEXT:    vmovdqa %ymm2, 192(%rax)
4261; AVX512-NEXT:    vmovdqa64 %zmm8, (%rax)
4262; AVX512-NEXT:    vmovdqa64 %zmm7, 128(%rax)
4263; AVX512-NEXT:    vmovdqa64 %zmm9, 64(%rax)
4264; AVX512-NEXT:    vzeroupper
4265; AVX512-NEXT:    retq
4266;
4267; AVX512-FCP-LABEL: store_i8_stride7_vf32:
4268; AVX512-FCP:       # %bb.0:
4269; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
4270; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
4271; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm3
4272; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm4
4273; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm5
4274; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm6
4275; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm1
4276; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm2
4277; AVX512-FCP-NEXT:    vmovdqa64 (%r10), %ymm17
4278; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm8
4279; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm8[u],zero,xmm8[7],zero,xmm8[5,u,u,u],zero,xmm8[8],zero,xmm8[6,u,u,u],zero
4280; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm9
4281; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm9[u,7],zero,xmm9[5],zero,xmm9[u,u,u,8],zero,xmm9[6],zero,xmm9[u,u,u,9]
4282; AVX512-FCP-NEXT:    vpor %xmm7, %xmm10, %xmm7
4283; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
4284; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
4285; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm7, %zmm10, %zmm7
4286; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm7 = zmm7[0,1,0,1,4,5,4,5]
4287; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm11
4288; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm11[u,u,u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6,u,u]
4289; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm12
4290; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,7],zero,xmm12[5],zero,xmm12[u,u,u,8],zero,xmm12[6],zero,xmm12[u,u]
4291; AVX512-FCP-NEXT:    vpor %xmm10, %xmm13, %xmm10
4292; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
4293; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
4294; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm10, %zmm13, %zmm10
4295; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm15 = zmm10[0,1,0,1,4,5,4,5]
4296; AVX512-FCP-NEXT:    vpternlogd {{.*#+}} zmm15 = zmm15 ^ (mem & (zmm15 ^ zmm7))
4297; AVX512-FCP-NEXT:    vmovdqa (%r10), %xmm10
4298; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm10[1,1,0,0,4,5,6,7]
4299; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [0,1,0,1,0,0,0,0]
4300; AVX512-FCP-NEXT:    vpermd %ymm7, %ymm13, %ymm7
4301; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm10[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
4302; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0]
4303; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm7, %zmm16
4304; AVX512-FCP-NEXT:    vmovdqa (%r9), %xmm13
4305; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,xmm13[4,u,u,u],zero,xmm13[7],zero,xmm13[5,u,u,u],zero,xmm13[8],zero,xmm13[6]
4306; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm14
4307; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm14[4],zero,xmm14[u,u,u,7],zero,xmm14[5],zero,xmm14[u,u,u,8],zero,xmm14[6],zero
4308; AVX512-FCP-NEXT:    vpor %xmm7, %xmm0, %xmm0
4309; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
4310; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
4311; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm7, %zmm0
4312; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm7 = zmm0[0,1,0,1,4,5,4,5]
4313; AVX512-FCP-NEXT:    vpternlogd {{.*#+}} zmm7 = zmm16 ^ (mem & (zmm7 ^ zmm16))
4314; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm15))
4315; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm4[14,u,u],zero,zero,zero,zero,ymm4[15,u,u],zero,zero,zero,zero,ymm4[16,u,u],zero,zero,zero,zero,ymm4[17,u,u],zero,zero,zero,zero,ymm4[18]
4316; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm3[0,1,14],zero,ymm3[u,u,0,1,14,15],zero,ymm3[u,u,13,2,3,16],zero,ymm3[u,u,28,29,16,17],zero,ymm3[u,u,19,28,29,18],zero
4317; AVX512-FCP-NEXT:    vpor %ymm0, %ymm15, %ymm0
4318; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm11 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15]
4319; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
4320; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1]
4321; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm11, %zmm0
4322; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm6[u,u,u,u],zero,ymm6[14,u,u,u,u,u],zero,ymm6[15,u,u,u,u,u],zero,ymm6[16,u,u,u,u,u],zero,ymm6[17,u,u,u,u,u]
4323; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm5[u,u,u,u,14],zero,ymm5[u,u,u,u,u,15],zero,ymm5[u,u,u,u,u,16],zero,ymm5[u,u,u,u,u,17],zero,ymm5[u,u,u,u,u]
4324; AVX512-FCP-NEXT:    vpor %ymm11, %ymm12, %ymm11
4325; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15]
4326; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
4327; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1]
4328; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm8, %zmm9
4329; AVX512-FCP-NEXT:    vpternlogd {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm0))
4330; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,ymm1[u],zero,zero,zero,zero,ymm1[14],zero,ymm1[u],zero,zero,zero,zero,ymm1[15],zero,ymm1[u],zero,zero,zero,zero,ymm1[16],zero,ymm1[u],zero,zero,zero,zero,ymm1[17],zero,ymm1[u],zero,zero
4331; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm2[13,u,u,u,u,u],zero,ymm2[14,u,u,u,u,u],zero,ymm2[15,u,u,u,u,u],zero,ymm2[16,u,u,u,u,u],zero,ymm2[17,u,u,u]
4332; AVX512-FCP-NEXT:    vpor %ymm0, %ymm8, %ymm0
4333; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15]
4334; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
4335; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1]
4336; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm8, %zmm0
4337; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm10[0,1,2,3,4,5,5,6]
4338; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [2,2,3,3,2,2,3,3]
4339; AVX512-FCP-NEXT:    # ymm10 = mem[0,1,0,1]
4340; AVX512-FCP-NEXT:    vpermd %ymm8, %ymm10, %ymm8
4341; AVX512-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm8
4342; AVX512-FCP-NEXT:    vmovdqa64 %ymm17, %ymm12
4343; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = zero,ymm12[13,u,u,u,u],zero,zero,ymm12[14,u,u,u,u],zero,zero,ymm12[15,u,u,u,u],zero,zero,ymm12[16,u,u,u,u],zero,zero,ymm12[17,u,u]
4344; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm8, %zmm8
4345; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm0 & mem)
4346; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm9))
4347; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[18],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero,ymm6[20]
4348; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27],zero,ymm3[25]
4349; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm0
4350; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7]
4351; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm5[18,19,20,21],zero,ymm5[19],zero,ymm5[25,26,27,22],zero,ymm5[20],zero
4352; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm4[23,u,u,u],zero,ymm4[26],zero,ymm4[24,u,u,u],zero,ymm4[27],zero
4353; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm9
4354; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm9 = zmm9[2,3,2,3,6,7,6,7]
4355; AVX512-FCP-NEXT:    vporq %zmm0, %zmm9, %zmm0
4356; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[21],zero,ymm4[19],zero,zero,zero,zero,ymm4[22],zero,ymm4[20],zero,zero
4357; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero
4358; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm9
4359; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm9 = zmm9[2,3,2,3,6,7,6,7]
4360; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm3[19],zero,ymm3[21,20,21,22],zero,ymm3[20],zero,ymm3[22,23]
4361; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm5[23],zero,ymm5[21,22,23,26],zero,ymm5[24],zero,ymm5[28,29,26,27]
4362; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm10, %zmm10
4363; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm10 = zmm10[2,3,2,3,6,7,6,7]
4364; AVX512-FCP-NEXT:    vporq %zmm9, %zmm10, %zmm9
4365; AVX512-FCP-NEXT:    vpternlogd {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm0))
4366; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[20],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22]
4367; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero
4368; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm0, %zmm0
4369; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7]
4370; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm1[18],zero,ymm1[20,21,20,21],zero,ymm1[19],zero,ymm1[19,20,21,22],zero
4371; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm1[23],zero,ymm1[23,24,25,26],zero,ymm1[24],zero,ymm1[30,31]
4372; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm10, %zmm10
4373; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm10 = zmm10[2,3,2,3,6,7,6,7]
4374; AVX512-FCP-NEXT:    vporq %zmm0, %zmm10, %zmm0
4375; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm12[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
4376; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [0,5,4,0,5,0,4,0]
4377; AVX512-FCP-NEXT:    vpermd %ymm10, %ymm11, %ymm10
4378; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
4379; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3]
4380; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm10, %zmm10
4381; AVX512-FCP-NEXT:    vpternlogd {{.*#+}} zmm10 = zmm10 ^ (mem & (zmm10 ^ zmm0))
4382; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm10 ^ (mem & (zmm10 ^ zmm9))
4383; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29],zero,zero
4384; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u],zero,ymm6[30],zero,ymm6[28,u,u,u],zero,ymm6[31],zero,ymm6[29,u]
4385; AVX512-FCP-NEXT:    vpor %ymm0, %ymm5, %ymm0
4386; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
4387; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm4[30],zero,ymm4[28],zero,zero,zero,zero,ymm4[31],zero,ymm4[29],zero,zero,zero
4388; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm3[28],zero,ymm3[30,31,30,31],zero,ymm3[29],zero,ymm3[31,28,29]
4389; AVX512-FCP-NEXT:    vpor %ymm4, %ymm3, %ymm3
4390; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3]
4391; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm0))
4392; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29]
4393; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm2[27,u,u,u],zero,ymm2[30],zero,ymm2[28,u,u,u],zero,ymm2[31],zero
4394; AVX512-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
4395; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
4396; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
4397; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
4398; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
4399; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm3))
4400; AVX512-FCP-NEXT:    vmovdqa %ymm1, 192(%rax)
4401; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, (%rax)
4402; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, 128(%rax)
4403; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, 64(%rax)
4404; AVX512-FCP-NEXT:    vzeroupper
4405; AVX512-FCP-NEXT:    retq
4406;
4407; AVX512DQ-LABEL: store_i8_stride7_vf32:
4408; AVX512DQ:       # %bb.0:
4409; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
4410; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %r10
4411; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm3
4412; AVX512DQ-NEXT:    vmovdqa (%rsi), %ymm4
4413; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm5
4414; AVX512DQ-NEXT:    vmovdqa (%rcx), %ymm6
4415; AVX512DQ-NEXT:    vmovdqa (%r8), %ymm1
4416; AVX512DQ-NEXT:    vmovdqa (%r9), %ymm2
4417; AVX512DQ-NEXT:    vmovdqa (%r10), %ymm0
4418; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[18],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero,ymm6[20]
4419; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27],zero,ymm3[25]
4420; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm8, %zmm7, %zmm7
4421; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm7 = zmm7[2,3,2,3,6,7,6,7]
4422; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm8 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm5[18,19,20,21],zero,ymm5[19],zero,ymm5[25,26,27,22],zero,ymm5[20],zero
4423; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm4[23,u,u,u],zero,ymm4[26],zero,ymm4[24,u,u,u],zero,ymm4[27],zero
4424; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm8, %zmm8
4425; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm8 = zmm8[2,3,2,3,6,7,6,7]
4426; AVX512DQ-NEXT:    vporq %zmm7, %zmm8, %zmm7
4427; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[21],zero,ymm4[19],zero,zero,zero,zero,ymm4[22],zero,ymm4[20],zero,zero
4428; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero
4429; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm8, %zmm8
4430; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm8 = zmm8[2,3,2,3,6,7,6,7]
4431; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm3[19],zero,ymm3[21,20,21,22],zero,ymm3[20],zero,ymm3[22,23]
4432; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm10 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm5[23],zero,ymm5[21,22,23,26],zero,ymm5[24],zero,ymm5[28,29,26,27]
4433; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm9
4434; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm9 = zmm9[2,3,2,3,6,7,6,7]
4435; AVX512DQ-NEXT:    vporq %zmm8, %zmm9, %zmm8
4436; AVX512DQ-NEXT:    vpternlogd {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm7))
4437; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm7 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,u,u,u,u,26,27,24,25]
4438; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
4439; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm18
4440; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm10 = [0,5,4,0,5,0,4,0,20,21,0,23,0,21,0,23]
4441; AVX512DQ-NEXT:    vpermi2d %zmm7, %zmm9, %zmm10
4442; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[20],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22]
4443; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero
4444; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm7, %zmm7
4445; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm7 = zmm7[2,3,2,3,6,7,6,7]
4446; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm1[18],zero,ymm1[20,21,20,21],zero,ymm1[19],zero,ymm1[19,20,21,22],zero
4447; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm11 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm1[23],zero,ymm1[23,24,25,26],zero,ymm1[24],zero,ymm1[30,31]
4448; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm19
4449; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm11, %zmm9, %zmm9
4450; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm9 = zmm9[2,3,2,3,6,7,6,7]
4451; AVX512DQ-NEXT:    vporq %zmm7, %zmm9, %zmm7
4452; AVX512DQ-NEXT:    vpternlogd {{.*#+}} zmm7 = zmm10 ^ (mem & (zmm7 ^ zmm10))
4453; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm8))
4454; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm9
4455; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm8 = xmm9[u],zero,xmm9[7],zero,xmm9[5,u,u,u],zero,xmm9[8],zero,xmm9[6,u,u,u],zero
4456; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm10
4457; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm11 = xmm10[u,7],zero,xmm10[5],zero,xmm10[u,u,u,8],zero,xmm10[6],zero,xmm10[u,u,u,9]
4458; AVX512DQ-NEXT:    vpor %xmm8, %xmm11, %xmm8
4459; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
4460; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
4461; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm8, %zmm11, %zmm8
4462; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm8 = zmm8[0,1,0,1,4,5,4,5]
4463; AVX512DQ-NEXT:    vmovdqa (%rcx), %xmm14
4464; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm11 = xmm14[u,u,u],zero,xmm14[7],zero,xmm14[5,u,u,u],zero,xmm14[8],zero,xmm14[6,u,u]
4465; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm15
4466; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm12 = xmm15[u,u,u,7],zero,xmm15[5],zero,xmm15[u,u,u,8],zero,xmm15[6],zero,xmm15[u,u]
4467; AVX512DQ-NEXT:    vpor %xmm11, %xmm12, %xmm11
4468; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
4469; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
4470; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm11, %zmm12, %zmm11
4471; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm16 = zmm11[0,1,0,1,4,5,4,5]
4472; AVX512DQ-NEXT:    vpternlogd {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm8))
4473; AVX512DQ-NEXT:    vmovdqa (%r9), %xmm11
4474; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm8 = zero,xmm11[4,u,u,u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6]
4475; AVX512DQ-NEXT:    vmovdqa (%r8), %xmm12
4476; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm13 = xmm12[4],zero,xmm12[u,u,u,7],zero,xmm12[5],zero,xmm12[u,u,u,8],zero,xmm12[6],zero
4477; AVX512DQ-NEXT:    vpor %xmm8, %xmm13, %xmm8
4478; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
4479; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
4480; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm8, %zmm13, %zmm8
4481; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm17 = zmm8[0,1,0,1,4,5,4,5]
4482; AVX512DQ-NEXT:    vmovdqa (%r10), %xmm13
4483; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm8 = xmm13[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
4484; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm13[1,1,0,0,4,5,6,7]
4485; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
4486; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm0
4487; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm8 = zmm0[0,0,1,0,4,4,5,4]
4488; AVX512DQ-NEXT:    vpternlogd {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm17))
4489; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm16))
4490; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm4[14,u,u],zero,zero,zero,zero,ymm4[15,u,u],zero,zero,zero,zero,ymm4[16,u,u],zero,zero,zero,zero,ymm4[17,u,u],zero,zero,zero,zero,ymm4[18]
4491; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm3[0,1,14],zero,ymm3[u,u,0,1,14,15],zero,ymm3[u,u,13,2,3,16],zero,ymm3[u,u,28,29,16,17],zero,ymm3[u,u,19,28,29,18],zero
4492; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
4493; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm14[8],xmm15[8],xmm14[9],xmm15[9],xmm14[10],xmm15[10],xmm14[11],xmm15[11],xmm14[12],xmm15[12],xmm14[13],xmm15[13],xmm14[14],xmm15[14],xmm14[15],xmm15[15]
4494; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
4495; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
4496; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
4497; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u],zero,ymm6[14,u,u,u,u,u],zero,ymm6[15,u,u,u,u,u],zero,ymm6[16,u,u,u,u,u],zero,ymm6[17,u,u,u,u,u]
4498; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm14 = ymm5[u,u,u,u,14],zero,ymm5[u,u,u,u,u,15],zero,ymm5[u,u,u,u,u,16],zero,ymm5[u,u,u,u,u,17],zero,ymm5[u,u,u,u,u]
4499; AVX512DQ-NEXT:    vpor %ymm1, %ymm14, %ymm1
4500; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15]
4501; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
4502; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1]
4503; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm9, %zmm1
4504; AVX512DQ-NEXT:    vpternlogd {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm0))
4505; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm14
4506; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = zero,ymm14[u],zero,zero,zero,zero,ymm14[14],zero,ymm14[u],zero,zero,zero,zero,ymm14[15],zero,ymm14[u],zero,zero,zero,zero,ymm14[16],zero,ymm14[u],zero,zero,zero,zero,ymm14[17],zero,ymm14[u],zero,zero
4507; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm9 = ymm2[13,u,u,u,u,u],zero,ymm2[14,u,u,u,u,u],zero,ymm2[15,u,u,u,u,u],zero,ymm2[16,u,u,u,u,u],zero,ymm2[17,u,u,u]
4508; AVX512DQ-NEXT:    vpor %ymm0, %ymm9, %ymm0
4509; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15]
4510; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
4511; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1]
4512; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm9, %zmm0
4513; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm9 = xmm13[0,1,2,3,4,5,5,6]
4514; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3]
4515; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1]
4516; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9
4517; AVX512DQ-NEXT:    vmovdqa64 %ymm18, %ymm11
4518; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm10 = zero,ymm11[13,u,u,u,u],zero,zero,ymm11[14,u,u,u,u],zero,zero,ymm11[15,u,u,u,u],zero,zero,ymm11[16,u,u,u,u],zero,zero,ymm11[17,u,u]
4519; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm9
4520; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 | (zmm0 & mem)
4521; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm1))
4522; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29],zero,zero
4523; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u],zero,ymm6[30],zero,ymm6[28,u,u,u],zero,ymm6[31],zero,ymm6[29,u]
4524; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
4525; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
4526; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm4[30],zero,ymm4[28],zero,zero,zero,zero,ymm4[31],zero,ymm4[29],zero,zero,zero
4527; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm3[28],zero,ymm3[30,31,30,31],zero,ymm3[29],zero,ymm3[31,28,29]
4528; AVX512DQ-NEXT:    vpor %ymm1, %ymm3, %ymm1
4529; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
4530; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
4531; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm14[27],zero,zero,zero,zero,ymm14[30],zero,ymm14[28],zero,zero,zero,zero,ymm14[31],zero,ymm14[29]
4532; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm2[27,u,u,u],zero,ymm2[30],zero,ymm2[28,u,u,u],zero,ymm2[31],zero
4533; AVX512DQ-NEXT:    vpor %ymm0, %ymm2, %ymm0
4534; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
4535; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm2 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
4536; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3]
4537; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0))
4538; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1))
4539; AVX512DQ-NEXT:    vmovdqa %ymm2, 192(%rax)
4540; AVX512DQ-NEXT:    vmovdqa64 %zmm8, (%rax)
4541; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 128(%rax)
4542; AVX512DQ-NEXT:    vmovdqa64 %zmm9, 64(%rax)
4543; AVX512DQ-NEXT:    vzeroupper
4544; AVX512DQ-NEXT:    retq
4545;
4546; AVX512DQ-FCP-LABEL: store_i8_stride7_vf32:
4547; AVX512DQ-FCP:       # %bb.0:
4548; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
4549; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
4550; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm3
4551; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm4
4552; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm5
4553; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm6
4554; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm1
4555; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm2
4556; AVX512DQ-FCP-NEXT:    vmovdqa64 (%r10), %ymm17
4557; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm8
4558; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm8[u],zero,xmm8[7],zero,xmm8[5,u,u,u],zero,xmm8[8],zero,xmm8[6,u,u,u],zero
4559; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm9
4560; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm9[u,7],zero,xmm9[5],zero,xmm9[u,u,u,8],zero,xmm9[6],zero,xmm9[u,u,u,9]
4561; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm10, %xmm7
4562; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
4563; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
4564; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm7, %zmm10, %zmm7
4565; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm7 = zmm7[0,1,0,1,4,5,4,5]
4566; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm11
4567; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm11[u,u,u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6,u,u]
4568; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm12
4569; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,7],zero,xmm12[5],zero,xmm12[u,u,u,8],zero,xmm12[6],zero,xmm12[u,u]
4570; AVX512DQ-FCP-NEXT:    vpor %xmm10, %xmm13, %xmm10
4571; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
4572; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
4573; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm10, %zmm13, %zmm10
4574; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm15 = zmm10[0,1,0,1,4,5,4,5]
4575; AVX512DQ-FCP-NEXT:    vpternlogd {{.*#+}} zmm15 = zmm15 ^ (mem & (zmm15 ^ zmm7))
4576; AVX512DQ-FCP-NEXT:    vmovdqa (%r10), %xmm10
4577; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm10[1,1,0,0,4,5,6,7]
4578; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [0,1,0,1,0,0,0,0]
4579; AVX512DQ-FCP-NEXT:    vpermd %ymm7, %ymm13, %ymm7
4580; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm10[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
4581; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0]
4582; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm7, %zmm16
4583; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %xmm13
4584; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,xmm13[4,u,u,u],zero,xmm13[7],zero,xmm13[5,u,u,u],zero,xmm13[8],zero,xmm13[6]
4585; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm14
4586; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm14[4],zero,xmm14[u,u,u,7],zero,xmm14[5],zero,xmm14[u,u,u,8],zero,xmm14[6],zero
4587; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm0, %xmm0
4588; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
4589; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
4590; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm7, %zmm0
4591; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm7 = zmm0[0,1,0,1,4,5,4,5]
4592; AVX512DQ-FCP-NEXT:    vpternlogd {{.*#+}} zmm7 = zmm16 ^ (mem & (zmm7 ^ zmm16))
4593; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm15))
4594; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm4[14,u,u],zero,zero,zero,zero,ymm4[15,u,u],zero,zero,zero,zero,ymm4[16,u,u],zero,zero,zero,zero,ymm4[17,u,u],zero,zero,zero,zero,ymm4[18]
4595; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm3[0,1,14],zero,ymm3[u,u,0,1,14,15],zero,ymm3[u,u,13,2,3,16],zero,ymm3[u,u,28,29,16,17],zero,ymm3[u,u,19,28,29,18],zero
4596; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm15, %ymm0
4597; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm11 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15]
4598; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
4599; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1]
4600; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm11, %zmm0
4601; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm6[u,u,u,u],zero,ymm6[14,u,u,u,u,u],zero,ymm6[15,u,u,u,u,u],zero,ymm6[16,u,u,u,u,u],zero,ymm6[17,u,u,u,u,u]
4602; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm12 = ymm5[u,u,u,u,14],zero,ymm5[u,u,u,u,u,15],zero,ymm5[u,u,u,u,u,16],zero,ymm5[u,u,u,u,u,17],zero,ymm5[u,u,u,u,u]
4603; AVX512DQ-FCP-NEXT:    vpor %ymm11, %ymm12, %ymm11
4604; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15]
4605; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
4606; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1]
4607; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm8, %zmm9
4608; AVX512DQ-FCP-NEXT:    vpternlogd {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm0))
4609; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,ymm1[u],zero,zero,zero,zero,ymm1[14],zero,ymm1[u],zero,zero,zero,zero,ymm1[15],zero,ymm1[u],zero,zero,zero,zero,ymm1[16],zero,ymm1[u],zero,zero,zero,zero,ymm1[17],zero,ymm1[u],zero,zero
4610; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm2[13,u,u,u,u,u],zero,ymm2[14,u,u,u,u,u],zero,ymm2[15,u,u,u,u,u],zero,ymm2[16,u,u,u,u,u],zero,ymm2[17,u,u,u]
4611; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm8, %ymm0
4612; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15]
4613; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
4614; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1]
4615; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm8, %zmm0
4616; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm10[0,1,2,3,4,5,5,6]
4617; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [2,2,3,3,2,2,3,3]
4618; AVX512DQ-FCP-NEXT:    # ymm10 = mem[0,1,0,1]
4619; AVX512DQ-FCP-NEXT:    vpermd %ymm8, %ymm10, %ymm8
4620; AVX512DQ-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm8
4621; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm17, %ymm12
4622; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = zero,ymm12[13,u,u,u,u],zero,zero,ymm12[14,u,u,u,u],zero,zero,ymm12[15,u,u,u,u],zero,zero,ymm12[16,u,u,u,u],zero,zero,ymm12[17,u,u]
4623; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm8, %zmm8
4624; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm0 & mem)
4625; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm9))
4626; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[18],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero,ymm6[20]
4627; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27],zero,ymm3[25]
4628; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm0
4629; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7]
4630; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm5[18,19,20,21],zero,ymm5[19],zero,ymm5[25,26,27,22],zero,ymm5[20],zero
4631; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm4[23,u,u,u],zero,ymm4[26],zero,ymm4[24,u,u,u],zero,ymm4[27],zero
4632; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm9
4633; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm9 = zmm9[2,3,2,3,6,7,6,7]
4634; AVX512DQ-FCP-NEXT:    vporq %zmm0, %zmm9, %zmm0
4635; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[21],zero,ymm4[19],zero,zero,zero,zero,ymm4[22],zero,ymm4[20],zero,zero
4636; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero
4637; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm9
4638; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm9 = zmm9[2,3,2,3,6,7,6,7]
4639; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm3[19],zero,ymm3[21,20,21,22],zero,ymm3[20],zero,ymm3[22,23]
4640; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm5[23],zero,ymm5[21,22,23,26],zero,ymm5[24],zero,ymm5[28,29,26,27]
4641; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm10, %zmm10
4642; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm10 = zmm10[2,3,2,3,6,7,6,7]
4643; AVX512DQ-FCP-NEXT:    vporq %zmm9, %zmm10, %zmm9
4644; AVX512DQ-FCP-NEXT:    vpternlogd {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm0))
4645; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[20],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22]
4646; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero
4647; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm0, %zmm0
4648; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7]
4649; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm1[18],zero,ymm1[20,21,20,21],zero,ymm1[19],zero,ymm1[19,20,21,22],zero
4650; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm1[23],zero,ymm1[23,24,25,26],zero,ymm1[24],zero,ymm1[30,31]
4651; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm10, %zmm10
4652; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm10 = zmm10[2,3,2,3,6,7,6,7]
4653; AVX512DQ-FCP-NEXT:    vporq %zmm0, %zmm10, %zmm0
4654; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm12[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
4655; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [0,5,4,0,5,0,4,0]
4656; AVX512DQ-FCP-NEXT:    vpermd %ymm10, %ymm11, %ymm10
4657; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
4658; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3]
4659; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm10, %zmm10
4660; AVX512DQ-FCP-NEXT:    vpternlogd {{.*#+}} zmm10 = zmm10 ^ (mem & (zmm10 ^ zmm0))
4661; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm10 ^ (mem & (zmm10 ^ zmm9))
4662; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29],zero,zero
4663; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u],zero,ymm6[30],zero,ymm6[28,u,u,u],zero,ymm6[31],zero,ymm6[29,u]
4664; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm5, %ymm0
4665; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
4666; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm4[30],zero,ymm4[28],zero,zero,zero,zero,ymm4[31],zero,ymm4[29],zero,zero,zero
4667; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm3[28],zero,ymm3[30,31,30,31],zero,ymm3[29],zero,ymm3[31,28,29]
4668; AVX512DQ-FCP-NEXT:    vpor %ymm4, %ymm3, %ymm3
4669; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3]
4670; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm0))
4671; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29]
4672; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm2[27,u,u,u],zero,ymm2[30],zero,ymm2[28,u,u,u],zero,ymm2[31],zero
4673; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
4674; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
4675; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
4676; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
4677; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
4678; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm3))
4679; AVX512DQ-FCP-NEXT:    vmovdqa %ymm1, 192(%rax)
4680; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, (%rax)
4681; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, 128(%rax)
4682; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, 64(%rax)
4683; AVX512DQ-FCP-NEXT:    vzeroupper
4684; AVX512DQ-FCP-NEXT:    retq
4685;
4686; AVX512BW-LABEL: store_i8_stride7_vf32:
4687; AVX512BW:       # %bb.0:
4688; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
4689; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
4690; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm4
4691; AVX512BW-NEXT:    vmovdqa (%rsi), %ymm2
4692; AVX512BW-NEXT:    vmovdqa (%rdx), %ymm1
4693; AVX512BW-NEXT:    vmovdqa (%rcx), %ymm3
4694; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[0,1,0,1,14],zero,ymm1[14,15,0,1,14,15],zero,ymm1[13,14,15,16,17,16],zero,ymm1[30,31,30,31,16,17],zero,ymm1[31,28,29,30,31]
4695; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero
4696; AVX512BW-NEXT:    vpor %ymm0, %ymm5, %ymm0
4697; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm8
4698; AVX512BW-NEXT:    vmovdqa (%rsi), %xmm10
4699; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15]
4700; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
4701; AVX512BW-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1]
4702; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm5, %zmm5
4703; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm4[0,1,14],zero,ymm4[12,13,0,1,14,15],zero,ymm4[3,12,13,2,3,16],zero,ymm4[30,31,28,29,16,17],zero,ymm4[31,18,19,28,29,18],zero
4704; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,zero,zero,ymm2[18]
4705; AVX512BW-NEXT:    vpor %ymm0, %ymm6, %ymm0
4706; AVX512BW-NEXT:    vmovdqa (%rdx), %xmm12
4707; AVX512BW-NEXT:    vmovdqa (%rcx), %xmm14
4708; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15]
4709; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
4710; AVX512BW-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1]
4711; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm6, %zmm0
4712; AVX512BW-NEXT:    movabsq $435749858791416001, %rcx # imm = 0x60C1830183060C1
4713; AVX512BW-NEXT:    kmovq %rcx, %k1
4714; AVX512BW-NEXT:    vmovdqu8 %zmm5, %zmm0 {%k1}
4715; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3]
4716; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm7 = ymm5[13],zero,zero,zero,zero,zero,zero,ymm5[14],zero,zero,zero,zero,zero,zero,ymm5[15],zero,zero,zero,zero,zero,zero,ymm5[16],zero,zero,zero,zero,zero,zero,ymm5[17],zero,zero,zero
4717; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = mem[0,1,2,3,0,1,2,3]
4718; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero
4719; AVX512BW-NEXT:    vpor %ymm7, %ymm9, %ymm7
4720; AVX512BW-NEXT:    vmovdqa (%r9), %xmm11
4721; AVX512BW-NEXT:    vmovdqa (%r8), %xmm13
4722; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15]
4723; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
4724; AVX512BW-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1]
4725; AVX512BW-NEXT:    vinserti64x4 $1, %ymm7, %zmm9, %zmm9
4726; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = mem[0,1,2,3,0,1,2,3]
4727; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6]
4728; AVX512BW-NEXT:    # ymm15 = mem[0,1,0,1]
4729; AVX512BW-NEXT:    vpermw %ymm7, %ymm15, %ymm15
4730; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm16 = ymm7[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31]
4731; AVX512BW-NEXT:    vinserti64x4 $1, %ymm16, %zmm15, %zmm15
4732; AVX512BW-NEXT:    movabsq $2323999253380730912, %rcx # imm = 0x2040810204081020
4733; AVX512BW-NEXT:    kmovq %rcx, %k1
4734; AVX512BW-NEXT:    vmovdqu8 %zmm15, %zmm9 {%k1}
4735; AVX512BW-NEXT:    movabsq $4066998693416279096, %rcx # imm = 0x3870E1C3870E1C38
4736; AVX512BW-NEXT:    kmovq %rcx, %k1
4737; AVX512BW-NEXT:    vmovdqu8 %zmm9, %zmm0 {%k1}
4738; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm9
4739; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm9 = zmm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zmm9[18,19,20,21],zero,zmm9[19],zero,zmm9[25,26,27,22],zero,zmm9[20],zero,zmm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm9[55],zero,zero,zero,zero,zmm9[58],zero,zmm9[56],zero,zero,zero,zero,zmm9[59],zero
4740; AVX512BW-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm15
4741; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm15 = zmm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm15[18],zero,zero,zero,zero,zmm15[21],zero,zmm15[19],zero,zero,zero,zero,zmm15[22],zero,zmm15[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm15[55],zero,zero,zero,zero,zmm15[58],zero,zmm15[56],zero,zero,zero,zero,zmm15[59],zero,zmm15[57]
4742; AVX512BW-NEXT:    vporq %zmm9, %zmm15, %zmm9
4743; AVX512BW-NEXT:    vpermq {{.*#+}} zmm9 = zmm9[2,3,2,3,6,7,6,7]
4744; AVX512BW-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
4745; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm15 = ymm15[0,0,1,1,4,4,5,5]
4746; AVX512BW-NEXT:    movl $676341840, %ecx # imm = 0x28502850
4747; AVX512BW-NEXT:    kmovd %ecx, %k1
4748; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm15 {%k1} = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u]
4749; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm16 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm1[23],zero,ymm1[21,22,23,26],zero,ymm1[24],zero,ymm1[28,29,26,27]
4750; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm17 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero
4751; AVX512BW-NEXT:    vporq %ymm16, %ymm17, %ymm16
4752; AVX512BW-NEXT:    vinserti64x4 $1, %ymm16, %zmm15, %zmm15
4753; AVX512BW-NEXT:    vpermq {{.*#+}} zmm15 = zmm15[2,3,2,3,6,7,6,7]
4754; AVX512BW-NEXT:    movabsq $-9005497107459067808, %rcx # imm = 0x83060C180C183060
4755; AVX512BW-NEXT:    kmovq %rcx, %k2
4756; AVX512BW-NEXT:    vmovdqu8 %zmm15, %zmm9 {%k2}
4757; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm15 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28]
4758; AVX512BW-NEXT:    vpermw %zmm7, %zmm15, %zmm15
4759; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm16 = zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm6[18],zero,zmm6[20,21,20,21],zero,zmm6[19],zero,zmm6[19,20,21,22],zero,zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm6[55],zero,zmm6[55,56,57,58],zero,zmm6[56],zero,zmm6[62,63]
4760; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm17 = zmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm5[20],zero,zmm5[18],zero,zero,zero,zero,zmm5[21],zero,zmm5[19],zero,zero,zero,zero,zmm5[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm5[57],zero,zmm5[55],zero,zero,zero,zero,zmm5[58],zero,zmm5[56],zero,zero
4761; AVX512BW-NEXT:    vporq %zmm16, %zmm17, %zmm16
4762; AVX512BW-NEXT:    vpermq {{.*#+}} zmm16 = zmm16[2,3,2,3,6,7,6,7]
4763; AVX512BW-NEXT:    movabsq $1161999626690365456, %rcx # imm = 0x1020408102040810
4764; AVX512BW-NEXT:    kmovq %rcx, %k2
4765; AVX512BW-NEXT:    vmovdqu8 %zmm15, %zmm16 {%k2}
4766; AVX512BW-NEXT:    movabsq $2033499346708139548, %rcx # imm = 0x1C3870E1C3870E1C
4767; AVX512BW-NEXT:    kmovq %rcx, %k2
4768; AVX512BW-NEXT:    vmovdqu8 %zmm16, %zmm9 {%k2}
4769; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm14[u,u,u],zero,xmm14[7],zero,xmm14[5,u,u,u],zero,xmm14[8],zero,xmm14[6,u,u]
4770; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm16 = xmm12[u,u,u,7],zero,xmm12[5],zero,xmm12[u,u,u,8],zero,xmm12[6],zero,xmm12[u,u]
4771; AVX512BW-NEXT:    vporq %xmm15, %xmm16, %xmm15
4772; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7]
4773; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
4774; AVX512BW-NEXT:    vinserti32x4 $2, %xmm15, %zmm12, %zmm12
4775; AVX512BW-NEXT:    vpermq {{.*#+}} zmm12 = zmm12[0,1,0,1,4,5,4,5]
4776; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm10[u],zero,xmm10[7],zero,xmm10[5,u,u,u],zero,xmm10[8],zero,xmm10[6,u,u,u],zero
4777; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm8[u,7],zero,xmm8[5],zero,xmm8[u,u,u,8],zero,xmm8[6],zero,xmm8[u,u,u,9]
4778; AVX512BW-NEXT:    vpor %xmm14, %xmm15, %xmm14
4779; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3],xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7]
4780; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
4781; AVX512BW-NEXT:    vinserti32x4 $2, %xmm14, %zmm8, %zmm8
4782; AVX512BW-NEXT:    vpermq {{.*#+}} zmm8 = zmm8[0,1,0,1,4,5,4,5]
4783; AVX512BW-NEXT:    movabsq $871499720017774092, %rcx # imm = 0xC183060C183060C
4784; AVX512BW-NEXT:    kmovq %rcx, %k2
4785; AVX512BW-NEXT:    vmovdqu8 %zmm12, %zmm8 {%k2}
4786; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm10 = zero,xmm11[4,u,u,u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6]
4787; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm12 = xmm13[4],zero,xmm13[u,u,u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero
4788; AVX512BW-NEXT:    vpor %xmm10, %xmm12, %xmm10
4789; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7]
4790; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
4791; AVX512BW-NEXT:    vinserti32x4 $2, %xmm10, %zmm11, %zmm10
4792; AVX512BW-NEXT:    vpermq {{.*#+}} zmm10 = zmm10[0,1,0,1,4,5,4,5]
4793; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm11 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20]
4794; AVX512BW-NEXT:    vpermw %zmm7, %zmm11, %zmm11
4795; AVX512BW-NEXT:    movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040
4796; AVX512BW-NEXT:    kmovq %rcx, %k2
4797; AVX512BW-NEXT:    vmovdqu8 %zmm11, %zmm10 {%k2}
4798; AVX512BW-NEXT:    movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870
4799; AVX512BW-NEXT:    kmovq %rcx, %k2
4800; AVX512BW-NEXT:    vmovdqu8 %zmm10, %zmm8 {%k2}
4801; AVX512BW-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14]
4802; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,3,3,6,6,7,7]
4803; AVX512BW-NEXT:    movl $338170920, %ecx # imm = 0x14281428
4804; AVX512BW-NEXT:    kmovd %ecx, %k2
4805; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm4 {%k2} = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u]
4806; AVX512BW-NEXT:    vpermq {{.*#+}} ymm2 = ymm4[2,3,2,3]
4807; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u]
4808; AVX512BW-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
4809; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,3,3,4,6,7,7]
4810; AVX512BW-NEXT:    vmovdqu8 %ymm1, %ymm3 {%k1}
4811; AVX512BW-NEXT:    vpermq {{.*#+}} ymm1 = ymm3[2,3,2,3]
4812; AVX512BW-NEXT:    movl $101455920, %ecx # imm = 0x60C1830
4813; AVX512BW-NEXT:    kmovd %ecx, %k1
4814; AVX512BW-NEXT:    vmovdqu8 %ymm2, %ymm1 {%k1}
4815; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15]
4816; AVX512BW-NEXT:    # ymm2 = mem[0,1,0,1]
4817; AVX512BW-NEXT:    vpermw %ymm7, %ymm2, %ymm2
4818; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm5[27],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero
4819; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[27],zero,zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29]
4820; AVX512BW-NEXT:    vpor %ymm3, %ymm4, %ymm3
4821; AVX512BW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3]
4822; AVX512BW-NEXT:    movl $-2130574328, %ecx # imm = 0x81020408
4823; AVX512BW-NEXT:    kmovd %ecx, %k1
4824; AVX512BW-NEXT:    vmovdqu8 %ymm2, %ymm3 {%k1}
4825; AVX512BW-NEXT:    movl $-507279602, %ecx # imm = 0xE1C3870E
4826; AVX512BW-NEXT:    kmovd %ecx, %k1
4827; AVX512BW-NEXT:    vmovdqu8 %ymm3, %ymm1 {%k1}
4828; AVX512BW-NEXT:    vmovdqa %ymm1, 192(%rax)
4829; AVX512BW-NEXT:    vmovdqa64 %zmm8, (%rax)
4830; AVX512BW-NEXT:    vmovdqa64 %zmm9, 128(%rax)
4831; AVX512BW-NEXT:    vmovdqa64 %zmm0, 64(%rax)
4832; AVX512BW-NEXT:    vzeroupper
4833; AVX512BW-NEXT:    retq
4834;
4835; AVX512BW-FCP-LABEL: store_i8_stride7_vf32:
4836; AVX512BW-FCP:       # %bb.0:
4837; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
4838; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
4839; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %ymm3
4840; AVX512BW-FCP-NEXT:    vmovdqa (%rsi), %ymm4
4841; AVX512BW-FCP-NEXT:    vmovdqa (%rdx), %ymm1
4842; AVX512BW-FCP-NEXT:    vmovdqa (%rcx), %ymm2
4843; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[0,1,0,1,14],zero,ymm1[14,15,0,1,14,15],zero,ymm1[13,14,15,16,17,16],zero,ymm1[30,31,30,31,16,17],zero,ymm1[31,28,29,30,31]
4844; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,zero
4845; AVX512BW-FCP-NEXT:    vpor %ymm0, %ymm5, %ymm0
4846; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %xmm8
4847; AVX512BW-FCP-NEXT:    vmovdqa (%rsi), %xmm9
4848; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15]
4849; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
4850; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1]
4851; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm5, %zmm5
4852; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm3[0,1,14],zero,ymm3[12,13,0,1,14,15],zero,ymm3[3,12,13,2,3,16],zero,ymm3[30,31,28,29,16,17],zero,ymm3[31,18,19,28,29,18],zero
4853; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm4[14],zero,zero,zero,zero,zero,zero,ymm4[15],zero,zero,zero,zero,zero,zero,ymm4[16],zero,zero,zero,zero,zero,zero,ymm4[17],zero,zero,zero,zero,zero,zero,ymm4[18]
4854; AVX512BW-FCP-NEXT:    vpor %ymm0, %ymm6, %ymm0
4855; AVX512BW-FCP-NEXT:    vmovdqa (%rdx), %xmm12
4856; AVX512BW-FCP-NEXT:    vmovdqa (%rcx), %xmm14
4857; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15]
4858; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
4859; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1]
4860; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm6, %zmm0
4861; AVX512BW-FCP-NEXT:    movabsq $435749858791416001, %rcx # imm = 0x60C1830183060C1
4862; AVX512BW-FCP-NEXT:    kmovq %rcx, %k1
4863; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm5, %zmm0 {%k1}
4864; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3]
4865; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm5[13],zero,zero,zero,zero,zero,zero,ymm5[14],zero,zero,zero,zero,zero,zero,ymm5[15],zero,zero,zero,zero,zero,zero,ymm5[16],zero,zero,zero,zero,zero,zero,ymm5[17],zero,zero,zero
4866; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = mem[0,1,2,3,0,1,2,3]
4867; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero
4868; AVX512BW-FCP-NEXT:    vpor %ymm7, %ymm10, %ymm7
4869; AVX512BW-FCP-NEXT:    vmovdqa (%r9), %xmm11
4870; AVX512BW-FCP-NEXT:    vmovdqa (%r8), %xmm13
4871; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15]
4872; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
4873; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1]
4874; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm10, %zmm10
4875; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = mem[0,1,2,3,0,1,2,3]
4876; AVX512BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6]
4877; AVX512BW-FCP-NEXT:    # ymm15 = mem[0,1,0,1]
4878; AVX512BW-FCP-NEXT:    vpermw %ymm7, %ymm15, %ymm15
4879; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm16 = ymm7[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31]
4880; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm16, %zmm15, %zmm15
4881; AVX512BW-FCP-NEXT:    movabsq $2323999253380730912, %rcx # imm = 0x2040810204081020
4882; AVX512BW-FCP-NEXT:    kmovq %rcx, %k1
4883; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm15, %zmm10 {%k1}
4884; AVX512BW-FCP-NEXT:    movabsq $4066998693416279096, %rcx # imm = 0x3870E1C3870E1C38
4885; AVX512BW-FCP-NEXT:    kmovq %rcx, %k1
4886; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm10, %zmm0 {%k1}
4887; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm10
4888; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm10 = zmm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,zmm10[19],zero,zmm10[21,20,21,22],zero,zmm10[20],zero,zmm10[22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57],zero,zmm10[55],zero,zmm10[53,54,55,58],zero,zmm10[56],zero,zmm10[60,61,58,59]
4889; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm4, %zmm15
4890; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm15 = zmm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm15[21],zero,zmm15[19],zero,zero,zero,zero,zmm15[22],zero,zmm15[20],zero,zero,zmm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm15[57],zero,zmm15[55],zero,zero,zero,zero,zmm15[58],zero,zmm15[56],zero,zero,zero,zero
4891; AVX512BW-FCP-NEXT:    vporq %zmm10, %zmm15, %zmm10
4892; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm15 = zmm10[2,3,2,3,6,7,6,7]
4893; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm1, %zmm10
4894; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm10 = zmm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zmm10[18,19,20,21],zero,zmm10[19],zero,zmm10[25,26,27,22],zero,zmm10[20],zero,zmm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm10[55],zero,zero,zero,zero,zmm10[58],zero,zmm10[56],zero,zero,zero,zero,zmm10[59],zero
4895; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm16
4896; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm16 = zmm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm16[18],zero,zero,zero,zero,zmm16[21],zero,zmm16[19],zero,zero,zero,zero,zmm16[22],zero,zmm16[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm16[55],zero,zero,zero,zero,zmm16[58],zero,zmm16[56],zero,zero,zero,zero,zmm16[59],zero,zmm16[57]
4897; AVX512BW-FCP-NEXT:    vporq %zmm10, %zmm16, %zmm10
4898; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm10 = zmm10[2,3,2,3,6,7,6,7]
4899; AVX512BW-FCP-NEXT:    movabsq $-9005497107459067808, %rcx # imm = 0x83060C180C183060
4900; AVX512BW-FCP-NEXT:    kmovq %rcx, %k1
4901; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm15, %zmm10 {%k1}
4902; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm15 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28]
4903; AVX512BW-FCP-NEXT:    vpermw %zmm7, %zmm15, %zmm15
4904; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm16 = zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm6[18],zero,zmm6[20,21,20,21],zero,zmm6[19],zero,zmm6[19,20,21,22],zero,zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm6[55],zero,zmm6[55,56,57,58],zero,zmm6[56],zero,zmm6[62,63]
4905; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm17 = zmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm5[20],zero,zmm5[18],zero,zero,zero,zero,zmm5[21],zero,zmm5[19],zero,zero,zero,zero,zmm5[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm5[57],zero,zmm5[55],zero,zero,zero,zero,zmm5[58],zero,zmm5[56],zero,zero
4906; AVX512BW-FCP-NEXT:    vporq %zmm16, %zmm17, %zmm16
4907; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm16 = zmm16[2,3,2,3,6,7,6,7]
4908; AVX512BW-FCP-NEXT:    movabsq $1161999626690365456, %rcx # imm = 0x1020408102040810
4909; AVX512BW-FCP-NEXT:    kmovq %rcx, %k1
4910; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm15, %zmm16 {%k1}
4911; AVX512BW-FCP-NEXT:    movabsq $2033499346708139548, %rcx # imm = 0x1C3870E1C3870E1C
4912; AVX512BW-FCP-NEXT:    kmovq %rcx, %k1
4913; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm16, %zmm10 {%k1}
4914; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm14[u,u,u],zero,xmm14[7],zero,xmm14[5,u,u,u],zero,xmm14[8],zero,xmm14[6,u,u]
4915; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm12[u,u,u,7],zero,xmm12[5],zero,xmm12[u,u,u,8],zero,xmm12[6],zero,xmm12[u,u]
4916; AVX512BW-FCP-NEXT:    vporq %xmm15, %xmm16, %xmm15
4917; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7]
4918; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
4919; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm15, %zmm12, %zmm12
4920; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm12 = zmm12[0,1,0,1,4,5,4,5]
4921; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm9[u],zero,xmm9[7],zero,xmm9[5,u,u,u],zero,xmm9[8],zero,xmm9[6,u,u,u],zero
4922; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm8[u,7],zero,xmm8[5],zero,xmm8[u,u,u,8],zero,xmm8[6],zero,xmm8[u,u,u,9]
4923; AVX512BW-FCP-NEXT:    vpor %xmm14, %xmm15, %xmm14
4924; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
4925; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
4926; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm14, %zmm8, %zmm8
4927; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm8 = zmm8[0,1,0,1,4,5,4,5]
4928; AVX512BW-FCP-NEXT:    movabsq $871499720017774092, %rcx # imm = 0xC183060C183060C
4929; AVX512BW-FCP-NEXT:    kmovq %rcx, %k1
4930; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm12, %zmm8 {%k1}
4931; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = zero,xmm11[4,u,u,u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6]
4932; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm13[4],zero,xmm13[u,u,u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero
4933; AVX512BW-FCP-NEXT:    vpor %xmm9, %xmm12, %xmm9
4934; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7]
4935; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
4936; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm9, %zmm11, %zmm9
4937; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm9 = zmm9[0,1,0,1,4,5,4,5]
4938; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm11 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20]
4939; AVX512BW-FCP-NEXT:    vpermw %zmm7, %zmm11, %zmm11
4940; AVX512BW-FCP-NEXT:    movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040
4941; AVX512BW-FCP-NEXT:    kmovq %rcx, %k1
4942; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm11, %zmm9 {%k1}
4943; AVX512BW-FCP-NEXT:    movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870
4944; AVX512BW-FCP-NEXT:    kmovq %rcx, %k1
4945; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm9, %zmm8 {%k1}
4946; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm3[28],zero,ymm3[30,31,30,31],zero,ymm3[29],zero,ymm3[31,28,29]
4947; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm4[30],zero,ymm4[28],zero,zero,zero,zero,ymm4[31],zero,ymm4[29],zero,zero,zero
4948; AVX512BW-FCP-NEXT:    vpor %ymm3, %ymm4, %ymm3
4949; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3]
4950; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero
4951; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero
4952; AVX512BW-FCP-NEXT:    vpor %ymm2, %ymm1, %ymm1
4953; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
4954; AVX512BW-FCP-NEXT:    movl $101455920, %ecx # imm = 0x60C1830
4955; AVX512BW-FCP-NEXT:    kmovd %ecx, %k1
4956; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm3, %ymm1 {%k1}
4957; AVX512BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15]
4958; AVX512BW-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
4959; AVX512BW-FCP-NEXT:    vpermw %ymm7, %ymm2, %ymm2
4960; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm5[27],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero
4961; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[27],zero,zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29]
4962; AVX512BW-FCP-NEXT:    vpor %ymm3, %ymm4, %ymm3
4963; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3]
4964; AVX512BW-FCP-NEXT:    movl $-2130574328, %ecx # imm = 0x81020408
4965; AVX512BW-FCP-NEXT:    kmovd %ecx, %k1
4966; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm2, %ymm3 {%k1}
4967; AVX512BW-FCP-NEXT:    movl $-507279602, %ecx # imm = 0xE1C3870E
4968; AVX512BW-FCP-NEXT:    kmovd %ecx, %k1
4969; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm3, %ymm1 {%k1}
4970; AVX512BW-FCP-NEXT:    vmovdqa %ymm1, 192(%rax)
4971; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, (%rax)
4972; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, 128(%rax)
4973; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, 64(%rax)
4974; AVX512BW-FCP-NEXT:    vzeroupper
4975; AVX512BW-FCP-NEXT:    retq
4976;
4977; AVX512DQ-BW-LABEL: store_i8_stride7_vf32:
4978; AVX512DQ-BW:       # %bb.0:
4979; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
4980; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
4981; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %ymm4
4982; AVX512DQ-BW-NEXT:    vmovdqa (%rsi), %ymm2
4983; AVX512DQ-BW-NEXT:    vmovdqa (%rdx), %ymm1
4984; AVX512DQ-BW-NEXT:    vmovdqa (%rcx), %ymm3
4985; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[0,1,0,1,14],zero,ymm1[14,15,0,1,14,15],zero,ymm1[13,14,15,16,17,16],zero,ymm1[30,31,30,31,16,17],zero,ymm1[31,28,29,30,31]
4986; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero
4987; AVX512DQ-BW-NEXT:    vpor %ymm0, %ymm5, %ymm0
4988; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %xmm8
4989; AVX512DQ-BW-NEXT:    vmovdqa (%rsi), %xmm10
4990; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15]
4991; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
4992; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1]
4993; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm5, %zmm5
4994; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm4[0,1,14],zero,ymm4[12,13,0,1,14,15],zero,ymm4[3,12,13,2,3,16],zero,ymm4[30,31,28,29,16,17],zero,ymm4[31,18,19,28,29,18],zero
4995; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,zero,zero,ymm2[18]
4996; AVX512DQ-BW-NEXT:    vpor %ymm0, %ymm6, %ymm0
4997; AVX512DQ-BW-NEXT:    vmovdqa (%rdx), %xmm12
4998; AVX512DQ-BW-NEXT:    vmovdqa (%rcx), %xmm14
4999; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15]
5000; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
5001; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1]
5002; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm6, %zmm0
5003; AVX512DQ-BW-NEXT:    movabsq $435749858791416001, %rcx # imm = 0x60C1830183060C1
5004; AVX512DQ-BW-NEXT:    kmovq %rcx, %k1
5005; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm5, %zmm0 {%k1}
5006; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3]
5007; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm7 = ymm5[13],zero,zero,zero,zero,zero,zero,ymm5[14],zero,zero,zero,zero,zero,zero,ymm5[15],zero,zero,zero,zero,zero,zero,ymm5[16],zero,zero,zero,zero,zero,zero,ymm5[17],zero,zero,zero
5008; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = mem[0,1,2,3,0,1,2,3]
5009; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero
5010; AVX512DQ-BW-NEXT:    vpor %ymm7, %ymm9, %ymm7
5011; AVX512DQ-BW-NEXT:    vmovdqa (%r9), %xmm11
5012; AVX512DQ-BW-NEXT:    vmovdqa (%r8), %xmm13
5013; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15]
5014; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
5015; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1]
5016; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm7, %zmm9, %zmm9
5017; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = mem[0,1,2,3,0,1,2,3]
5018; AVX512DQ-BW-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6]
5019; AVX512DQ-BW-NEXT:    # ymm15 = mem[0,1,0,1]
5020; AVX512DQ-BW-NEXT:    vpermw %ymm7, %ymm15, %ymm15
5021; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm16 = ymm7[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31]
5022; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm16, %zmm15, %zmm15
5023; AVX512DQ-BW-NEXT:    movabsq $2323999253380730912, %rcx # imm = 0x2040810204081020
5024; AVX512DQ-BW-NEXT:    kmovq %rcx, %k1
5025; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm15, %zmm9 {%k1}
5026; AVX512DQ-BW-NEXT:    movabsq $4066998693416279096, %rcx # imm = 0x3870E1C3870E1C38
5027; AVX512DQ-BW-NEXT:    kmovq %rcx, %k1
5028; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm9, %zmm0 {%k1}
5029; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm9
5030; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} zmm9 = zmm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zmm9[18,19,20,21],zero,zmm9[19],zero,zmm9[25,26,27,22],zero,zmm9[20],zero,zmm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm9[55],zero,zero,zero,zero,zmm9[58],zero,zmm9[56],zero,zero,zero,zero,zmm9[59],zero
5031; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm15
5032; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} zmm15 = zmm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm15[18],zero,zero,zero,zero,zmm15[21],zero,zmm15[19],zero,zero,zero,zero,zmm15[22],zero,zmm15[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm15[55],zero,zero,zero,zero,zmm15[58],zero,zmm15[56],zero,zero,zero,zero,zmm15[59],zero,zmm15[57]
5033; AVX512DQ-BW-NEXT:    vporq %zmm9, %zmm15, %zmm9
5034; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm9 = zmm9[2,3,2,3,6,7,6,7]
5035; AVX512DQ-BW-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
5036; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm15 = ymm15[0,0,1,1,4,4,5,5]
5037; AVX512DQ-BW-NEXT:    movl $676341840, %ecx # imm = 0x28502850
5038; AVX512DQ-BW-NEXT:    kmovd %ecx, %k1
5039; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm15 {%k1} = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u]
5040; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm16 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm1[23],zero,ymm1[21,22,23,26],zero,ymm1[24],zero,ymm1[28,29,26,27]
5041; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm17 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero
5042; AVX512DQ-BW-NEXT:    vporq %ymm16, %ymm17, %ymm16
5043; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm16, %zmm15, %zmm15
5044; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm15 = zmm15[2,3,2,3,6,7,6,7]
5045; AVX512DQ-BW-NEXT:    movabsq $-9005497107459067808, %rcx # imm = 0x83060C180C183060
5046; AVX512DQ-BW-NEXT:    kmovq %rcx, %k2
5047; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm15, %zmm9 {%k2}
5048; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm15 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28]
5049; AVX512DQ-BW-NEXT:    vpermw %zmm7, %zmm15, %zmm15
5050; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} zmm16 = zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm6[18],zero,zmm6[20,21,20,21],zero,zmm6[19],zero,zmm6[19,20,21,22],zero,zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm6[55],zero,zmm6[55,56,57,58],zero,zmm6[56],zero,zmm6[62,63]
5051; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} zmm17 = zmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm5[20],zero,zmm5[18],zero,zero,zero,zero,zmm5[21],zero,zmm5[19],zero,zero,zero,zero,zmm5[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm5[57],zero,zmm5[55],zero,zero,zero,zero,zmm5[58],zero,zmm5[56],zero,zero
5052; AVX512DQ-BW-NEXT:    vporq %zmm16, %zmm17, %zmm16
5053; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm16 = zmm16[2,3,2,3,6,7,6,7]
5054; AVX512DQ-BW-NEXT:    movabsq $1161999626690365456, %rcx # imm = 0x1020408102040810
5055; AVX512DQ-BW-NEXT:    kmovq %rcx, %k2
5056; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm15, %zmm16 {%k2}
5057; AVX512DQ-BW-NEXT:    movabsq $2033499346708139548, %rcx # imm = 0x1C3870E1C3870E1C
5058; AVX512DQ-BW-NEXT:    kmovq %rcx, %k2
5059; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm16, %zmm9 {%k2}
5060; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm14[u,u,u],zero,xmm14[7],zero,xmm14[5,u,u,u],zero,xmm14[8],zero,xmm14[6,u,u]
5061; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm16 = xmm12[u,u,u,7],zero,xmm12[5],zero,xmm12[u,u,u,8],zero,xmm12[6],zero,xmm12[u,u]
5062; AVX512DQ-BW-NEXT:    vporq %xmm15, %xmm16, %xmm15
5063; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7]
5064; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
5065; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm15, %zmm12, %zmm12
5066; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm12 = zmm12[0,1,0,1,4,5,4,5]
5067; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm14 = xmm10[u],zero,xmm10[7],zero,xmm10[5,u,u,u],zero,xmm10[8],zero,xmm10[6,u,u,u],zero
5068; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm15 = xmm8[u,7],zero,xmm8[5],zero,xmm8[u,u,u,8],zero,xmm8[6],zero,xmm8[u,u,u,9]
5069; AVX512DQ-BW-NEXT:    vpor %xmm14, %xmm15, %xmm14
5070; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3],xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7]
5071; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
5072; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm14, %zmm8, %zmm8
5073; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm8 = zmm8[0,1,0,1,4,5,4,5]
5074; AVX512DQ-BW-NEXT:    movabsq $871499720017774092, %rcx # imm = 0xC183060C183060C
5075; AVX512DQ-BW-NEXT:    kmovq %rcx, %k2
5076; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm12, %zmm8 {%k2}
5077; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm10 = zero,xmm11[4,u,u,u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6]
5078; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm12 = xmm13[4],zero,xmm13[u,u,u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero
5079; AVX512DQ-BW-NEXT:    vpor %xmm10, %xmm12, %xmm10
5080; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7]
5081; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
5082; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm10, %zmm11, %zmm10
5083; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm10 = zmm10[0,1,0,1,4,5,4,5]
5084; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm11 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20]
5085; AVX512DQ-BW-NEXT:    vpermw %zmm7, %zmm11, %zmm11
5086; AVX512DQ-BW-NEXT:    movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040
5087; AVX512DQ-BW-NEXT:    kmovq %rcx, %k2
5088; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm11, %zmm10 {%k2}
5089; AVX512DQ-BW-NEXT:    movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870
5090; AVX512DQ-BW-NEXT:    kmovq %rcx, %k2
5091; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm10, %zmm8 {%k2}
5092; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14]
5093; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,3,3,6,6,7,7]
5094; AVX512DQ-BW-NEXT:    movl $338170920, %ecx # imm = 0x14281428
5095; AVX512DQ-BW-NEXT:    kmovd %ecx, %k2
5096; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm4 {%k2} = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u]
5097; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm2 = ymm4[2,3,2,3]
5098; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u]
5099; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
5100; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,3,3,4,6,7,7]
5101; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm1, %ymm3 {%k1}
5102; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm1 = ymm3[2,3,2,3]
5103; AVX512DQ-BW-NEXT:    movl $101455920, %ecx # imm = 0x60C1830
5104; AVX512DQ-BW-NEXT:    kmovd %ecx, %k1
5105; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm2, %ymm1 {%k1}
5106; AVX512DQ-BW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15]
5107; AVX512DQ-BW-NEXT:    # ymm2 = mem[0,1,0,1]
5108; AVX512DQ-BW-NEXT:    vpermw %ymm7, %ymm2, %ymm2
5109; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm5[27],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero
5110; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[27],zero,zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29]
5111; AVX512DQ-BW-NEXT:    vpor %ymm3, %ymm4, %ymm3
5112; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3]
5113; AVX512DQ-BW-NEXT:    movl $-2130574328, %ecx # imm = 0x81020408
5114; AVX512DQ-BW-NEXT:    kmovd %ecx, %k1
5115; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm2, %ymm3 {%k1}
5116; AVX512DQ-BW-NEXT:    movl $-507279602, %ecx # imm = 0xE1C3870E
5117; AVX512DQ-BW-NEXT:    kmovd %ecx, %k1
5118; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm3, %ymm1 {%k1}
5119; AVX512DQ-BW-NEXT:    vmovdqa %ymm1, 192(%rax)
5120; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm8, (%rax)
5121; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, 128(%rax)
5122; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, 64(%rax)
5123; AVX512DQ-BW-NEXT:    vzeroupper
5124; AVX512DQ-BW-NEXT:    retq
5125;
5126; AVX512DQ-BW-FCP-LABEL: store_i8_stride7_vf32:
5127; AVX512DQ-BW-FCP:       # %bb.0:
5128; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
5129; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
5130; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %ymm3
5131; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rsi), %ymm4
5132; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdx), %ymm1
5133; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rcx), %ymm2
5134; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[0,1,0,1,14],zero,ymm1[14,15,0,1,14,15],zero,ymm1[13,14,15,16,17,16],zero,ymm1[30,31,30,31,16,17],zero,ymm1[31,28,29,30,31]
5135; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,zero
5136; AVX512DQ-BW-FCP-NEXT:    vpor %ymm0, %ymm5, %ymm0
5137; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %xmm8
5138; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rsi), %xmm9
5139; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15]
5140; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
5141; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1]
5142; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm5, %zmm5
5143; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm3[0,1,14],zero,ymm3[12,13,0,1,14,15],zero,ymm3[3,12,13,2,3,16],zero,ymm3[30,31,28,29,16,17],zero,ymm3[31,18,19,28,29,18],zero
5144; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm4[14],zero,zero,zero,zero,zero,zero,ymm4[15],zero,zero,zero,zero,zero,zero,ymm4[16],zero,zero,zero,zero,zero,zero,ymm4[17],zero,zero,zero,zero,zero,zero,ymm4[18]
5145; AVX512DQ-BW-FCP-NEXT:    vpor %ymm0, %ymm6, %ymm0
5146; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdx), %xmm12
5147; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rcx), %xmm14
5148; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15]
5149; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
5150; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1]
5151; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm6, %zmm0
5152; AVX512DQ-BW-FCP-NEXT:    movabsq $435749858791416001, %rcx # imm = 0x60C1830183060C1
5153; AVX512DQ-BW-FCP-NEXT:    kmovq %rcx, %k1
5154; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm5, %zmm0 {%k1}
5155; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3]
5156; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm5[13],zero,zero,zero,zero,zero,zero,ymm5[14],zero,zero,zero,zero,zero,zero,ymm5[15],zero,zero,zero,zero,zero,zero,ymm5[16],zero,zero,zero,zero,zero,zero,ymm5[17],zero,zero,zero
5157; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = mem[0,1,2,3,0,1,2,3]
5158; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero
5159; AVX512DQ-BW-FCP-NEXT:    vpor %ymm7, %ymm10, %ymm7
5160; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%r9), %xmm11
5161; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%r8), %xmm13
5162; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15]
5163; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
5164; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1]
5165; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm10, %zmm10
5166; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = mem[0,1,2,3,0,1,2,3]
5167; AVX512DQ-BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6]
5168; AVX512DQ-BW-FCP-NEXT:    # ymm15 = mem[0,1,0,1]
5169; AVX512DQ-BW-FCP-NEXT:    vpermw %ymm7, %ymm15, %ymm15
5170; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm16 = ymm7[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31]
5171; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm16, %zmm15, %zmm15
5172; AVX512DQ-BW-FCP-NEXT:    movabsq $2323999253380730912, %rcx # imm = 0x2040810204081020
5173; AVX512DQ-BW-FCP-NEXT:    kmovq %rcx, %k1
5174; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm15, %zmm10 {%k1}
5175; AVX512DQ-BW-FCP-NEXT:    movabsq $4066998693416279096, %rcx # imm = 0x3870E1C3870E1C38
5176; AVX512DQ-BW-FCP-NEXT:    kmovq %rcx, %k1
5177; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm10, %zmm0 {%k1}
5178; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm10
5179; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm10 = zmm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,zmm10[19],zero,zmm10[21,20,21,22],zero,zmm10[20],zero,zmm10[22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57],zero,zmm10[55],zero,zmm10[53,54,55,58],zero,zmm10[56],zero,zmm10[60,61,58,59]
5180; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm4, %zmm15
5181; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm15 = zmm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm15[21],zero,zmm15[19],zero,zero,zero,zero,zmm15[22],zero,zmm15[20],zero,zero,zmm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm15[57],zero,zmm15[55],zero,zero,zero,zero,zmm15[58],zero,zmm15[56],zero,zero,zero,zero
5182; AVX512DQ-BW-FCP-NEXT:    vporq %zmm10, %zmm15, %zmm10
5183; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm15 = zmm10[2,3,2,3,6,7,6,7]
5184; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm1, %zmm10
5185; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm10 = zmm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zmm10[18,19,20,21],zero,zmm10[19],zero,zmm10[25,26,27,22],zero,zmm10[20],zero,zmm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm10[55],zero,zero,zero,zero,zmm10[58],zero,zmm10[56],zero,zero,zero,zero,zmm10[59],zero
5186; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm16
5187; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm16 = zmm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm16[18],zero,zero,zero,zero,zmm16[21],zero,zmm16[19],zero,zero,zero,zero,zmm16[22],zero,zmm16[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm16[55],zero,zero,zero,zero,zmm16[58],zero,zmm16[56],zero,zero,zero,zero,zmm16[59],zero,zmm16[57]
5188; AVX512DQ-BW-FCP-NEXT:    vporq %zmm10, %zmm16, %zmm10
5189; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm10 = zmm10[2,3,2,3,6,7,6,7]
5190; AVX512DQ-BW-FCP-NEXT:    movabsq $-9005497107459067808, %rcx # imm = 0x83060C180C183060
5191; AVX512DQ-BW-FCP-NEXT:    kmovq %rcx, %k1
5192; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm15, %zmm10 {%k1}
5193; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm15 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28]
5194; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm7, %zmm15, %zmm15
5195; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm16 = zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm6[18],zero,zmm6[20,21,20,21],zero,zmm6[19],zero,zmm6[19,20,21,22],zero,zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm6[55],zero,zmm6[55,56,57,58],zero,zmm6[56],zero,zmm6[62,63]
5196; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm17 = zmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm5[20],zero,zmm5[18],zero,zero,zero,zero,zmm5[21],zero,zmm5[19],zero,zero,zero,zero,zmm5[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm5[57],zero,zmm5[55],zero,zero,zero,zero,zmm5[58],zero,zmm5[56],zero,zero
5197; AVX512DQ-BW-FCP-NEXT:    vporq %zmm16, %zmm17, %zmm16
5198; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm16 = zmm16[2,3,2,3,6,7,6,7]
5199; AVX512DQ-BW-FCP-NEXT:    movabsq $1161999626690365456, %rcx # imm = 0x1020408102040810
5200; AVX512DQ-BW-FCP-NEXT:    kmovq %rcx, %k1
5201; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm15, %zmm16 {%k1}
5202; AVX512DQ-BW-FCP-NEXT:    movabsq $2033499346708139548, %rcx # imm = 0x1C3870E1C3870E1C
5203; AVX512DQ-BW-FCP-NEXT:    kmovq %rcx, %k1
5204; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm16, %zmm10 {%k1}
5205; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm14[u,u,u],zero,xmm14[7],zero,xmm14[5,u,u,u],zero,xmm14[8],zero,xmm14[6,u,u]
5206; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm12[u,u,u,7],zero,xmm12[5],zero,xmm12[u,u,u,8],zero,xmm12[6],zero,xmm12[u,u]
5207; AVX512DQ-BW-FCP-NEXT:    vporq %xmm15, %xmm16, %xmm15
5208; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7]
5209; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
5210; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm15, %zmm12, %zmm12
5211; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm12 = zmm12[0,1,0,1,4,5,4,5]
5212; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm9[u],zero,xmm9[7],zero,xmm9[5,u,u,u],zero,xmm9[8],zero,xmm9[6,u,u,u],zero
5213; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm8[u,7],zero,xmm8[5],zero,xmm8[u,u,u,8],zero,xmm8[6],zero,xmm8[u,u,u,9]
5214; AVX512DQ-BW-FCP-NEXT:    vpor %xmm14, %xmm15, %xmm14
5215; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
5216; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
5217; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm14, %zmm8, %zmm8
5218; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm8 = zmm8[0,1,0,1,4,5,4,5]
5219; AVX512DQ-BW-FCP-NEXT:    movabsq $871499720017774092, %rcx # imm = 0xC183060C183060C
5220; AVX512DQ-BW-FCP-NEXT:    kmovq %rcx, %k1
5221; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm12, %zmm8 {%k1}
5222; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = zero,xmm11[4,u,u,u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6]
5223; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm13[4],zero,xmm13[u,u,u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero
5224; AVX512DQ-BW-FCP-NEXT:    vpor %xmm9, %xmm12, %xmm9
5225; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7]
5226; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
5227; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm9, %zmm11, %zmm9
5228; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm9 = zmm9[0,1,0,1,4,5,4,5]
5229; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm11 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20]
5230; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm7, %zmm11, %zmm11
5231; AVX512DQ-BW-FCP-NEXT:    movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040
5232; AVX512DQ-BW-FCP-NEXT:    kmovq %rcx, %k1
5233; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm11, %zmm9 {%k1}
5234; AVX512DQ-BW-FCP-NEXT:    movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870
5235; AVX512DQ-BW-FCP-NEXT:    kmovq %rcx, %k1
5236; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm9, %zmm8 {%k1}
5237; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm3[28],zero,ymm3[30,31,30,31],zero,ymm3[29],zero,ymm3[31,28,29]
5238; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm4[30],zero,ymm4[28],zero,zero,zero,zero,ymm4[31],zero,ymm4[29],zero,zero,zero
5239; AVX512DQ-BW-FCP-NEXT:    vpor %ymm3, %ymm4, %ymm3
5240; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3]
5241; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero
5242; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero
5243; AVX512DQ-BW-FCP-NEXT:    vpor %ymm2, %ymm1, %ymm1
5244; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
5245; AVX512DQ-BW-FCP-NEXT:    movl $101455920, %ecx # imm = 0x60C1830
5246; AVX512DQ-BW-FCP-NEXT:    kmovd %ecx, %k1
5247; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm3, %ymm1 {%k1}
5248; AVX512DQ-BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15]
5249; AVX512DQ-BW-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
5250; AVX512DQ-BW-FCP-NEXT:    vpermw %ymm7, %ymm2, %ymm2
5251; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm5[27],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero
5252; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[27],zero,zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29]
5253; AVX512DQ-BW-FCP-NEXT:    vpor %ymm3, %ymm4, %ymm3
5254; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3]
5255; AVX512DQ-BW-FCP-NEXT:    movl $-2130574328, %ecx # imm = 0x81020408
5256; AVX512DQ-BW-FCP-NEXT:    kmovd %ecx, %k1
5257; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm2, %ymm3 {%k1}
5258; AVX512DQ-BW-FCP-NEXT:    movl $-507279602, %ecx # imm = 0xE1C3870E
5259; AVX512DQ-BW-FCP-NEXT:    kmovd %ecx, %k1
5260; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm3, %ymm1 {%k1}
5261; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm1, 192(%rax)
5262; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, (%rax)
5263; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, 128(%rax)
5264; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, 64(%rax)
5265; AVX512DQ-BW-FCP-NEXT:    vzeroupper
5266; AVX512DQ-BW-FCP-NEXT:    retq
5267  %in.vec0 = load <32 x i8>, ptr %in.vecptr0, align 64
5268  %in.vec1 = load <32 x i8>, ptr %in.vecptr1, align 64
5269  %in.vec2 = load <32 x i8>, ptr %in.vecptr2, align 64
5270  %in.vec3 = load <32 x i8>, ptr %in.vecptr3, align 64
5271  %in.vec4 = load <32 x i8>, ptr %in.vecptr4, align 64
5272  %in.vec5 = load <32 x i8>, ptr %in.vecptr5, align 64
5273  %in.vec6 = load <32 x i8>, ptr %in.vecptr6, align 64
5274  %1 = shufflevector <32 x i8> %in.vec0, <32 x i8> %in.vec1, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
5275  %2 = shufflevector <32 x i8> %in.vec2, <32 x i8> %in.vec3, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
5276  %3 = shufflevector <32 x i8> %in.vec4, <32 x i8> %in.vec5, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
5277  %4 = shufflevector <64 x i8> %1, <64 x i8> %2, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
5278  %5 = shufflevector <32 x i8> %in.vec6, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
5279  %6 = shufflevector <64 x i8> %3, <64 x i8> %5, <96 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95>
5280  %7 = shufflevector <96 x i8> %6, <96 x i8> poison, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
5281  %8 = shufflevector <128 x i8> %4, <128 x i8> %7, <224 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223>
5282  %interleaved.vec = shufflevector <224 x i8> %8, <224 x i8> poison, <224 x i32> <i32 0, i32 32, i32 64, i32 96, i32 128, i32 160, i32 192, i32 1, i32 33, i32 65, i32 97, i32 129, i32 161, i32 193, i32 2, i32 34, i32 66, i32 98, i32 130, i32 162, i32 194, i32 3, i32 35, i32 67, i32 99, i32 131, i32 163, i32 195, i32 4, i32 36, i32 68, i32 100, i32 132, i32 164, i32 196, i32 5, i32 37, i32 69, i32 101, i32 133, i32 165, i32 197, i32 6, i32 38, i32 70, i32 102, i32 134, i32 166, i32 198, i32 7, i32 39, i32 71, i32 103, i32 135, i32 167, i32 199, i32 8, i32 40, i32 72, i32 104, i32 136, i32 168, i32 200, i32 9, i32 41, i32 73, i32 105, i32 137, i32 169, i32 201, i32 10, i32 42, i32 74, i32 106, i32 138, i32 170, i32 202, i32 11, i32 43, i32 75, i32 107, i32 139, i32 171, i32 203, i32 12, i32 44, i32 76, i32 108, i32 140, i32 172, i32 204, i32 13, i32 45, i32 77, i32 109, i32 141, i32 173, i32 205, i32 14, i32 46, i32 78, i32 110, i32 142, i32 174, i32 206, i32 15, i32 47, i32 79, i32 111, i32 143, i32 175, i32 207, i32 16, i32 48, i32 80, i32 112, i32 144, i32 176, i32 208, i32 17, i32 49, i32 81, i32 113, i32 145, i32 177, i32 209, i32 18, i32 50, i32 82, i32 114, i32 146, i32 178, i32 210, i32 19, i32 51, i32 83, i32 115, i32 147, i32 179, i32 211, i32 20, i32 52, i32 84, i32 116, i32 148, i32 180, i32 212, i32 21, i32 53, i32 85, i32 117, i32 149, i32 181, i32 213, i32 22, i32 54, i32 86, i32 118, i32 150, i32 182, i32 214, i32 23, i32 55, i32 87, i32 119, i32 151, i32 183, i32 215, i32 24, i32 56, i32 88, i32 120, i32 152, i32 184, i32 216, i32 25, i32 57, i32 89, i32 121, i32 153, i32 185, i32 217, i32 26, i32 58, i32 90, i32 122, i32 154, i32 186, i32 218, i32 27, i32 59, i32 91, i32 123, i32 155, i32 187, i32 219, i32 28, i32 60, i32 92, i32 124, i32 156, i32 188, i32 220, i32 29, i32 61, i32 93, i32 125, i32 157, i32 189, i32 221, i32 30, i32 62, i32 94, i32 126, i32 158, i32 190, i32 222, i32 31, i32 63, i32 95, i32 127, i32 159, i32 191, i32 223>
5283  store <224 x i8> %interleaved.vec, ptr %out.vec, align 64
5284  ret void
5285}
5286
5287define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind {
5288; SSE-LABEL: store_i8_stride7_vf64:
5289; SSE:       # %bb.0:
5290; SSE-NEXT:    subq $648, %rsp # imm = 0x288
5291; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
5292; SSE-NEXT:    movdqa 48(%rdi), %xmm14
5293; SSE-NEXT:    movdqa 48(%rsi), %xmm11
5294; SSE-NEXT:    movdqa 48(%rdx), %xmm3
5295; SSE-NEXT:    movdqa 48(%rcx), %xmm10
5296; SSE-NEXT:    movdqa 48(%r8), %xmm9
5297; SSE-NEXT:    movdqa 48(%r9), %xmm8
5298; SSE-NEXT:    movdqa 48(%rax), %xmm13
5299; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm14[3,3,3,3,4,5,6,7]
5300; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
5301; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255]
5302; SSE-NEXT:    pand %xmm2, %xmm0
5303; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm11[2,1,2,3]
5304; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5305; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
5306; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,1,0,3]
5307; SSE-NEXT:    pandn %xmm1, %xmm2
5308; SSE-NEXT:    por %xmm0, %xmm2
5309; SSE-NEXT:    movdqa {{.*#+}} xmm12 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255]
5310; SSE-NEXT:    pand %xmm12, %xmm2
5311; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm3[3,3,3,3,4,5,6,7]
5312; SSE-NEXT:    movdqa %xmm3, %xmm6
5313; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
5314; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255]
5315; SSE-NEXT:    pand %xmm4, %xmm0
5316; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm10[2,1,2,3]
5317; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5318; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
5319; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,3,0,3]
5320; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,3,3,3,4,5,6,7]
5321; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4]
5322; SSE-NEXT:    pandn %xmm3, %xmm4
5323; SSE-NEXT:    por %xmm0, %xmm4
5324; SSE-NEXT:    movdqa %xmm12, %xmm0
5325; SSE-NEXT:    pandn %xmm4, %xmm0
5326; SSE-NEXT:    por %xmm2, %xmm0
5327; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255]
5328; SSE-NEXT:    pand %xmm1, %xmm0
5329; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm9[3,3,3,3,4,5,6,7]
5330; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,2,2]
5331; SSE-NEXT:    movdqa %xmm1, %xmm3
5332; SSE-NEXT:    movdqa %xmm1, %xmm5
5333; SSE-NEXT:    pandn %xmm2, %xmm3
5334; SSE-NEXT:    por %xmm0, %xmm3
5335; SSE-NEXT:    movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255]
5336; SSE-NEXT:    pand %xmm7, %xmm3
5337; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm8[2,1,2,3]
5338; SSE-NEXT:    movdqa %xmm8, %xmm1
5339; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5340; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
5341; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0]
5342; SSE-NEXT:    movdqa %xmm7, %xmm4
5343; SSE-NEXT:    pandn %xmm0, %xmm4
5344; SSE-NEXT:    por %xmm3, %xmm4
5345; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255]
5346; SSE-NEXT:    pand %xmm2, %xmm4
5347; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm13[3,3,3,3,4,5,6,7]
5348; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
5349; SSE-NEXT:    movdqa %xmm2, %xmm3
5350; SSE-NEXT:    pandn %xmm0, %xmm3
5351; SSE-NEXT:    por %xmm4, %xmm3
5352; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5353; SSE-NEXT:    movdqa %xmm11, %xmm0
5354; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm11[8],xmm0[9],xmm11[9],xmm0[10],xmm11[10],xmm0[11],xmm11[11],xmm0[12],xmm11[12],xmm0[13],xmm11[13],xmm0[14],xmm11[14],xmm0[15],xmm11[15]
5355; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,5,5,7]
5356; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
5357; SSE-NEXT:    movdqa %xmm5, %xmm4
5358; SSE-NEXT:    pandn %xmm3, %xmm5
5359; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm14[0,1,2,3,6,6,6,6]
5360; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2]
5361; SSE-NEXT:    pand %xmm4, %xmm3
5362; SSE-NEXT:    por %xmm3, %xmm5
5363; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255]
5364; SSE-NEXT:    movdqa %xmm4, %xmm3
5365; SSE-NEXT:    pandn %xmm5, %xmm3
5366; SSE-NEXT:    pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,6,6,6,6]
5367; SSE-NEXT:    movdqa %xmm6, %xmm15
5368; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2]
5369; SSE-NEXT:    movdqa %xmm7, %xmm6
5370; SSE-NEXT:    pandn %xmm5, %xmm6
5371; SSE-NEXT:    movdqa %xmm10, %xmm5
5372; SSE-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm10[8],xmm5[9],xmm10[9],xmm5[10],xmm10[10],xmm5[11],xmm10[11],xmm5[12],xmm10[12],xmm5[13],xmm10[13],xmm5[14],xmm10[14],xmm5[15],xmm10[15]
5373; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm5[2,1,2,3]
5374; SSE-NEXT:    pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,0,4,5,6,7]
5375; SSE-NEXT:    pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5]
5376; SSE-NEXT:    pand %xmm7, %xmm8
5377; SSE-NEXT:    por %xmm6, %xmm8
5378; SSE-NEXT:    pand %xmm4, %xmm8
5379; SSE-NEXT:    por %xmm3, %xmm8
5380; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm9[0,1,2,3,5,6,6,7]
5381; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,1,3,2]
5382; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0]
5383; SSE-NEXT:    movdqa %xmm4, %xmm6
5384; SSE-NEXT:    pandn %xmm3, %xmm6
5385; SSE-NEXT:    pand %xmm4, %xmm8
5386; SSE-NEXT:    por %xmm8, %xmm6
5387; SSE-NEXT:    movdqa %xmm1, %xmm3
5388; SSE-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
5389; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm3[1,1,2,3]
5390; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255]
5391; SSE-NEXT:    movdqa %xmm4, %xmm11
5392; SSE-NEXT:    pandn %xmm8, %xmm11
5393; SSE-NEXT:    pand %xmm4, %xmm6
5394; SSE-NEXT:    por %xmm6, %xmm11
5395; SSE-NEXT:    movdqa %xmm13, %xmm10
5396; SSE-NEXT:    pshufhw {{.*#+}} xmm6 = xmm13[0,1,2,3,4,5,6,6]
5397; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[2,1,3,3]
5398; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255]
5399; SSE-NEXT:    movdqa %xmm1, %xmm8
5400; SSE-NEXT:    pandn %xmm6, %xmm8
5401; SSE-NEXT:    pand %xmm1, %xmm11
5402; SSE-NEXT:    por %xmm11, %xmm8
5403; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5404; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,1,1,3]
5405; SSE-NEXT:    movdqa %xmm4, %xmm8
5406; SSE-NEXT:    pandn %xmm6, %xmm8
5407; SSE-NEXT:    pshufhw {{.*#+}} xmm6 = xmm15[0,1,2,3,4,5,5,7]
5408; SSE-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5409; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[2,1,3,2]
5410; SSE-NEXT:    pand %xmm4, %xmm6
5411; SSE-NEXT:    por %xmm8, %xmm6
5412; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0]
5413; SSE-NEXT:    movdqa %xmm1, %xmm8
5414; SSE-NEXT:    pandn %xmm6, %xmm8
5415; SSE-NEXT:    movdqa %xmm14, %xmm13
5416; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5417; SSE-NEXT:    pshufhw {{.*#+}} xmm6 = xmm14[0,1,2,3,5,5,5,5]
5418; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[2,2,2,2]
5419; SSE-NEXT:    movdqa %xmm7, %xmm11
5420; SSE-NEXT:    pandn %xmm6, %xmm11
5421; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm0[1,2,2,3,4,5,6,7]
5422; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1]
5423; SSE-NEXT:    pand %xmm7, %xmm6
5424; SSE-NEXT:    por %xmm11, %xmm6
5425; SSE-NEXT:    pand %xmm1, %xmm6
5426; SSE-NEXT:    por %xmm8, %xmm6
5427; SSE-NEXT:    pshufhw {{.*#+}} xmm8 = xmm9[0,1,2,3,4,4,6,5]
5428; SSE-NEXT:    movdqa %xmm9, %xmm1
5429; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5430; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[2,1,3,3]
5431; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255]
5432; SSE-NEXT:    movdqa %xmm4, %xmm11
5433; SSE-NEXT:    pandn %xmm8, %xmm11
5434; SSE-NEXT:    pand %xmm4, %xmm6
5435; SSE-NEXT:    por %xmm6, %xmm11
5436; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm3[1,2,2,3,4,5,6,7]
5437; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0]
5438; SSE-NEXT:    movdqa {{.*#+}} xmm9 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255]
5439; SSE-NEXT:    movdqa %xmm9, %xmm8
5440; SSE-NEXT:    pandn %xmm6, %xmm8
5441; SSE-NEXT:    pand %xmm9, %xmm11
5442; SSE-NEXT:    movdqa %xmm9, %xmm14
5443; SSE-NEXT:    por %xmm11, %xmm8
5444; SSE-NEXT:    pshufhw {{.*#+}} xmm6 = xmm10[0,1,2,3,4,5,5,7]
5445; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5446; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
5447; SSE-NEXT:    movdqa {{.*#+}} xmm11 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255]
5448; SSE-NEXT:    movdqa %xmm11, %xmm9
5449; SSE-NEXT:    pandn %xmm6, %xmm9
5450; SSE-NEXT:    pand %xmm11, %xmm8
5451; SSE-NEXT:    por %xmm8, %xmm9
5452; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5453; SSE-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,7,7]
5454; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
5455; SSE-NEXT:    movdqa %xmm11, %xmm6
5456; SSE-NEXT:    pandn %xmm5, %xmm6
5457; SSE-NEXT:    pshufhw {{.*#+}} xmm5 = xmm15[0,1,2,3,7,7,7,7]
5458; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2]
5459; SSE-NEXT:    pand %xmm11, %xmm5
5460; SSE-NEXT:    por %xmm5, %xmm6
5461; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255]
5462; SSE-NEXT:    movdqa %xmm8, %xmm5
5463; SSE-NEXT:    pandn %xmm6, %xmm5
5464; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
5465; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
5466; SSE-NEXT:    movdqa %xmm4, %xmm6
5467; SSE-NEXT:    pandn %xmm0, %xmm6
5468; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,7,7,7,7]
5469; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
5470; SSE-NEXT:    pand %xmm4, %xmm0
5471; SSE-NEXT:    por %xmm0, %xmm6
5472; SSE-NEXT:    pand %xmm8, %xmm6
5473; SSE-NEXT:    por %xmm5, %xmm6
5474; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,7,7,7,7]
5475; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
5476; SSE-NEXT:    movdqa %xmm7, %xmm5
5477; SSE-NEXT:    pandn %xmm0, %xmm5
5478; SSE-NEXT:    pand %xmm7, %xmm6
5479; SSE-NEXT:    por %xmm6, %xmm5
5480; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,5,6,6,7]
5481; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,3]
5482; SSE-NEXT:    movdqa %xmm2, %xmm3
5483; SSE-NEXT:    pandn %xmm0, %xmm3
5484; SSE-NEXT:    pand %xmm2, %xmm5
5485; SSE-NEXT:    por %xmm5, %xmm3
5486; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,6,7,7,7]
5487; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2]
5488; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0]
5489; SSE-NEXT:    movdqa %xmm5, %xmm1
5490; SSE-NEXT:    pandn %xmm0, %xmm1
5491; SSE-NEXT:    pand %xmm5, %xmm3
5492; SSE-NEXT:    por %xmm3, %xmm1
5493; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5494; SSE-NEXT:    movdqa (%rsi), %xmm0
5495; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5496; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
5497; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
5498; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,0,3]
5499; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255]
5500; SSE-NEXT:    movdqa %xmm4, %xmm3
5501; SSE-NEXT:    pandn %xmm0, %xmm3
5502; SSE-NEXT:    movdqa (%rdi), %xmm0
5503; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5504; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
5505; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
5506; SSE-NEXT:    pand %xmm4, %xmm0
5507; SSE-NEXT:    movdqa %xmm4, %xmm11
5508; SSE-NEXT:    por %xmm0, %xmm3
5509; SSE-NEXT:    movdqa (%rcx), %xmm0
5510; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5511; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
5512; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
5513; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,0,3]
5514; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
5515; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
5516; SSE-NEXT:    movdqa %xmm14, %xmm5
5517; SSE-NEXT:    pandn %xmm0, %xmm5
5518; SSE-NEXT:    movdqa (%rdx), %xmm0
5519; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5520; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
5521; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
5522; SSE-NEXT:    pand %xmm14, %xmm0
5523; SSE-NEXT:    movdqa %xmm14, %xmm9
5524; SSE-NEXT:    por %xmm0, %xmm5
5525; SSE-NEXT:    movdqa %xmm12, %xmm0
5526; SSE-NEXT:    pandn %xmm5, %xmm0
5527; SSE-NEXT:    pand %xmm12, %xmm3
5528; SSE-NEXT:    por %xmm3, %xmm0
5529; SSE-NEXT:    movdqa (%r9), %xmm15
5530; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm15[2,1,2,3]
5531; SSE-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5532; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
5533; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,3,2,0]
5534; SSE-NEXT:    movdqa %xmm7, %xmm6
5535; SSE-NEXT:    pandn %xmm3, %xmm6
5536; SSE-NEXT:    movdqa (%r8), %xmm8
5537; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm8[3,3,3,3,4,5,6,7]
5538; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5539; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4]
5540; SSE-NEXT:    pand %xmm7, %xmm3
5541; SSE-NEXT:    por %xmm3, %xmm6
5542; SSE-NEXT:    movdqa (%rax), %xmm4
5543; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm4[3,3,3,3,4,5,6,7]
5544; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5545; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4]
5546; SSE-NEXT:    movdqa %xmm2, %xmm14
5547; SSE-NEXT:    pandn %xmm3, %xmm14
5548; SSE-NEXT:    pand %xmm2, %xmm6
5549; SSE-NEXT:    por %xmm6, %xmm14
5550; SSE-NEXT:    movdqa {{.*#+}} xmm10 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255]
5551; SSE-NEXT:    movdqa %xmm10, %xmm1
5552; SSE-NEXT:    pandn %xmm14, %xmm1
5553; SSE-NEXT:    pand %xmm10, %xmm0
5554; SSE-NEXT:    por %xmm0, %xmm1
5555; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5556; SSE-NEXT:    movdqa 16(%rsi), %xmm0
5557; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5558; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
5559; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
5560; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,0,3]
5561; SSE-NEXT:    movdqa %xmm11, %xmm3
5562; SSE-NEXT:    pandn %xmm0, %xmm3
5563; SSE-NEXT:    movdqa 16(%rdi), %xmm0
5564; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5565; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
5566; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
5567; SSE-NEXT:    pand %xmm11, %xmm0
5568; SSE-NEXT:    por %xmm0, %xmm3
5569; SSE-NEXT:    movdqa 16(%rcx), %xmm0
5570; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5571; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
5572; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
5573; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,0,3]
5574; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
5575; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
5576; SSE-NEXT:    movdqa %xmm9, %xmm6
5577; SSE-NEXT:    pandn %xmm0, %xmm6
5578; SSE-NEXT:    movdqa 16(%rdx), %xmm0
5579; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5580; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
5581; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
5582; SSE-NEXT:    pand %xmm9, %xmm0
5583; SSE-NEXT:    por %xmm0, %xmm6
5584; SSE-NEXT:    movdqa %xmm12, %xmm0
5585; SSE-NEXT:    pandn %xmm6, %xmm0
5586; SSE-NEXT:    pand %xmm12, %xmm3
5587; SSE-NEXT:    por %xmm3, %xmm0
5588; SSE-NEXT:    movdqa 16(%r9), %xmm1
5589; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5590; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,1,2,3]
5591; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
5592; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,3,2,0]
5593; SSE-NEXT:    movdqa %xmm7, %xmm6
5594; SSE-NEXT:    pandn %xmm3, %xmm6
5595; SSE-NEXT:    movdqa 16(%r8), %xmm1
5596; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5597; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7]
5598; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4]
5599; SSE-NEXT:    pand %xmm7, %xmm3
5600; SSE-NEXT:    por %xmm3, %xmm6
5601; SSE-NEXT:    movdqa 16(%rax), %xmm1
5602; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5603; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7]
5604; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4]
5605; SSE-NEXT:    movdqa %xmm2, %xmm14
5606; SSE-NEXT:    pandn %xmm3, %xmm14
5607; SSE-NEXT:    pand %xmm2, %xmm6
5608; SSE-NEXT:    por %xmm6, %xmm14
5609; SSE-NEXT:    movdqa %xmm10, %xmm1
5610; SSE-NEXT:    pandn %xmm14, %xmm1
5611; SSE-NEXT:    pand %xmm10, %xmm0
5612; SSE-NEXT:    por %xmm0, %xmm1
5613; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5614; SSE-NEXT:    movdqa 32(%rsi), %xmm0
5615; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5616; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
5617; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
5618; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,1,0,3]
5619; SSE-NEXT:    movdqa %xmm11, %xmm0
5620; SSE-NEXT:    pandn %xmm3, %xmm0
5621; SSE-NEXT:    movdqa 32(%rdi), %xmm1
5622; SSE-NEXT:    movdqa %xmm1, (%rsp) # 16-byte Spill
5623; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7]
5624; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4]
5625; SSE-NEXT:    pand %xmm11, %xmm3
5626; SSE-NEXT:    por %xmm3, %xmm0
5627; SSE-NEXT:    movdqa 32(%rcx), %xmm1
5628; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5629; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,1,2,3]
5630; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
5631; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,3,0,3]
5632; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,3,3,3,4,5,6,7]
5633; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4]
5634; SSE-NEXT:    movdqa %xmm9, %xmm5
5635; SSE-NEXT:    movdqa %xmm9, %xmm6
5636; SSE-NEXT:    pandn %xmm3, %xmm6
5637; SSE-NEXT:    movdqa 32(%rdx), %xmm9
5638; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm9[3,3,3,3,4,5,6,7]
5639; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5640; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4]
5641; SSE-NEXT:    pand %xmm5, %xmm3
5642; SSE-NEXT:    por %xmm3, %xmm6
5643; SSE-NEXT:    pand %xmm12, %xmm0
5644; SSE-NEXT:    pandn %xmm6, %xmm12
5645; SSE-NEXT:    por %xmm0, %xmm12
5646; SSE-NEXT:    movdqa 32(%r9), %xmm0
5647; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5648; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
5649; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
5650; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0]
5651; SSE-NEXT:    movdqa %xmm7, %xmm3
5652; SSE-NEXT:    pandn %xmm0, %xmm3
5653; SSE-NEXT:    movdqa 32(%r8), %xmm11
5654; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm11[3,3,3,3,4,5,6,7]
5655; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5656; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
5657; SSE-NEXT:    pand %xmm7, %xmm0
5658; SSE-NEXT:    por %xmm0, %xmm3
5659; SSE-NEXT:    pand %xmm2, %xmm3
5660; SSE-NEXT:    movdqa 32(%rax), %xmm13
5661; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm13[3,3,3,3,4,5,6,7]
5662; SSE-NEXT:    movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5663; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
5664; SSE-NEXT:    pandn %xmm0, %xmm2
5665; SSE-NEXT:    por %xmm3, %xmm2
5666; SSE-NEXT:    pand %xmm10, %xmm12
5667; SSE-NEXT:    pandn %xmm2, %xmm10
5668; SSE-NEXT:    por %xmm12, %xmm10
5669; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5670; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5671; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
5672; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5673; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,7]
5674; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
5675; SSE-NEXT:    movdqa {{.*#+}} xmm12 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255]
5676; SSE-NEXT:    movdqa %xmm12, %xmm1
5677; SSE-NEXT:    pandn %xmm0, %xmm1
5678; SSE-NEXT:    pshufhw $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
5679; SSE-NEXT:    # xmm0 = mem[0,1,2,3,6,6,6,6]
5680; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
5681; SSE-NEXT:    pand %xmm12, %xmm0
5682; SSE-NEXT:    por %xmm0, %xmm1
5683; SSE-NEXT:    movdqa {{.*#+}} xmm14 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255]
5684; SSE-NEXT:    movdqa %xmm14, %xmm0
5685; SSE-NEXT:    pandn %xmm1, %xmm0
5686; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
5687; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,6,6,6,6]
5688; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
5689; SSE-NEXT:    movdqa %xmm7, %xmm2
5690; SSE-NEXT:    pandn %xmm1, %xmm2
5691; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
5692; SSE-NEXT:    movdqa %xmm5, %xmm1
5693; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15]
5694; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5695; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
5696; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7]
5697; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
5698; SSE-NEXT:    pand %xmm7, %xmm1
5699; SSE-NEXT:    por %xmm2, %xmm1
5700; SSE-NEXT:    pand %xmm14, %xmm1
5701; SSE-NEXT:    por %xmm0, %xmm1
5702; SSE-NEXT:    punpckhbw {{.*#+}} xmm15 = xmm15[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
5703; SSE-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5704; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm15[1,1,2,3]
5705; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255]
5706; SSE-NEXT:    movdqa %xmm0, %xmm3
5707; SSE-NEXT:    pandn %xmm2, %xmm3
5708; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,5,6,6,7]
5709; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,1,3,2]
5710; SSE-NEXT:    pand %xmm0, %xmm2
5711; SSE-NEXT:    por %xmm3, %xmm2
5712; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5,6,6]
5713; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3]
5714; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255]
5715; SSE-NEXT:    movdqa %xmm4, %xmm15
5716; SSE-NEXT:    pandn %xmm3, %xmm15
5717; SSE-NEXT:    pand %xmm4, %xmm2
5718; SSE-NEXT:    por %xmm2, %xmm15
5719; SSE-NEXT:    movdqa {{.*#+}} xmm10 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0]
5720; SSE-NEXT:    movdqa %xmm10, %xmm0
5721; SSE-NEXT:    pandn %xmm15, %xmm0
5722; SSE-NEXT:    pand %xmm10, %xmm1
5723; SSE-NEXT:    por %xmm1, %xmm0
5724; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5725; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5726; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
5727; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5728; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,7]
5729; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
5730; SSE-NEXT:    movdqa %xmm12, %xmm8
5731; SSE-NEXT:    movdqa %xmm12, %xmm2
5732; SSE-NEXT:    pandn %xmm1, %xmm2
5733; SSE-NEXT:    pshufhw $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
5734; SSE-NEXT:    # xmm1 = mem[0,1,2,3,6,6,6,6]
5735; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
5736; SSE-NEXT:    pand %xmm12, %xmm1
5737; SSE-NEXT:    por %xmm1, %xmm2
5738; SSE-NEXT:    movdqa %xmm14, %xmm12
5739; SSE-NEXT:    movdqa %xmm14, %xmm3
5740; SSE-NEXT:    pandn %xmm2, %xmm3
5741; SSE-NEXT:    pshufhw $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
5742; SSE-NEXT:    # xmm1 = mem[0,1,2,3,6,6,6,6]
5743; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
5744; SSE-NEXT:    movdqa %xmm7, %xmm2
5745; SSE-NEXT:    pandn %xmm1, %xmm2
5746; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5747; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
5748; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5749; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,1,2,3]
5750; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7]
5751; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
5752; SSE-NEXT:    pand %xmm7, %xmm1
5753; SSE-NEXT:    por %xmm2, %xmm1
5754; SSE-NEXT:    pand %xmm14, %xmm1
5755; SSE-NEXT:    por %xmm3, %xmm1
5756; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5757; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
5758; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5759; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
5760; SSE-NEXT:    movdqa {{.*#+}} xmm14 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255]
5761; SSE-NEXT:    movdqa %xmm14, %xmm3
5762; SSE-NEXT:    pandn %xmm2, %xmm3
5763; SSE-NEXT:    pshufhw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
5764; SSE-NEXT:    # xmm2 = mem[0,1,2,3,5,6,6,7]
5765; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,1,3,2]
5766; SSE-NEXT:    pand %xmm14, %xmm2
5767; SSE-NEXT:    por %xmm3, %xmm2
5768; SSE-NEXT:    pshufhw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
5769; SSE-NEXT:    # xmm3 = mem[0,1,2,3,4,5,6,6]
5770; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3]
5771; SSE-NEXT:    movdqa %xmm4, %xmm15
5772; SSE-NEXT:    pandn %xmm3, %xmm15
5773; SSE-NEXT:    pand %xmm4, %xmm2
5774; SSE-NEXT:    por %xmm2, %xmm15
5775; SSE-NEXT:    movdqa %xmm10, %xmm0
5776; SSE-NEXT:    pandn %xmm15, %xmm0
5777; SSE-NEXT:    pand %xmm10, %xmm1
5778; SSE-NEXT:    por %xmm1, %xmm0
5779; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5780; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5781; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
5782; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5783; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,7]
5784; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
5785; SSE-NEXT:    movdqa %xmm8, %xmm2
5786; SSE-NEXT:    pandn %xmm1, %xmm2
5787; SSE-NEXT:    pshufhw $170, (%rsp), %xmm1 # 16-byte Folded Reload
5788; SSE-NEXT:    # xmm1 = mem[0,1,2,3,6,6,6,6]
5789; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
5790; SSE-NEXT:    pand %xmm8, %xmm1
5791; SSE-NEXT:    por %xmm1, %xmm2
5792; SSE-NEXT:    movdqa %xmm12, %xmm3
5793; SSE-NEXT:    pandn %xmm2, %xmm3
5794; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,6,6,6,6]
5795; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
5796; SSE-NEXT:    movdqa %xmm7, %xmm2
5797; SSE-NEXT:    pandn %xmm1, %xmm2
5798; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5799; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
5800; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5801; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,1,2,3]
5802; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7]
5803; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
5804; SSE-NEXT:    pand %xmm7, %xmm1
5805; SSE-NEXT:    por %xmm2, %xmm1
5806; SSE-NEXT:    pand %xmm12, %xmm1
5807; SSE-NEXT:    por %xmm3, %xmm1
5808; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5809; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
5810; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5811; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
5812; SSE-NEXT:    movdqa %xmm14, %xmm3
5813; SSE-NEXT:    pandn %xmm2, %xmm3
5814; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,5,6,6,7]
5815; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,1,3,2]
5816; SSE-NEXT:    pand %xmm14, %xmm2
5817; SSE-NEXT:    por %xmm3, %xmm2
5818; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,4,5,6,6]
5819; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3]
5820; SSE-NEXT:    movdqa %xmm4, %xmm15
5821; SSE-NEXT:    pandn %xmm3, %xmm15
5822; SSE-NEXT:    pand %xmm4, %xmm2
5823; SSE-NEXT:    por %xmm2, %xmm15
5824; SSE-NEXT:    pand %xmm10, %xmm1
5825; SSE-NEXT:    pandn %xmm15, %xmm10
5826; SSE-NEXT:    por %xmm1, %xmm10
5827; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5828; SSE-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
5829; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm5[0,0,2,1,4,5,6,7]
5830; SSE-NEXT:    movdqa %xmm5, %xmm14
5831; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
5832; SSE-NEXT:    movdqa %xmm4, %xmm2
5833; SSE-NEXT:    pandn %xmm1, %xmm2
5834; SSE-NEXT:    movdqa %xmm6, %xmm8
5835; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm6[0,0,0,0,4,5,6,7]
5836; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
5837; SSE-NEXT:    pand %xmm4, %xmm1
5838; SSE-NEXT:    por %xmm1, %xmm2
5839; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255]
5840; SSE-NEXT:    movdqa %xmm0, %xmm15
5841; SSE-NEXT:    pandn %xmm2, %xmm15
5842; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5843; SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
5844; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm4[0,2,1,3,4,5,6,7]
5845; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,1,1,0]
5846; SSE-NEXT:    movdqa {{.*#+}} xmm9 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0]
5847; SSE-NEXT:    movdqa %xmm9, %xmm1
5848; SSE-NEXT:    pandn %xmm2, %xmm1
5849; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5850; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm3[0,0,2,1,4,5,6,7]
5851; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1]
5852; SSE-NEXT:    pand %xmm9, %xmm2
5853; SSE-NEXT:    movdqa %xmm9, %xmm6
5854; SSE-NEXT:    por %xmm2, %xmm1
5855; SSE-NEXT:    pand %xmm0, %xmm1
5856; SSE-NEXT:    por %xmm15, %xmm1
5857; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
5858; SSE-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
5859; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm5[0,1,1,3,4,5,6,7]
5860; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1]
5861; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255]
5862; SSE-NEXT:    movdqa %xmm0, %xmm15
5863; SSE-NEXT:    pandn %xmm2, %xmm15
5864; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
5865; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm13[0,0,0,0,4,5,6,7]
5866; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
5867; SSE-NEXT:    pand %xmm0, %xmm2
5868; SSE-NEXT:    por %xmm2, %xmm15
5869; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
5870; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm11[0,0,0,0,4,5,6,7]
5871; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
5872; SSE-NEXT:    movdqa %xmm7, %xmm10
5873; SSE-NEXT:    pandn %xmm2, %xmm10
5874; SSE-NEXT:    pand %xmm7, %xmm15
5875; SSE-NEXT:    por %xmm15, %xmm10
5876; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255]
5877; SSE-NEXT:    movdqa %xmm2, %xmm0
5878; SSE-NEXT:    pandn %xmm10, %xmm0
5879; SSE-NEXT:    pand %xmm2, %xmm1
5880; SSE-NEXT:    por %xmm1, %xmm0
5881; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5882; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[0,1,2,2]
5883; SSE-NEXT:    movdqa %xmm7, %xmm2
5884; SSE-NEXT:    pandn %xmm1, %xmm2
5885; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm3[1,1,2,3,4,5,6,7]
5886; SSE-NEXT:    movdqa %xmm3, %xmm9
5887; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm1[0,0,2,1]
5888; SSE-NEXT:    pand %xmm7, %xmm10
5889; SSE-NEXT:    por %xmm2, %xmm10
5890; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
5891; SSE-NEXT:    movdqa %xmm0, %xmm15
5892; SSE-NEXT:    movdqa %xmm0, %xmm12
5893; SSE-NEXT:    pandn %xmm10, %xmm15
5894; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm14[1,1,2,1]
5895; SSE-NEXT:    pshufhw {{.*#+}} xmm10 = xmm2[0,1,2,3,7,5,6,4]
5896; SSE-NEXT:    movdqa %xmm6, %xmm0
5897; SSE-NEXT:    movdqa %xmm6, %xmm2
5898; SSE-NEXT:    pandn %xmm10, %xmm2
5899; SSE-NEXT:    pshuflw {{.*#+}} xmm10 = xmm8[1,1,2,2,4,5,6,7]
5900; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[0,0,2,1]
5901; SSE-NEXT:    pand %xmm0, %xmm10
5902; SSE-NEXT:    por %xmm10, %xmm2
5903; SSE-NEXT:    pand %xmm12, %xmm2
5904; SSE-NEXT:    por %xmm15, %xmm2
5905; SSE-NEXT:    pshuflw {{.*#+}} xmm10 = xmm5[0,2,2,3,4,5,6,7]
5906; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[0,1,1,3]
5907; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255]
5908; SSE-NEXT:    movdqa %xmm0, %xmm15
5909; SSE-NEXT:    pandn %xmm10, %xmm15
5910; SSE-NEXT:    movdqa %xmm13, %xmm3
5911; SSE-NEXT:    pshuflw {{.*#+}} xmm10 = xmm13[1,1,1,1,4,5,6,7]
5912; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[0,0,0,0]
5913; SSE-NEXT:    pand %xmm0, %xmm10
5914; SSE-NEXT:    por %xmm10, %xmm15
5915; SSE-NEXT:    pshuflw {{.*#+}} xmm10 = xmm11[1,1,1,1,4,5,6,7]
5916; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[0,0,0,0]
5917; SSE-NEXT:    movdqa {{.*#+}} xmm13 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255]
5918; SSE-NEXT:    movdqa %xmm13, %xmm0
5919; SSE-NEXT:    pandn %xmm10, %xmm0
5920; SSE-NEXT:    pand %xmm13, %xmm15
5921; SSE-NEXT:    por %xmm15, %xmm0
5922; SSE-NEXT:    movdqa {{.*#+}} xmm10 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
5923; SSE-NEXT:    movdqa %xmm10, %xmm1
5924; SSE-NEXT:    pandn %xmm0, %xmm1
5925; SSE-NEXT:    pand %xmm10, %xmm2
5926; SSE-NEXT:    por %xmm2, %xmm1
5927; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5928; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3]
5929; SSE-NEXT:    movdqa %xmm7, %xmm2
5930; SSE-NEXT:    pandn %xmm0, %xmm2
5931; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm8[2,1,3,3,4,5,6,7]
5932; SSE-NEXT:    movdqa %xmm8, %xmm10
5933; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
5934; SSE-NEXT:    pand %xmm7, %xmm0
5935; SSE-NEXT:    por %xmm2, %xmm0
5936; SSE-NEXT:    movdqa %xmm12, %xmm2
5937; SSE-NEXT:    pandn %xmm0, %xmm2
5938; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,5,6,6,7]
5939; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
5940; SSE-NEXT:    movdqa %xmm13, %xmm15
5941; SSE-NEXT:    movdqa %xmm13, %xmm8
5942; SSE-NEXT:    pandn %xmm0, %xmm8
5943; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm9[0,2,2,3,4,5,6,7]
5944; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
5945; SSE-NEXT:    pand %xmm13, %xmm0
5946; SSE-NEXT:    por %xmm0, %xmm8
5947; SSE-NEXT:    pand %xmm12, %xmm8
5948; SSE-NEXT:    por %xmm2, %xmm8
5949; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255]
5950; SSE-NEXT:    movdqa %xmm1, %xmm0
5951; SSE-NEXT:    movdqa %xmm1, %xmm6
5952; SSE-NEXT:    pandn %xmm8, %xmm0
5953; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,6,5,7]
5954; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,1,3,2]
5955; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0]
5956; SSE-NEXT:    movdqa %xmm1, %xmm5
5957; SSE-NEXT:    pandn %xmm2, %xmm5
5958; SSE-NEXT:    movdqa %xmm3, %xmm4
5959; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm3[2,2,2,3,4,5,6,7]
5960; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1]
5961; SSE-NEXT:    pand %xmm1, %xmm2
5962; SSE-NEXT:    por %xmm2, %xmm5
5963; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm11[2,2,2,2,4,5,6,7]
5964; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
5965; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255]
5966; SSE-NEXT:    movdqa %xmm1, %xmm8
5967; SSE-NEXT:    pandn %xmm2, %xmm8
5968; SSE-NEXT:    pand %xmm1, %xmm5
5969; SSE-NEXT:    por %xmm5, %xmm8
5970; SSE-NEXT:    pand %xmm6, %xmm8
5971; SSE-NEXT:    por %xmm0, %xmm8
5972; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5973; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
5974; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,1,1,3]
5975; SSE-NEXT:    movdqa %xmm1, %xmm2
5976; SSE-NEXT:    pandn %xmm0, %xmm2
5977; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,5,7]
5978; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2]
5979; SSE-NEXT:    pand %xmm1, %xmm0
5980; SSE-NEXT:    por %xmm2, %xmm0
5981; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0]
5982; SSE-NEXT:    movdqa %xmm1, %xmm5
5983; SSE-NEXT:    pandn %xmm0, %xmm5
5984; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,5,5,5,5]
5985; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
5986; SSE-NEXT:    movdqa %xmm7, %xmm8
5987; SSE-NEXT:    pandn %xmm0, %xmm8
5988; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
5989; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm14[1,2,2,3,4,5,6,7]
5990; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,1]
5991; SSE-NEXT:    pand %xmm7, %xmm2
5992; SSE-NEXT:    por %xmm8, %xmm2
5993; SSE-NEXT:    pand %xmm1, %xmm2
5994; SSE-NEXT:    por %xmm5, %xmm2
5995; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
5996; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm13[1,2,2,3,4,5,6,7]
5997; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
5998; SSE-NEXT:    movdqa %xmm15, %xmm5
5999; SSE-NEXT:    pandn %xmm0, %xmm5
6000; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,4,6,5]
6001; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
6002; SSE-NEXT:    pand %xmm15, %xmm0
6003; SSE-NEXT:    por %xmm0, %xmm5
6004; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,5,5,7]
6005; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
6006; SSE-NEXT:    movdqa {{.*#+}} xmm12 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255]
6007; SSE-NEXT:    movdqa %xmm12, %xmm8
6008; SSE-NEXT:    pandn %xmm0, %xmm8
6009; SSE-NEXT:    pand %xmm12, %xmm5
6010; SSE-NEXT:    por %xmm5, %xmm8
6011; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255]
6012; SSE-NEXT:    movdqa %xmm3, %xmm0
6013; SSE-NEXT:    pandn %xmm8, %xmm0
6014; SSE-NEXT:    pand %xmm3, %xmm2
6015; SSE-NEXT:    por %xmm2, %xmm0
6016; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6017; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,6,5,7,7]
6018; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
6019; SSE-NEXT:    movdqa %xmm12, %xmm2
6020; SSE-NEXT:    pandn %xmm0, %xmm2
6021; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,7,7,7,7]
6022; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
6023; SSE-NEXT:    pand %xmm12, %xmm0
6024; SSE-NEXT:    por %xmm0, %xmm2
6025; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255]
6026; SSE-NEXT:    movdqa %xmm5, %xmm0
6027; SSE-NEXT:    pandn %xmm2, %xmm0
6028; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm14[0,1,2,3,4,6,6,7]
6029; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[2,1,3,3]
6030; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255]
6031; SSE-NEXT:    movdqa %xmm3, %xmm2
6032; SSE-NEXT:    pandn %xmm6, %xmm2
6033; SSE-NEXT:    pshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,7,7,7,7]
6034; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[2,2,2,2]
6035; SSE-NEXT:    pand %xmm3, %xmm6
6036; SSE-NEXT:    movdqa %xmm3, %xmm10
6037; SSE-NEXT:    por %xmm6, %xmm2
6038; SSE-NEXT:    pand %xmm5, %xmm2
6039; SSE-NEXT:    por %xmm0, %xmm2
6040; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,7,7,7,7]
6041; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
6042; SSE-NEXT:    movdqa %xmm7, %xmm6
6043; SSE-NEXT:    pandn %xmm0, %xmm6
6044; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,5,6,6,7]
6045; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,3]
6046; SSE-NEXT:    pand %xmm7, %xmm0
6047; SSE-NEXT:    por %xmm6, %xmm0
6048; SSE-NEXT:    pshufhw {{.*#+}} xmm6 = xmm11[0,1,2,3,6,7,7,7]
6049; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[2,1,3,2]
6050; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0]
6051; SSE-NEXT:    movdqa %xmm1, %xmm8
6052; SSE-NEXT:    pandn %xmm6, %xmm8
6053; SSE-NEXT:    pand %xmm1, %xmm0
6054; SSE-NEXT:    movdqa %xmm1, %xmm4
6055; SSE-NEXT:    por %xmm0, %xmm8
6056; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0]
6057; SSE-NEXT:    movdqa %xmm1, %xmm0
6058; SSE-NEXT:    pandn %xmm8, %xmm0
6059; SSE-NEXT:    pand %xmm1, %xmm2
6060; SSE-NEXT:    por %xmm2, %xmm0
6061; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6062; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
6063; SSE-NEXT:    punpcklbw {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
6064; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm9[0,0,2,1,4,5,6,7]
6065; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
6066; SSE-NEXT:    movdqa %xmm3, %xmm2
6067; SSE-NEXT:    pandn %xmm0, %xmm2
6068; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
6069; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm11[0,0,0,0,4,5,6,7]
6070; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
6071; SSE-NEXT:    pand %xmm3, %xmm0
6072; SSE-NEXT:    por %xmm0, %xmm2
6073; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255]
6074; SSE-NEXT:    movdqa %xmm3, %xmm6
6075; SSE-NEXT:    pandn %xmm2, %xmm6
6076; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
6077; SSE-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
6078; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm5[0,2,1,3,4,5,6,7]
6079; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,1,0]
6080; SSE-NEXT:    movdqa %xmm4, %xmm0
6081; SSE-NEXT:    pandn %xmm2, %xmm0
6082; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6083; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[0,0,2,1,4,5,6,7]
6084; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1]
6085; SSE-NEXT:    pand %xmm4, %xmm2
6086; SSE-NEXT:    por %xmm2, %xmm0
6087; SSE-NEXT:    pand %xmm3, %xmm0
6088; SSE-NEXT:    por %xmm6, %xmm0
6089; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
6090; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
6091; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm3[0,1,1,3,4,5,6,7]
6092; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1]
6093; SSE-NEXT:    movdqa %xmm12, %xmm6
6094; SSE-NEXT:    pandn %xmm2, %xmm6
6095; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
6096; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm13[0,0,0,0,4,5,6,7]
6097; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
6098; SSE-NEXT:    pand %xmm12, %xmm2
6099; SSE-NEXT:    por %xmm2, %xmm6
6100; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
6101; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm14[0,0,0,0,4,5,6,7]
6102; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
6103; SSE-NEXT:    movdqa %xmm7, %xmm8
6104; SSE-NEXT:    pandn %xmm2, %xmm8
6105; SSE-NEXT:    pand %xmm7, %xmm6
6106; SSE-NEXT:    por %xmm6, %xmm8
6107; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255]
6108; SSE-NEXT:    movdqa %xmm2, %xmm6
6109; SSE-NEXT:    pandn %xmm8, %xmm6
6110; SSE-NEXT:    pand %xmm2, %xmm0
6111; SSE-NEXT:    por %xmm0, %xmm6
6112; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6113; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,1,2,2]
6114; SSE-NEXT:    movdqa %xmm7, %xmm2
6115; SSE-NEXT:    pandn %xmm0, %xmm2
6116; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[1,1,2,3,4,5,6,7]
6117; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
6118; SSE-NEXT:    pand %xmm7, %xmm0
6119; SSE-NEXT:    por %xmm2, %xmm0
6120; SSE-NEXT:    movdqa {{.*#+}} xmm12 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
6121; SSE-NEXT:    movdqa %xmm12, %xmm2
6122; SSE-NEXT:    pandn %xmm0, %xmm2
6123; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[1,1,2,1]
6124; SSE-NEXT:    pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,7,5,6,4]
6125; SSE-NEXT:    movdqa %xmm4, %xmm0
6126; SSE-NEXT:    pandn %xmm6, %xmm0
6127; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm11[1,1,2,2,4,5,6,7]
6128; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1]
6129; SSE-NEXT:    pand %xmm4, %xmm6
6130; SSE-NEXT:    por %xmm6, %xmm0
6131; SSE-NEXT:    pand %xmm12, %xmm0
6132; SSE-NEXT:    por %xmm2, %xmm0
6133; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm3[0,2,2,3,4,5,6,7]
6134; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3]
6135; SSE-NEXT:    movdqa %xmm10, %xmm6
6136; SSE-NEXT:    pandn %xmm2, %xmm6
6137; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm13[1,1,1,1,4,5,6,7]
6138; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
6139; SSE-NEXT:    pand %xmm10, %xmm2
6140; SSE-NEXT:    por %xmm2, %xmm6
6141; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm14[1,1,1,1,4,5,6,7]
6142; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
6143; SSE-NEXT:    movdqa %xmm15, %xmm8
6144; SSE-NEXT:    pandn %xmm2, %xmm8
6145; SSE-NEXT:    pand %xmm15, %xmm6
6146; SSE-NEXT:    movdqa %xmm15, %xmm10
6147; SSE-NEXT:    por %xmm6, %xmm8
6148; SSE-NEXT:    movdqa {{.*#+}} xmm6 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
6149; SSE-NEXT:    movdqa %xmm6, %xmm2
6150; SSE-NEXT:    pandn %xmm8, %xmm2
6151; SSE-NEXT:    pand %xmm6, %xmm0
6152; SSE-NEXT:    por %xmm0, %xmm2
6153; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6154; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3]
6155; SSE-NEXT:    movdqa %xmm7, %xmm2
6156; SSE-NEXT:    pandn %xmm0, %xmm2
6157; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm11[2,1,3,3,4,5,6,7]
6158; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
6159; SSE-NEXT:    pand %xmm7, %xmm0
6160; SSE-NEXT:    por %xmm2, %xmm0
6161; SSE-NEXT:    movdqa %xmm12, %xmm2
6162; SSE-NEXT:    pandn %xmm0, %xmm2
6163; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,5,6,6,7]
6164; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
6165; SSE-NEXT:    movdqa %xmm15, %xmm6
6166; SSE-NEXT:    pandn %xmm0, %xmm6
6167; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
6168; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
6169; SSE-NEXT:    pand %xmm15, %xmm0
6170; SSE-NEXT:    por %xmm0, %xmm6
6171; SSE-NEXT:    pand %xmm12, %xmm6
6172; SSE-NEXT:    por %xmm2, %xmm6
6173; SSE-NEXT:    movdqa {{.*#+}} xmm9 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255]
6174; SSE-NEXT:    movdqa %xmm9, %xmm0
6175; SSE-NEXT:    pandn %xmm6, %xmm0
6176; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,6,5,7]
6177; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,1,3,2]
6178; SSE-NEXT:    movdqa %xmm4, %xmm12
6179; SSE-NEXT:    movdqa %xmm4, %xmm6
6180; SSE-NEXT:    pandn %xmm2, %xmm6
6181; SSE-NEXT:    movdqa %xmm13, %xmm5
6182; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm13[2,2,2,3,4,5,6,7]
6183; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1]
6184; SSE-NEXT:    pand %xmm4, %xmm2
6185; SSE-NEXT:    por %xmm2, %xmm6
6186; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm14[2,2,2,2,4,5,6,7]
6187; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm2[0,0,0,0]
6188; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255]
6189; SSE-NEXT:    movdqa %xmm2, %xmm4
6190; SSE-NEXT:    pandn %xmm8, %xmm4
6191; SSE-NEXT:    pand %xmm2, %xmm6
6192; SSE-NEXT:    por %xmm6, %xmm4
6193; SSE-NEXT:    pand %xmm9, %xmm4
6194; SSE-NEXT:    por %xmm0, %xmm4
6195; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6196; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
6197; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm15[0,1,1,3]
6198; SSE-NEXT:    movdqa %xmm2, %xmm6
6199; SSE-NEXT:    pandn %xmm0, %xmm6
6200; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,5,5,7]
6201; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2]
6202; SSE-NEXT:    pand %xmm2, %xmm0
6203; SSE-NEXT:    por %xmm6, %xmm0
6204; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0]
6205; SSE-NEXT:    movdqa %xmm2, %xmm6
6206; SSE-NEXT:    pandn %xmm0, %xmm6
6207; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,5,5,5,5]
6208; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
6209; SSE-NEXT:    movdqa %xmm7, %xmm8
6210; SSE-NEXT:    pandn %xmm0, %xmm8
6211; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
6212; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm9[1,2,2,3,4,5,6,7]
6213; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
6214; SSE-NEXT:    pand %xmm7, %xmm0
6215; SSE-NEXT:    por %xmm8, %xmm0
6216; SSE-NEXT:    pand %xmm2, %xmm0
6217; SSE-NEXT:    por %xmm6, %xmm0
6218; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
6219; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm13[1,2,2,3,4,5,6,7]
6220; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0]
6221; SSE-NEXT:    movdqa %xmm10, %xmm8
6222; SSE-NEXT:    pandn %xmm6, %xmm8
6223; SSE-NEXT:    pshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,4,6,5]
6224; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[2,1,3,3]
6225; SSE-NEXT:    pand %xmm10, %xmm6
6226; SSE-NEXT:    por %xmm6, %xmm8
6227; SSE-NEXT:    pshufhw {{.*#+}} xmm6 = xmm14[0,1,2,3,4,5,5,7]
6228; SSE-NEXT:    movdqa %xmm14, %xmm2
6229; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
6230; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255]
6231; SSE-NEXT:    movdqa %xmm3, %xmm10
6232; SSE-NEXT:    pandn %xmm6, %xmm10
6233; SSE-NEXT:    pand %xmm3, %xmm8
6234; SSE-NEXT:    movdqa %xmm3, %xmm6
6235; SSE-NEXT:    por %xmm8, %xmm10
6236; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255]
6237; SSE-NEXT:    movdqa %xmm3, %xmm4
6238; SSE-NEXT:    pandn %xmm10, %xmm4
6239; SSE-NEXT:    pand %xmm3, %xmm0
6240; SSE-NEXT:    por %xmm0, %xmm4
6241; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6242; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,6,5,7,7]
6243; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
6244; SSE-NEXT:    movdqa %xmm6, %xmm3
6245; SSE-NEXT:    pandn %xmm0, %xmm6
6246; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,7,7,7,7]
6247; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
6248; SSE-NEXT:    pand %xmm3, %xmm0
6249; SSE-NEXT:    movdqa %xmm3, %xmm14
6250; SSE-NEXT:    por %xmm0, %xmm6
6251; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255]
6252; SSE-NEXT:    movdqa %xmm3, %xmm8
6253; SSE-NEXT:    pandn %xmm6, %xmm8
6254; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,4,6,6,7]
6255; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[2,1,3,3]
6256; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255]
6257; SSE-NEXT:    movdqa %xmm4, %xmm0
6258; SSE-NEXT:    pandn %xmm6, %xmm0
6259; SSE-NEXT:    pshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,7,7,7,7]
6260; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[2,2,2,2]
6261; SSE-NEXT:    pand %xmm4, %xmm6
6262; SSE-NEXT:    movdqa %xmm4, %xmm11
6263; SSE-NEXT:    por %xmm6, %xmm0
6264; SSE-NEXT:    pand %xmm3, %xmm0
6265; SSE-NEXT:    por %xmm8, %xmm0
6266; SSE-NEXT:    pshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,7,7,7,7]
6267; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[2,2,2,2]
6268; SSE-NEXT:    movdqa %xmm7, %xmm8
6269; SSE-NEXT:    pandn %xmm6, %xmm8
6270; SSE-NEXT:    pshufhw {{.*#+}} xmm6 = xmm13[0,1,2,3,5,6,6,7]
6271; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[2,2,2,3]
6272; SSE-NEXT:    pand %xmm7, %xmm6
6273; SSE-NEXT:    por %xmm8, %xmm6
6274; SSE-NEXT:    pshufhw {{.*#+}} xmm8 = xmm2[0,1,2,3,6,7,7,7]
6275; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[2,1,3,2]
6276; SSE-NEXT:    movdqa %xmm12, %xmm10
6277; SSE-NEXT:    pandn %xmm8, %xmm10
6278; SSE-NEXT:    pand %xmm12, %xmm6
6279; SSE-NEXT:    movdqa %xmm12, %xmm4
6280; SSE-NEXT:    por %xmm6, %xmm10
6281; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0]
6282; SSE-NEXT:    movdqa %xmm1, %xmm2
6283; SSE-NEXT:    pandn %xmm10, %xmm2
6284; SSE-NEXT:    pand %xmm1, %xmm0
6285; SSE-NEXT:    por %xmm0, %xmm2
6286; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6287; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
6288; SSE-NEXT:    punpcklbw {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
6289; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm9[0,0,2,1,4,5,6,7]
6290; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
6291; SSE-NEXT:    movdqa %xmm11, %xmm1
6292; SSE-NEXT:    movdqa %xmm11, %xmm8
6293; SSE-NEXT:    pandn %xmm0, %xmm8
6294; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
6295; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm11[0,0,0,0,4,5,6,7]
6296; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
6297; SSE-NEXT:    pand %xmm1, %xmm0
6298; SSE-NEXT:    por %xmm0, %xmm8
6299; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255]
6300; SSE-NEXT:    movdqa %xmm3, %xmm10
6301; SSE-NEXT:    pandn %xmm8, %xmm10
6302; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
6303; SSE-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
6304; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm5[0,2,1,3,4,5,6,7]
6305; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm0[0,1,1,0]
6306; SSE-NEXT:    movdqa %xmm12, %xmm0
6307; SSE-NEXT:    pandn %xmm8, %xmm0
6308; SSE-NEXT:    movdqa (%rsp), %xmm13 # 16-byte Reload
6309; SSE-NEXT:    pshuflw {{.*#+}} xmm8 = xmm13[0,0,2,1,4,5,6,7]
6310; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,0,2,1]
6311; SSE-NEXT:    pand %xmm12, %xmm8
6312; SSE-NEXT:    por %xmm8, %xmm0
6313; SSE-NEXT:    pand %xmm3, %xmm0
6314; SSE-NEXT:    por %xmm10, %xmm0
6315; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
6316; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
6317; SSE-NEXT:    pshuflw {{.*#+}} xmm8 = xmm3[0,1,1,3,4,5,6,7]
6318; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,0,2,1]
6319; SSE-NEXT:    movdqa %xmm14, %xmm10
6320; SSE-NEXT:    pandn %xmm8, %xmm10
6321; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6322; SSE-NEXT:    pshuflw {{.*#+}} xmm8 = xmm1[0,0,0,0,4,5,6,7]
6323; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,0,0,0]
6324; SSE-NEXT:    pand %xmm14, %xmm8
6325; SSE-NEXT:    por %xmm8, %xmm10
6326; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
6327; SSE-NEXT:    pshuflw {{.*#+}} xmm8 = xmm6[0,0,0,0,4,5,6,7]
6328; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,0,0,0]
6329; SSE-NEXT:    movdqa %xmm7, %xmm15
6330; SSE-NEXT:    pandn %xmm8, %xmm15
6331; SSE-NEXT:    pand %xmm7, %xmm10
6332; SSE-NEXT:    por %xmm10, %xmm15
6333; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255]
6334; SSE-NEXT:    movdqa %xmm8, %xmm10
6335; SSE-NEXT:    pandn %xmm15, %xmm10
6336; SSE-NEXT:    pand %xmm8, %xmm0
6337; SSE-NEXT:    por %xmm0, %xmm10
6338; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6339; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,1,2,2]
6340; SSE-NEXT:    movdqa %xmm7, %xmm8
6341; SSE-NEXT:    pandn %xmm0, %xmm8
6342; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm13[1,1,2,3,4,5,6,7]
6343; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
6344; SSE-NEXT:    pand %xmm7, %xmm0
6345; SSE-NEXT:    por %xmm8, %xmm0
6346; SSE-NEXT:    movdqa {{.*#+}} xmm12 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
6347; SSE-NEXT:    movdqa %xmm12, %xmm8
6348; SSE-NEXT:    pandn %xmm0, %xmm8
6349; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[1,1,2,1]
6350; SSE-NEXT:    pshufhw {{.*#+}} xmm10 = xmm0[0,1,2,3,7,5,6,4]
6351; SSE-NEXT:    movdqa %xmm4, %xmm14
6352; SSE-NEXT:    movdqa %xmm4, %xmm0
6353; SSE-NEXT:    pandn %xmm10, %xmm0
6354; SSE-NEXT:    pshuflw {{.*#+}} xmm10 = xmm11[1,1,2,2,4,5,6,7]
6355; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[0,0,2,1]
6356; SSE-NEXT:    pand %xmm4, %xmm10
6357; SSE-NEXT:    por %xmm10, %xmm0
6358; SSE-NEXT:    pand %xmm12, %xmm0
6359; SSE-NEXT:    movdqa %xmm12, %xmm4
6360; SSE-NEXT:    por %xmm8, %xmm0
6361; SSE-NEXT:    pshuflw {{.*#+}} xmm8 = xmm3[0,2,2,3,4,5,6,7]
6362; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,1,1,3]
6363; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255]
6364; SSE-NEXT:    movdqa %xmm2, %xmm10
6365; SSE-NEXT:    pandn %xmm8, %xmm10
6366; SSE-NEXT:    pshuflw {{.*#+}} xmm8 = xmm1[1,1,1,1,4,5,6,7]
6367; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,0,0,0]
6368; SSE-NEXT:    pand %xmm2, %xmm8
6369; SSE-NEXT:    por %xmm8, %xmm10
6370; SSE-NEXT:    pshuflw {{.*#+}} xmm8 = xmm6[1,1,1,1,4,5,6,7]
6371; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,0,0,0]
6372; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255]
6373; SSE-NEXT:    movdqa %xmm2, %xmm15
6374; SSE-NEXT:    pandn %xmm8, %xmm15
6375; SSE-NEXT:    pand %xmm2, %xmm10
6376; SSE-NEXT:    movdqa %xmm2, %xmm12
6377; SSE-NEXT:    por %xmm10, %xmm15
6378; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
6379; SSE-NEXT:    movdqa %xmm2, %xmm8
6380; SSE-NEXT:    pandn %xmm15, %xmm8
6381; SSE-NEXT:    pand %xmm2, %xmm0
6382; SSE-NEXT:    por %xmm0, %xmm8
6383; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6384; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3]
6385; SSE-NEXT:    movdqa %xmm7, %xmm10
6386; SSE-NEXT:    pandn %xmm0, %xmm10
6387; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm11[2,1,3,3,4,5,6,7]
6388; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
6389; SSE-NEXT:    pand %xmm7, %xmm0
6390; SSE-NEXT:    por %xmm10, %xmm0
6391; SSE-NEXT:    movdqa %xmm4, %xmm10
6392; SSE-NEXT:    pandn %xmm0, %xmm10
6393; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,5,6,6,7]
6394; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
6395; SSE-NEXT:    movdqa %xmm12, %xmm15
6396; SSE-NEXT:    pandn %xmm0, %xmm15
6397; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm13[0,2,2,3,4,5,6,7]
6398; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
6399; SSE-NEXT:    pand %xmm12, %xmm0
6400; SSE-NEXT:    por %xmm0, %xmm15
6401; SSE-NEXT:    pand %xmm4, %xmm15
6402; SSE-NEXT:    por %xmm10, %xmm15
6403; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255]
6404; SSE-NEXT:    movdqa %xmm2, %xmm10
6405; SSE-NEXT:    pandn %xmm15, %xmm10
6406; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,6,5,7]
6407; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2]
6408; SSE-NEXT:    movdqa %xmm14, %xmm15
6409; SSE-NEXT:    pandn %xmm0, %xmm15
6410; SSE-NEXT:    movdqa %xmm1, %xmm8
6411; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[2,2,2,3,4,5,6,7]
6412; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
6413; SSE-NEXT:    pand %xmm14, %xmm0
6414; SSE-NEXT:    movdqa %xmm14, %xmm4
6415; SSE-NEXT:    por %xmm0, %xmm15
6416; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm6[2,2,2,2,4,5,6,7]
6417; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
6418; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255]
6419; SSE-NEXT:    movdqa %xmm1, %xmm9
6420; SSE-NEXT:    pandn %xmm0, %xmm9
6421; SSE-NEXT:    pand %xmm1, %xmm15
6422; SSE-NEXT:    por %xmm15, %xmm9
6423; SSE-NEXT:    pand %xmm2, %xmm9
6424; SSE-NEXT:    por %xmm10, %xmm9
6425; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
6426; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3]
6427; SSE-NEXT:    movdqa %xmm1, %xmm10
6428; SSE-NEXT:    pandn %xmm0, %xmm10
6429; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,5,5,7]
6430; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2]
6431; SSE-NEXT:    pand %xmm1, %xmm0
6432; SSE-NEXT:    por %xmm10, %xmm0
6433; SSE-NEXT:    pshufhw {{.*#+}} xmm10 = xmm13[0,1,2,3,5,5,5,5]
6434; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[2,2,2,2]
6435; SSE-NEXT:    movdqa %xmm7, %xmm15
6436; SSE-NEXT:    pandn %xmm10, %xmm15
6437; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
6438; SSE-NEXT:    pshuflw {{.*#+}} xmm10 = xmm3[1,2,2,3,4,5,6,7]
6439; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[0,0,2,1]
6440; SSE-NEXT:    pand %xmm7, %xmm10
6441; SSE-NEXT:    por %xmm15, %xmm10
6442; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0]
6443; SSE-NEXT:    pand %xmm1, %xmm10
6444; SSE-NEXT:    pandn %xmm0, %xmm1
6445; SSE-NEXT:    por %xmm10, %xmm1
6446; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
6447; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm5[1,2,2,3,4,5,6,7]
6448; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
6449; SSE-NEXT:    movdqa %xmm12, %xmm10
6450; SSE-NEXT:    pandn %xmm0, %xmm10
6451; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,4,6,5]
6452; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
6453; SSE-NEXT:    pand %xmm12, %xmm0
6454; SSE-NEXT:    por %xmm0, %xmm10
6455; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,5,7]
6456; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
6457; SSE-NEXT:    movdqa {{.*#+}} xmm14 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255]
6458; SSE-NEXT:    movdqa %xmm14, %xmm15
6459; SSE-NEXT:    pandn %xmm0, %xmm15
6460; SSE-NEXT:    pand %xmm14, %xmm10
6461; SSE-NEXT:    por %xmm10, %xmm15
6462; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255]
6463; SSE-NEXT:    pand %xmm0, %xmm1
6464; SSE-NEXT:    pandn %xmm15, %xmm0
6465; SSE-NEXT:    por %xmm1, %xmm0
6466; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6467; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,6,5,7,7]
6468; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
6469; SSE-NEXT:    movdqa %xmm14, %xmm10
6470; SSE-NEXT:    pandn %xmm0, %xmm10
6471; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,7,7,7,7]
6472; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
6473; SSE-NEXT:    pand %xmm14, %xmm0
6474; SSE-NEXT:    por %xmm0, %xmm10
6475; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,6,6,7]
6476; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
6477; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255]
6478; SSE-NEXT:    movdqa %xmm1, %xmm15
6479; SSE-NEXT:    pandn %xmm0, %xmm15
6480; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,7,7,7,7]
6481; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
6482; SSE-NEXT:    pand %xmm1, %xmm0
6483; SSE-NEXT:    movdqa %xmm1, %xmm2
6484; SSE-NEXT:    por %xmm0, %xmm15
6485; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255]
6486; SSE-NEXT:    pand %xmm0, %xmm15
6487; SSE-NEXT:    pandn %xmm10, %xmm0
6488; SSE-NEXT:    por %xmm15, %xmm0
6489; SSE-NEXT:    movdqa %xmm0, %xmm1
6490; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,7,7,7,7]
6491; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
6492; SSE-NEXT:    movdqa %xmm7, %xmm10
6493; SSE-NEXT:    pandn %xmm0, %xmm10
6494; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,5,6,6,7]
6495; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,3]
6496; SSE-NEXT:    pand %xmm7, %xmm0
6497; SSE-NEXT:    por %xmm10, %xmm0
6498; SSE-NEXT:    pshufhw {{.*#+}} xmm10 = xmm6[0,1,2,3,6,7,7,7]
6499; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[2,1,3,2]
6500; SSE-NEXT:    movdqa %xmm4, %xmm14
6501; SSE-NEXT:    movdqa %xmm4, %xmm15
6502; SSE-NEXT:    pandn %xmm10, %xmm15
6503; SSE-NEXT:    pand %xmm4, %xmm0
6504; SSE-NEXT:    por %xmm0, %xmm15
6505; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0]
6506; SSE-NEXT:    pand %xmm0, %xmm1
6507; SSE-NEXT:    pandn %xmm15, %xmm0
6508; SSE-NEXT:    por %xmm1, %xmm0
6509; SSE-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
6510; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
6511; SSE-NEXT:    punpcklbw {{.*#+}} xmm12 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
6512; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm12[0,0,2,1,4,5,6,7]
6513; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
6514; SSE-NEXT:    movdqa %xmm2, %xmm10
6515; SSE-NEXT:    pandn %xmm0, %xmm10
6516; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
6517; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm11[0,0,0,0,4,5,6,7]
6518; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
6519; SSE-NEXT:    pand %xmm2, %xmm0
6520; SSE-NEXT:    movdqa %xmm2, %xmm6
6521; SSE-NEXT:    por %xmm0, %xmm10
6522; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
6523; SSE-NEXT:    punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
6524; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm8[0,2,1,3,4,5,6,7]
6525; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
6526; SSE-NEXT:    movdqa %xmm4, %xmm15
6527; SSE-NEXT:    pandn %xmm0, %xmm15
6528; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
6529; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm13[0,0,2,1,4,5,6,7]
6530; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
6531; SSE-NEXT:    pand %xmm4, %xmm0
6532; SSE-NEXT:    por %xmm0, %xmm15
6533; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255]
6534; SSE-NEXT:    pand %xmm0, %xmm15
6535; SSE-NEXT:    pandn %xmm10, %xmm0
6536; SSE-NEXT:    por %xmm15, %xmm0
6537; SSE-NEXT:    movdqa %xmm0, %xmm3
6538; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
6539; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm2[0,0,0,0,4,5,6,7]
6540; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
6541; SSE-NEXT:    movdqa {{.*#+}} xmm15 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255]
6542; SSE-NEXT:    pand %xmm15, %xmm0
6543; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
6544; SSE-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
6545; SSE-NEXT:    pshuflw {{.*#+}} xmm10 = xmm5[0,1,1,3,4,5,6,7]
6546; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[0,0,2,1]
6547; SSE-NEXT:    pandn %xmm10, %xmm15
6548; SSE-NEXT:    por %xmm0, %xmm15
6549; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6550; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7]
6551; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
6552; SSE-NEXT:    movdqa %xmm7, %xmm10
6553; SSE-NEXT:    pandn %xmm0, %xmm10
6554; SSE-NEXT:    pand %xmm7, %xmm15
6555; SSE-NEXT:    por %xmm15, %xmm10
6556; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255]
6557; SSE-NEXT:    pand %xmm0, %xmm3
6558; SSE-NEXT:    pandn %xmm10, %xmm0
6559; SSE-NEXT:    por %xmm3, %xmm0
6560; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6561; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm8[0,1,2,2]
6562; SSE-NEXT:    movdqa %xmm7, %xmm3
6563; SSE-NEXT:    pandn %xmm0, %xmm3
6564; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm13[1,1,2,3,4,5,6,7]
6565; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
6566; SSE-NEXT:    pand %xmm7, %xmm0
6567; SSE-NEXT:    por %xmm3, %xmm0
6568; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
6569; SSE-NEXT:    movdqa %xmm4, %xmm10
6570; SSE-NEXT:    pandn %xmm0, %xmm10
6571; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm12[1,1,2,1]
6572; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4]
6573; SSE-NEXT:    movdqa %xmm14, %xmm3
6574; SSE-NEXT:    pandn %xmm0, %xmm3
6575; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm11[1,1,2,2,4,5,6,7]
6576; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
6577; SSE-NEXT:    pand %xmm14, %xmm0
6578; SSE-NEXT:    por %xmm0, %xmm3
6579; SSE-NEXT:    pand %xmm4, %xmm3
6580; SSE-NEXT:    por %xmm10, %xmm3
6581; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm2[1,1,1,1,4,5,6,7]
6582; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
6583; SSE-NEXT:    movdqa %xmm6, %xmm4
6584; SSE-NEXT:    pand %xmm6, %xmm0
6585; SSE-NEXT:    pshuflw {{.*#+}} xmm10 = xmm5[0,2,2,3,4,5,6,7]
6586; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[0,1,1,3]
6587; SSE-NEXT:    pandn %xmm10, %xmm4
6588; SSE-NEXT:    por %xmm0, %xmm4
6589; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[1,1,1,1,4,5,6,7]
6590; SSE-NEXT:    movdqa %xmm1, %xmm15
6591; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
6592; SSE-NEXT:    movdqa {{.*#+}} xmm6 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255]
6593; SSE-NEXT:    movdqa %xmm6, %xmm10
6594; SSE-NEXT:    pandn %xmm0, %xmm10
6595; SSE-NEXT:    pand %xmm6, %xmm4
6596; SSE-NEXT:    por %xmm4, %xmm10
6597; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
6598; SSE-NEXT:    pand %xmm1, %xmm3
6599; SSE-NEXT:    pandn %xmm10, %xmm1
6600; SSE-NEXT:    por %xmm3, %xmm1
6601; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3]
6602; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm11[2,1,3,3,4,5,6,7]
6603; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,2,1]
6604; SSE-NEXT:    pand %xmm7, %xmm3
6605; SSE-NEXT:    pandn %xmm0, %xmm7
6606; SSE-NEXT:    por %xmm3, %xmm7
6607; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm13[0,2,2,3,4,5,6,7]
6608; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
6609; SSE-NEXT:    pand %xmm6, %xmm0
6610; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,5,6,6,7]
6611; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2]
6612; SSE-NEXT:    pandn %xmm3, %xmm6
6613; SSE-NEXT:    por %xmm0, %xmm6
6614; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
6615; SSE-NEXT:    pand %xmm0, %xmm6
6616; SSE-NEXT:    pandn %xmm7, %xmm0
6617; SSE-NEXT:    por %xmm6, %xmm0
6618; SSE-NEXT:    movdqa %xmm0, %xmm4
6619; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm2[2,2,2,3,4,5,6,7]
6620; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
6621; SSE-NEXT:    pand %xmm14, %xmm0
6622; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,6,5,7]
6623; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,1,3,2]
6624; SSE-NEXT:    pandn %xmm3, %xmm14
6625; SSE-NEXT:    por %xmm0, %xmm14
6626; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255]
6627; SSE-NEXT:    pand %xmm3, %xmm14
6628; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm15[2,2,2,2,4,5,6,7]
6629; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
6630; SSE-NEXT:    pandn %xmm0, %xmm3
6631; SSE-NEXT:    por %xmm14, %xmm3
6632; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255]
6633; SSE-NEXT:    pand %xmm0, %xmm3
6634; SSE-NEXT:    pandn %xmm4, %xmm0
6635; SSE-NEXT:    por %xmm0, %xmm3
6636; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
6637; SSE-NEXT:    movdqa %xmm3, 368(%rax)
6638; SSE-NEXT:    movdqa %xmm1, 352(%rax)
6639; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6640; SSE-NEXT:    movaps %xmm0, 336(%rax)
6641; SSE-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
6642; SSE-NEXT:    movaps %xmm0, 320(%rax)
6643; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6644; SSE-NEXT:    movaps %xmm0, 288(%rax)
6645; SSE-NEXT:    movdqa %xmm9, 256(%rax)
6646; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6647; SSE-NEXT:    movaps %xmm0, 240(%rax)
6648; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6649; SSE-NEXT:    movaps %xmm0, 224(%rax)
6650; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6651; SSE-NEXT:    movaps %xmm0, 208(%rax)
6652; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6653; SSE-NEXT:    movaps %xmm0, 176(%rax)
6654; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6655; SSE-NEXT:    movaps %xmm0, 144(%rax)
6656; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6657; SSE-NEXT:    movaps %xmm0, 128(%rax)
6658; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6659; SSE-NEXT:    movaps %xmm0, 112(%rax)
6660; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6661; SSE-NEXT:    movaps %xmm0, 96(%rax)
6662; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6663; SSE-NEXT:    movaps %xmm0, 64(%rax)
6664; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6665; SSE-NEXT:    movaps %xmm0, 32(%rax)
6666; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6667; SSE-NEXT:    movaps %xmm0, 16(%rax)
6668; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6669; SSE-NEXT:    movaps %xmm0, (%rax)
6670; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6671; SSE-NEXT:    movaps %xmm0, 304(%rax)
6672; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6673; SSE-NEXT:    movaps %xmm0, 192(%rax)
6674; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6675; SSE-NEXT:    movaps %xmm0, 80(%rax)
6676; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6677; SSE-NEXT:    movaps %xmm0, 272(%rax)
6678; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6679; SSE-NEXT:    movaps %xmm0, 160(%rax)
6680; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6681; SSE-NEXT:    movaps %xmm0, 48(%rax)
6682; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6683; SSE-NEXT:    movaps %xmm0, 432(%rax)
6684; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6685; SSE-NEXT:    movaps %xmm0, 400(%rax)
6686; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6687; SSE-NEXT:    movaps %xmm0, 416(%rax)
6688; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6689; SSE-NEXT:    movaps %xmm0, 384(%rax)
6690; SSE-NEXT:    addq $648, %rsp # imm = 0x288
6691; SSE-NEXT:    retq
6692;
6693; AVX-LABEL: store_i8_stride7_vf64:
6694; AVX:       # %bb.0:
6695; AVX-NEXT:    subq $616, %rsp # imm = 0x268
6696; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
6697; AVX-NEXT:    vmovdqa 16(%rax), %xmm6
6698; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,xmm6[13,u,u,u,u],zero,zero,xmm6[14,u,u,u,u],zero,zero,xmm6[15]
6699; AVX-NEXT:    vmovdqa 16(%r8), %xmm10
6700; AVX-NEXT:    vmovdqa 16(%r9), %xmm8
6701; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm10[8],xmm8[9],xmm10[9],xmm8[10],xmm10[10],xmm8[11],xmm10[11],xmm8[12],xmm10[12],xmm8[13],xmm10[13],xmm8[14],xmm10[14],xmm8[15],xmm10[15]
6702; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero
6703; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm1
6704; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm6[u],zero,zero,xmm6[11,u,u,u,u],zero,zero,xmm6[12,u,u,u,u],zero
6705; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15]
6706; AVX-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6707; AVX-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6708; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[u,6,7],zero,xmm0[u,u,u,u,8,9],zero,xmm0[u,u,u,u,10]
6709; AVX-NEXT:    vpor %xmm2, %xmm3, %xmm2
6710; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
6711; AVX-NEXT:    vmovdqa 16(%rsi), %xmm7
6712; AVX-NEXT:    vmovdqa 16(%rdi), %xmm11
6713; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm11[8],xmm7[8],xmm11[9],xmm7[9],xmm11[10],xmm7[10],xmm11[11],xmm7[11],xmm11[12],xmm7[12],xmm11[13],xmm7[13],xmm11[14],xmm7[14],xmm11[15],xmm7[15]
6714; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u]
6715; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,8,9,u,u,u,u,u,10,11,u,u,u]
6716; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm3
6717; AVX-NEXT:    vmovdqa 16(%rcx), %xmm12
6718; AVX-NEXT:    vmovdqa 16(%rdx), %xmm13
6719; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm12[8],xmm13[8],xmm12[9],xmm13[9],xmm12[10],xmm13[10],xmm12[11],xmm13[11],xmm12[12],xmm13[12],xmm12[13],xmm13[13],xmm12[14],xmm13[14],xmm12[15],xmm13[15]
6720; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm2[6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u]
6721; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15]
6722; AVX-NEXT:    vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6723; AVX-NEXT:    vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6724; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u]
6725; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm4, %ymm4
6726; AVX-NEXT:    vmovaps {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255]
6727; AVX-NEXT:    vandnps %ymm3, %ymm5, %ymm3
6728; AVX-NEXT:    vandps %ymm5, %ymm4, %ymm4
6729; AVX-NEXT:    vorps %ymm3, %ymm4, %ymm3
6730; AVX-NEXT:    vmovaps {{.*#+}} ymm4 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0]
6731; AVX-NEXT:    vandnps %ymm1, %ymm4, %ymm1
6732; AVX-NEXT:    vandps %ymm4, %ymm3, %ymm3
6733; AVX-NEXT:    vorps %ymm1, %ymm3, %ymm1
6734; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6735; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,2,3],zero,xmm0[u,u,u,u,4,5],zero,xmm0[u,u,u]
6736; AVX-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6737; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm6[u,u,u],zero,zero,xmm6[9,u,u,u,u],zero,zero,xmm6[10,u,u,u]
6738; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
6739; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,128,7,u,u,u,u,u,128,8,u,u]
6740; AVX-NEXT:    vpshufb %xmm3, %xmm8, %xmm1
6741; AVX-NEXT:    vmovdqa %xmm3, %xmm8
6742; AVX-NEXT:    vmovdqa {{.*#+}} xmm4 = [u,u,u,u,u,7,128,u,u,u,u,u,8,128,u,u]
6743; AVX-NEXT:    vpshufb %xmm4, %xmm10, %xmm3
6744; AVX-NEXT:    vmovdqa %xmm4, %xmm10
6745; AVX-NEXT:    vpor %xmm1, %xmm3, %xmm1
6746; AVX-NEXT:    vmovdqa {{.*#+}} xmm5 = [128,u,u,u,u,5,6,128,u,u,u,u,12,13,128,u]
6747; AVX-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
6748; AVX-NEXT:    vmovdqa {{.*#+}} xmm4 = [6,u,u,u,u,128,128,7,u,u,u,u,128,128,8,u]
6749; AVX-NEXT:    vpshufb %xmm4, %xmm6, %xmm3
6750; AVX-NEXT:    vpor %xmm3, %xmm1, %xmm1
6751; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
6752; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [u,u,u,128,7,u,u,u,u,u,128,8,u,u,u,u]
6753; AVX-NEXT:    vpshufb %xmm3, %xmm12, %xmm1
6754; AVX-NEXT:    vmovdqa %xmm3, %xmm12
6755; AVX-NEXT:    vmovdqa {{.*#+}} xmm9 = [u,u,u,7,128,u,u,u,u,u,8,128,u,u,u,u]
6756; AVX-NEXT:    vpshufb %xmm9, %xmm13, %xmm3
6757; AVX-NEXT:    vpor %xmm1, %xmm3, %xmm1
6758; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6]
6759; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
6760; AVX-NEXT:    vmovdqa {{.*#+}} xmm14 = [u,128,7,u,u,u,u,u,128,8,u,u,u,u,u,128]
6761; AVX-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6762; AVX-NEXT:    vpshufb %xmm14, %xmm7, %xmm2
6763; AVX-NEXT:    vmovdqa {{.*#+}} xmm13 = [u,7,128,u,u,u,u,u,8,128,u,u,u,u,u,9]
6764; AVX-NEXT:    vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6765; AVX-NEXT:    vpshufb %xmm13, %xmm11, %xmm3
6766; AVX-NEXT:    vpor %xmm2, %xmm3, %xmm2
6767; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm11[8],xmm7[9],xmm11[9],xmm7[10],xmm11[10],xmm7[11],xmm11[11],xmm7[12],xmm11[12],xmm7[13],xmm11[13],xmm7[14],xmm11[14],xmm7[15],xmm11[15]
6768; AVX-NEXT:    vmovdqa {{.*#+}} xmm15 = [2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u]
6769; AVX-NEXT:    vpshufb %xmm15, %xmm3, %xmm3
6770; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
6771; AVX-NEXT:    vmovaps {{.*#+}} ymm3 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0]
6772; AVX-NEXT:    vandnps %ymm1, %ymm3, %ymm1
6773; AVX-NEXT:    vandps %ymm3, %ymm2, %ymm2
6774; AVX-NEXT:    vorps %ymm1, %ymm2, %ymm1
6775; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255]
6776; AVX-NEXT:    vandnps %ymm0, %ymm2, %ymm0
6777; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
6778; AVX-NEXT:    vorps %ymm0, %ymm1, %ymm0
6779; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6780; AVX-NEXT:    vmovdqa 32(%r8), %xmm3
6781; AVX-NEXT:    vmovdqa 32(%r9), %xmm11
6782; AVX-NEXT:    vpshufb %xmm8, %xmm11, %xmm0
6783; AVX-NEXT:    vpshufb %xmm10, %xmm3, %xmm1
6784; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
6785; AVX-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
6786; AVX-NEXT:    vmovdqa 32(%rax), %xmm8
6787; AVX-NEXT:    vpshufb %xmm4, %xmm8, %xmm1
6788; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
6789; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,xmm8[4,u,u,u,u],zero,zero,xmm8[5,u,u,u,u],zero,zero
6790; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3],xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7]
6791; AVX-NEXT:    vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6792; AVX-NEXT:    vmovdqa %xmm3, %xmm10
6793; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6794; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6795; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[8,9],zero,xmm2[u,u,u,u,10,11],zero,xmm2[u,u,u,u,12,13]
6796; AVX-NEXT:    vpor %xmm1, %xmm2, %xmm1
6797; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm4
6798; AVX-NEXT:    vmovdqa 32(%rcx), %xmm0
6799; AVX-NEXT:    vmovdqa 32(%rdx), %xmm2
6800; AVX-NEXT:    vpshufb %xmm12, %xmm0, %xmm1
6801; AVX-NEXT:    vpshufb %xmm9, %xmm2, %xmm3
6802; AVX-NEXT:    vpor %xmm1, %xmm3, %xmm1
6803; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
6804; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6805; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u]
6806; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm5
6807; AVX-NEXT:    vmovdqa 32(%rsi), %xmm1
6808; AVX-NEXT:    vmovdqa 32(%rdi), %xmm3
6809; AVX-NEXT:    vpshufb %xmm14, %xmm1, %xmm6
6810; AVX-NEXT:    vpshufb %xmm13, %xmm3, %xmm7
6811; AVX-NEXT:    vpor %xmm6, %xmm7, %xmm6
6812; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
6813; AVX-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6814; AVX-NEXT:    vmovdqa {{.*#+}} xmm9 = [u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u]
6815; AVX-NEXT:    vpshufb %xmm9, %xmm7, %xmm7
6816; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm7, %ymm6
6817; AVX-NEXT:    vmovaps {{.*#+}} ymm7 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255]
6818; AVX-NEXT:    vandnps %ymm5, %ymm7, %ymm5
6819; AVX-NEXT:    vandps %ymm7, %ymm6, %ymm6
6820; AVX-NEXT:    vorps %ymm5, %ymm6, %ymm5
6821; AVX-NEXT:    vmovaps {{.*#+}} ymm6 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255]
6822; AVX-NEXT:    vandnps %ymm4, %ymm6, %ymm4
6823; AVX-NEXT:    vandps %ymm6, %ymm5, %ymm5
6824; AVX-NEXT:    vorps %ymm4, %ymm5, %ymm4
6825; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6826; AVX-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6827; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm8[u],zero,zero,xmm8[11,u,u,u,u],zero,zero,xmm8[12,u,u,u,u],zero
6828; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15]
6829; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm5[u,6,7],zero,xmm5[u,u,u,u,8,9],zero,xmm5[u,u,u,u,10]
6830; AVX-NEXT:    vpor %xmm4, %xmm6, %xmm4
6831; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,2,3],zero,xmm5[u,u,u,u,4,5],zero,xmm5[u,u,u]
6832; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm8[u,u,u],zero,zero,xmm8[9,u,u,u,u],zero,zero,xmm8[10,u,u,u]
6833; AVX-NEXT:    vpor %xmm6, %xmm5, %xmm5
6834; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm5, %ymm4
6835; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
6836; AVX-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6837; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
6838; AVX-NEXT:    vpshufb %xmm15, %xmm1, %xmm1
6839; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,8,9,u,u,u,u,u,10,11,u,u,u]
6840; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
6841; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
6842; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
6843; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6844; AVX-NEXT:    vmovdqa {{.*#+}} xmm0 = [6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u]
6845; AVX-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
6846; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6]
6847; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
6848; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255]
6849; AVX-NEXT:    vandnps %ymm1, %ymm2, %ymm1
6850; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
6851; AVX-NEXT:    vorps %ymm1, %ymm0, %ymm0
6852; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0]
6853; AVX-NEXT:    vandnps %ymm4, %ymm2, %ymm1
6854; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
6855; AVX-NEXT:    vorps %ymm1, %ymm0, %ymm0
6856; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6857; AVX-NEXT:    vmovdqa 48(%rax), %xmm12
6858; AVX-NEXT:    vmovdqa 48(%r8), %xmm2
6859; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6860; AVX-NEXT:    vmovdqa 48(%r9), %xmm1
6861; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6862; AVX-NEXT:    vmovdqa {{.*#+}} xmm5 = [128,128,4,u,u,u,u,128,128,5,u,u,u,u,128,128]
6863; AVX-NEXT:    vpshufb %xmm5, %xmm12, %xmm0
6864; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
6865; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6866; AVX-NEXT:    vmovdqa {{.*#+}} xmm14 = [8,9,128,u,u,u,u,10,11,128,u,u,u,u,12,13]
6867; AVX-NEXT:    vpshufb %xmm14, %xmm2, %xmm1
6868; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
6869; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm12[u,u],zero,zero,xmm12[2,u,u,u,u],zero,zero,xmm12[3,u,u,u,u]
6870; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[u,u,4,5],zero,xmm2[u,u,u,u,6,7],zero,xmm2[u,u,u,u]
6871; AVX-NEXT:    vpor %xmm1, %xmm3, %xmm1
6872; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
6873; AVX-NEXT:    vmovdqa 48(%rsi), %xmm1
6874; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6875; AVX-NEXT:    vmovdqa 48(%rdi), %xmm10
6876; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7]
6877; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6878; AVX-NEXT:    vpshufb %xmm9, %xmm2, %xmm1
6879; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u]
6880; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
6881; AVX-NEXT:    vmovdqa 48(%rcx), %xmm2
6882; AVX-NEXT:    vmovdqa %xmm2, (%rsp) # 16-byte Spill
6883; AVX-NEXT:    vmovdqa 48(%rdx), %xmm13
6884; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3],xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7]
6885; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6886; AVX-NEXT:    vmovdqa {{.*#+}} xmm9 = [u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u]
6887; AVX-NEXT:    vpshufb %xmm9, %xmm4, %xmm3
6888; AVX-NEXT:    vpshufb {{.*#+}} xmm11 = xmm4[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9]
6889; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm11, %ymm3
6890; AVX-NEXT:    vandnps %ymm1, %ymm7, %ymm1
6891; AVX-NEXT:    vandps %ymm7, %ymm3, %ymm3
6892; AVX-NEXT:    vorps %ymm1, %ymm3, %ymm1
6893; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0]
6894; AVX-NEXT:    vandnps %ymm0, %ymm2, %ymm0
6895; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
6896; AVX-NEXT:    vorps %ymm0, %ymm1, %ymm0
6897; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6898; AVX-NEXT:    vmovdqa (%r9), %xmm6
6899; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm6[u,u,u,u,u],zero,xmm6[7,u,u,u,u,u],zero,xmm6[8,u,u]
6900; AVX-NEXT:    vmovdqa (%r8), %xmm8
6901; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm8[u,u,u,u,u,7],zero,xmm8[u,u,u,u,u,8],zero,xmm8[u,u]
6902; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
6903; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,xmm0[u,u,u,u,5,6],zero,xmm0[u,u,u,u,12,13],zero,xmm0[u]
6904; AVX-NEXT:    vmovdqa (%rax), %xmm7
6905; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm7[6,u,u,u,u],zero,zero,xmm7[7,u,u,u,u],zero,zero,xmm7[8,u]
6906; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
6907; AVX-NEXT:    vpshufb %xmm5, %xmm7, %xmm1
6908; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
6909; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6910; AVX-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6911; AVX-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6912; AVX-NEXT:    vpshufb %xmm14, %xmm4, %xmm15
6913; AVX-NEXT:    vpor %xmm1, %xmm15, %xmm1
6914; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm15
6915; AVX-NEXT:    vmovdqa (%rcx), %xmm5
6916; AVX-NEXT:    vmovdqa (%rdx), %xmm3
6917; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm5[u,u,u],zero,xmm5[7,u,u,u,u,u],zero,xmm5[8,u,u,u,u]
6918; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,7],zero,xmm3[u,u,u,u,u,8],zero,xmm3[u,u,u,u]
6919; AVX-NEXT:    vpor %xmm0, %xmm14, %xmm0
6920; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
6921; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6922; AVX-NEXT:    vpshufb %xmm9, %xmm1, %xmm14
6923; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm14, %ymm4
6924; AVX-NEXT:    vmovdqa (%rsi), %xmm14
6925; AVX-NEXT:    vmovdqa (%rdi), %xmm2
6926; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm14[u],zero,xmm14[7,u,u,u,u,u],zero,xmm14[8,u,u,u,u,u],zero
6927; AVX-NEXT:    vpshufb {{.*#+}} xmm11 = xmm2[u,7],zero,xmm2[u,u,u,u,u,8],zero,xmm2[u,u,u,u,u,9]
6928; AVX-NEXT:    vpor %xmm1, %xmm11, %xmm1
6929; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7]
6930; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6931; AVX-NEXT:    vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u]
6932; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm11, %ymm1
6933; AVX-NEXT:    vmovaps {{.*#+}} ymm9 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255]
6934; AVX-NEXT:    vandnps %ymm4, %ymm9, %ymm4
6935; AVX-NEXT:    vandps %ymm1, %ymm9, %ymm1
6936; AVX-NEXT:    vorps %ymm4, %ymm1, %ymm1
6937; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255]
6938; AVX-NEXT:    vandnps %ymm15, %ymm0, %ymm4
6939; AVX-NEXT:    vandps %ymm0, %ymm1, %ymm1
6940; AVX-NEXT:    vorps %ymm4, %ymm1, %ymm1
6941; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6942; AVX-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6943; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm7[u],zero,zero,xmm7[11,u,u,u,u],zero,zero,xmm7[12,u,u,u,u],zero
6944; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15]
6945; AVX-NEXT:    vpshufb {{.*#+}} xmm11 = xmm4[u,6,7],zero,xmm4[u,u,u,u,8,9],zero,xmm4[u,u,u,u,10]
6946; AVX-NEXT:    vpor %xmm1, %xmm11, %xmm1
6947; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,2,3],zero,xmm4[u,u,u,u,4,5],zero,xmm4[u,u,u]
6948; AVX-NEXT:    vpshufb {{.*#+}} xmm11 = xmm7[u,u,u],zero,zero,xmm7[9,u,u,u,u],zero,zero,xmm7[10,u,u,u]
6949; AVX-NEXT:    vpor %xmm4, %xmm11, %xmm4
6950; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm4, %ymm1
6951; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm14[8],xmm2[9],xmm14[9],xmm2[10],xmm14[10],xmm2[11],xmm14[11],xmm2[12],xmm14[12],xmm2[13],xmm14[13],xmm2[14],xmm14[14],xmm2[15],xmm14[15]
6952; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6953; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm14[8],xmm2[8],xmm14[9],xmm2[9],xmm14[10],xmm2[10],xmm14[11],xmm2[11],xmm14[12],xmm2[12],xmm14[13],xmm2[13],xmm14[14],xmm2[14],xmm14[15],xmm2[15]
6954; AVX-NEXT:    vmovdqa {{.*#+}} xmm6 = [2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u]
6955; AVX-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
6956; AVX-NEXT:    vmovdqa {{.*#+}} xmm14 = [u,u,u,u,8,9,u,u,u,u,u,10,11,u,u,u]
6957; AVX-NEXT:    vpshufb %xmm14, %xmm4, %xmm4
6958; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
6959; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
6960; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
6961; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6962; AVX-NEXT:    vmovdqa {{.*#+}} xmm7 = [6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u]
6963; AVX-NEXT:    vpshufb %xmm7, %xmm4, %xmm2
6964; AVX-NEXT:    vmovdqa {{.*#+}} xmm5 = [u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6]
6965; AVX-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
6966; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
6967; AVX-NEXT:    vmovaps {{.*#+}} ymm3 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255]
6968; AVX-NEXT:    vandnps %ymm0, %ymm3, %ymm0
6969; AVX-NEXT:    vandps %ymm3, %ymm2, %ymm2
6970; AVX-NEXT:    vorps %ymm0, %ymm2, %ymm0
6971; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0]
6972; AVX-NEXT:    vandnps %ymm1, %ymm2, %ymm1
6973; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
6974; AVX-NEXT:    vorps %ymm1, %ymm0, %ymm0
6975; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6976; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6977; AVX-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
6978; AVX-NEXT:    # xmm1 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
6979; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6980; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
6981; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm3[4,u,u,u,u],zero,zero,xmm3[5,u,u,u,u],zero,zero
6982; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[8,9],zero,xmm1[u,u,u,u,10,11],zero,xmm1[u,u,u,u,12,13]
6983; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
6984; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm3[u,u],zero,zero,xmm3[2,u,u,u,u],zero,zero,xmm3[3,u,u,u,u]
6985; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm1[u,u,4,5],zero,xmm1[u,u,u,u,6,7],zero,xmm1[u,u,u,u]
6986; AVX-NEXT:    vpor %xmm2, %xmm3, %xmm2
6987; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
6988; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6989; AVX-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
6990; AVX-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
6991; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6992; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u]
6993; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm1[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u]
6994; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
6995; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6996; AVX-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
6997; AVX-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
6998; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6999; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm1[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u]
7000; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9]
7001; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm11
7002; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm11, %ymm4
7003; AVX-NEXT:    vandnps %ymm3, %ymm9, %ymm3
7004; AVX-NEXT:    vandps %ymm4, %ymm9, %ymm4
7005; AVX-NEXT:    vorps %ymm3, %ymm4, %ymm3
7006; AVX-NEXT:    vmovaps {{.*#+}} ymm1 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0]
7007; AVX-NEXT:    vandnps %ymm0, %ymm1, %ymm0
7008; AVX-NEXT:    vandps %ymm1, %ymm3, %ymm3
7009; AVX-NEXT:    vorps %ymm0, %ymm3, %ymm0
7010; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7011; AVX-NEXT:    vmovdqa (%rsp), %xmm9 # 16-byte Reload
7012; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm9[u,u,u],zero,xmm9[7,u,u,u,u,u],zero,xmm9[8,u,u,u,u]
7013; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm13[u,u,u,7],zero,xmm13[u,u,u,u,u,8],zero,xmm13[u,u,u,u]
7014; AVX-NEXT:    vpor %xmm0, %xmm3, %xmm3
7015; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15]
7016; AVX-NEXT:    vpshufb %xmm5, %xmm0, %xmm4
7017; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
7018; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
7019; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm8[u],zero,xmm8[7,u,u,u,u,u],zero,xmm8[8,u,u,u,u,u],zero
7020; AVX-NEXT:    vpshufb {{.*#+}} xmm11 = xmm10[u,7],zero,xmm10[u,u,u,u,u,8],zero,xmm10[u,u,u,u,u,9]
7021; AVX-NEXT:    vpor %xmm4, %xmm11, %xmm4
7022; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm11 = xmm8[8],xmm10[8],xmm8[9],xmm10[9],xmm8[10],xmm10[10],xmm8[11],xmm10[11],xmm8[12],xmm10[12],xmm8[13],xmm10[13],xmm8[14],xmm10[14],xmm8[15],xmm10[15]
7023; AVX-NEXT:    vpshufb %xmm6, %xmm11, %xmm11
7024; AVX-NEXT:    vinsertf128 $1, %xmm11, %ymm4, %ymm4
7025; AVX-NEXT:    vmovaps {{.*#+}} ymm1 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0]
7026; AVX-NEXT:    vandnps %ymm3, %ymm1, %ymm3
7027; AVX-NEXT:    vandps %ymm1, %ymm4, %ymm4
7028; AVX-NEXT:    vorps %ymm3, %ymm4, %ymm3
7029; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm4
7030; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[0,1,2],zero,xmm4[u,u,6,7,8,9],zero,xmm4[u,u,13,14,15]
7031; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
7032; AVX-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm6[9,u,u],zero,zero,zero,zero,xmm6[10,u,u],zero,zero,zero
7033; AVX-NEXT:    vpor %xmm4, %xmm11, %xmm4
7034; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3],zero,xmm4[u,6,7,8,9,10],zero,xmm4[u,13,14,15]
7035; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
7036; AVX-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,xmm5[9,u],zero,zero,zero,zero,zero,xmm5[10,u],zero,zero,zero
7037; AVX-NEXT:    vpor %xmm4, %xmm11, %xmm4
7038; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3,4],zero,xmm4[6,7,8,9,10,11],zero,xmm4[13,14,15]
7039; AVX-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,zero,xmm12[9],zero,zero,zero,zero,zero,zero,xmm12[10],zero,zero,zero
7040; AVX-NEXT:    vpor %xmm4, %xmm11, %xmm1
7041; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7042; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,1,2,3,4],zero,xmm3[u,u,8,9,10,11],zero,xmm3[u,u,15]
7043; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm6[u],zero,zero,zero,zero,xmm6[7,u,u],zero,zero,zero,zero,xmm6[8,u,u],zero
7044; AVX-NEXT:    vpor %xmm4, %xmm3, %xmm3
7045; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,1,2,3,4,5],zero,xmm3[u,8,9,10,11,12],zero,xmm3[u,15]
7046; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm5[u],zero,zero,zero,zero,zero,xmm5[7,u],zero,zero,zero,zero,zero,xmm5[8,u],zero
7047; AVX-NEXT:    vpor %xmm4, %xmm3, %xmm3
7048; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = zero,xmm3[1,2,3,4,5,6],zero,xmm3[8,9,10,11,12,13],zero,xmm3[15]
7049; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm12[6],zero,zero,zero,zero,zero,zero,xmm12[7],zero,zero,zero,zero,zero,zero,xmm12[8],zero
7050; AVX-NEXT:    vpor %xmm4, %xmm3, %xmm1
7051; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7052; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15]
7053; AVX-NEXT:    vpshufb %xmm14, %xmm3, %xmm4
7054; AVX-NEXT:    vpmovsxdq {{.*#+}} xmm10 = [218890240,986624]
7055; AVX-NEXT:    vpshufb %xmm10, %xmm3, %xmm3
7056; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
7057; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm9[8],xmm13[8],xmm9[9],xmm13[9],xmm9[10],xmm13[10],xmm9[11],xmm13[11],xmm9[12],xmm13[12],xmm9[13],xmm13[13],xmm9[14],xmm13[14],xmm9[15],xmm13[15]
7058; AVX-NEXT:    vpshufb %xmm7, %xmm4, %xmm4
7059; AVX-NEXT:    vmovdqa {{.*#+}} xmm9 = [u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u]
7060; AVX-NEXT:    vpshufb %xmm9, %xmm0, %xmm0
7061; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm4, %ymm0
7062; AVX-NEXT:    vmovaps {{.*#+}} ymm8 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255]
7063; AVX-NEXT:    vandnps %ymm3, %ymm8, %ymm3
7064; AVX-NEXT:    vandps %ymm0, %ymm8, %ymm0
7065; AVX-NEXT:    vorps %ymm3, %ymm0, %ymm0
7066; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[0],zero,xmm0[u,u,4,5,6,7],zero,xmm0[u,u,11,12,13,14],zero
7067; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = zero,xmm6[11,u,u],zero,zero,zero,zero,xmm6[12,u,u],zero,zero,zero,zero,xmm6[13]
7068; AVX-NEXT:    vpor %xmm4, %xmm3, %xmm3
7069; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,1],zero,xmm3[u,4,5,6,7,8],zero,xmm3[u,11,12,13,14,15]
7070; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,xmm5[11,u],zero,zero,zero,zero,zero,xmm5[12,u],zero,zero,zero,zero,zero
7071; AVX-NEXT:    vpor %xmm4, %xmm3, %xmm3
7072; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,1,2],zero,xmm3[4,5,6,7,8,9],zero,xmm3[11,12,13,14,15]
7073; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm12[11],zero,zero,zero,zero,zero,zero,xmm12[12],zero,zero,zero,zero,zero
7074; AVX-NEXT:    vpor %xmm4, %xmm3, %xmm1
7075; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7076; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
7077; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm6[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
7078; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,xmm0[u,4,5,6,7,0],zero,xmm0[u,11,12,13,14,1],zero,xmm0[u]
7079; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm5[13,u],zero,zero,zero,zero,zero,xmm5[14,u],zero,zero,zero,zero,zero,xmm5[15,u]
7080; AVX-NEXT:    vpor %xmm3, %xmm0, %xmm0
7081; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2,3,4,5,6,7],zero,xmm0[9,10,11,12,13,14],zero
7082; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = zero,xmm12[13],zero,zero,zero,zero,zero,zero,xmm12[14],zero,zero,zero,zero,zero,zero,xmm12[15]
7083; AVX-NEXT:    vpor %xmm3, %xmm0, %xmm0
7084; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7085; AVX-NEXT:    vpmovsxdq {{.*#+}} xmm15 = [16777216,197120]
7086; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7087; AVX-NEXT:    vpshufb %xmm15, %xmm0, %xmm0
7088; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7089; AVX-NEXT:    vpshufb %xmm9, %xmm1, %xmm3
7090; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
7091; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,u,u,u,u,u,2,3,u,u,u,u,u,4,5]
7092; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7093; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm4
7094; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7095; AVX-NEXT:    vpshufb %xmm10, %xmm1, %xmm7
7096; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm7, %ymm4
7097; AVX-NEXT:    vandnps %ymm0, %ymm8, %ymm0
7098; AVX-NEXT:    vandps %ymm4, %ymm8, %ymm4
7099; AVX-NEXT:    vorps %ymm0, %ymm4, %ymm0
7100; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7101; AVX-NEXT:    vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
7102; AVX-NEXT:    # xmm4 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15]
7103; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[10],zero,xmm4[u,u,u,u,13,12],zero,xmm4[u,u,u,u,15,14],zero
7104; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
7105; AVX-NEXT:    vpshufb {{.*#+}} xmm7 = zero,xmm14[13,u,u,u,u],zero,zero,xmm14[14,u,u,u,u],zero,zero,xmm14[15]
7106; AVX-NEXT:    vpor %xmm7, %xmm4, %xmm7
7107; AVX-NEXT:    vmovdqa {{.*#+}} xmm4 = [u,u,u,u,0,1,u,u,u,u,u,2,3,u,u,u]
7108; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7109; AVX-NEXT:    vpshufb %xmm4, %xmm1, %xmm10
7110; AVX-NEXT:    vpalignr {{.*#+}} xmm5 = xmm10[4,5,6,7,8,9,10,11,12,13,14,15],xmm12[0,1,2,3]
7111; AVX-NEXT:    vmovdqa {{.*#+}} xmm11 = [u,u,u,u,0,1,12,u,u,u,u,7,8,13,u,u]
7112; AVX-NEXT:    vpshufb %xmm11, %xmm5, %xmm5
7113; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm7, %ymm5
7114; AVX-NEXT:    vmovaps {{.*#+}} ymm7 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255]
7115; AVX-NEXT:    vandps %ymm7, %ymm0, %ymm0
7116; AVX-NEXT:    vandnps %ymm5, %ymm7, %ymm5
7117; AVX-NEXT:    vorps %ymm5, %ymm0, %ymm0
7118; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7119; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
7120; AVX-NEXT:    vpshufb %xmm2, %xmm8, %xmm5
7121; AVX-NEXT:    vpshufb %xmm15, %xmm8, %xmm10
7122; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm10, %ymm5
7123; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
7124; AVX-NEXT:    vmovdqa {{.*#+}} xmm6 = [u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u]
7125; AVX-NEXT:    vpshufb %xmm6, %xmm8, %xmm10
7126; AVX-NEXT:    vpshufb %xmm3, %xmm8, %xmm12
7127; AVX-NEXT:    vinsertf128 $1, %xmm10, %ymm12, %ymm10
7128; AVX-NEXT:    vmovaps {{.*#+}} ymm12 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0]
7129; AVX-NEXT:    vandnps %ymm5, %ymm12, %ymm5
7130; AVX-NEXT:    vandps %ymm12, %ymm10, %ymm10
7131; AVX-NEXT:    vorps %ymm5, %ymm10, %ymm5
7132; AVX-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,u,128,128,2,u,u,u,u,128,128,3,u,u,u,u]
7133; AVX-NEXT:    vpshufb %xmm0, %xmm14, %xmm10
7134; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
7135; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,u,4,5,128,u,u,u,u,6,7,128,u,u,u,u]
7136; AVX-NEXT:    vpshufb %xmm1, %xmm8, %xmm13
7137; AVX-NEXT:    vpor %xmm10, %xmm13, %xmm10
7138; AVX-NEXT:    vpshufb %xmm4, %xmm8, %xmm13
7139; AVX-NEXT:    vpalignr {{.*#+}} xmm13 = xmm13[4,5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3]
7140; AVX-NEXT:    vpshufb %xmm11, %xmm13, %xmm13
7141; AVX-NEXT:    vinsertf128 $1, %xmm10, %ymm13, %ymm10
7142; AVX-NEXT:    vmovaps {{.*#+}} ymm13 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
7143; AVX-NEXT:    vandps %ymm5, %ymm13, %ymm5
7144; AVX-NEXT:    vandnps %ymm10, %ymm13, %ymm10
7145; AVX-NEXT:    vorps %ymm5, %ymm10, %ymm10
7146; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
7147; AVX-NEXT:    vpshufb %xmm2, %xmm8, %xmm5
7148; AVX-NEXT:    vpshufb %xmm15, %xmm8, %xmm14
7149; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm14, %ymm5
7150; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7151; AVX-NEXT:    vpshufb %xmm6, %xmm2, %xmm14
7152; AVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm8
7153; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm8, %ymm8
7154; AVX-NEXT:    vandnps %ymm5, %ymm12, %ymm5
7155; AVX-NEXT:    vandps %ymm12, %ymm8, %ymm8
7156; AVX-NEXT:    vorps %ymm5, %ymm8, %ymm5
7157; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
7158; AVX-NEXT:    vpshufb %xmm0, %xmm6, %xmm8
7159; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7160; AVX-NEXT:    vpshufb %xmm1, %xmm0, %xmm12
7161; AVX-NEXT:    vpor %xmm8, %xmm12, %xmm8
7162; AVX-NEXT:    vpshufb %xmm4, %xmm0, %xmm12
7163; AVX-NEXT:    vpalignr {{.*#+}} xmm12 = xmm12[4,5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3]
7164; AVX-NEXT:    vpshufb %xmm11, %xmm12, %xmm12
7165; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm12, %ymm8
7166; AVX-NEXT:    vandps %ymm5, %ymm13, %ymm5
7167; AVX-NEXT:    vandnps %ymm8, %ymm13, %ymm8
7168; AVX-NEXT:    vorps %ymm5, %ymm8, %ymm5
7169; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7170; AVX-NEXT:    vpshufb %xmm15, %xmm0, %xmm8
7171; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7172; AVX-NEXT:    vpshufb %xmm9, %xmm0, %xmm9
7173; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm9, %ymm8
7174; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7175; AVX-NEXT:    vpshufb %xmm3, %xmm0, %xmm2
7176; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7177; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u]
7178; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
7179; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255]
7180; AVX-NEXT:    vandnps %ymm8, %ymm0, %ymm3
7181; AVX-NEXT:    vandps %ymm0, %ymm2, %ymm2
7182; AVX-NEXT:    vorps %ymm3, %ymm2, %ymm2
7183; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = zero,xmm6[13,u,u,u,u],zero,zero,xmm6[14,u,u,u,u],zero,zero,xmm6[15]
7184; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7185; AVX-NEXT:    vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
7186; AVX-NEXT:    # xmm6 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15]
7187; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[10],zero,xmm6[u,u,u,u,13,12],zero,xmm6[u,u,u,u,15,14],zero
7188; AVX-NEXT:    vpor %xmm3, %xmm6, %xmm3
7189; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7190; AVX-NEXT:    vpshufb %xmm4, %xmm0, %xmm1
7191; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7192; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3]
7193; AVX-NEXT:    vpshufb %xmm11, %xmm1, %xmm1
7194; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
7195; AVX-NEXT:    vandps %ymm7, %ymm2, %ymm2
7196; AVX-NEXT:    vandnps %ymm1, %ymm7, %ymm1
7197; AVX-NEXT:    vorps %ymm1, %ymm2, %ymm1
7198; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
7199; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7200; AVX-NEXT:    vmovaps %ymm0, 128(%rax)
7201; AVX-NEXT:    vmovaps %ymm1, 96(%rax)
7202; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7203; AVX-NEXT:    vmovaps %ymm0, 64(%rax)
7204; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7205; AVX-NEXT:    vmovaps %ymm0, 32(%rax)
7206; AVX-NEXT:    vmovaps %ymm5, (%rax)
7207; AVX-NEXT:    vmovaps %ymm10, 224(%rax)
7208; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7209; AVX-NEXT:    vmovaps %ymm1, 352(%rax)
7210; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7211; AVX-NEXT:    vmovaps %ymm0, 320(%rax)
7212; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7213; AVX-NEXT:    vmovaps %ymm0, 288(%rax)
7214; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7215; AVX-NEXT:    vmovaps %ymm0, 256(%rax)
7216; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7217; AVX-NEXT:    vmovaps %ymm0, 160(%rax)
7218; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7219; AVX-NEXT:    vmovaps %ymm0, 192(%rax)
7220; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7221; AVX-NEXT:    vmovaps %xmm0, 432(%rax)
7222; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7223; AVX-NEXT:    vmovaps %xmm0, 416(%rax)
7224; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7225; AVX-NEXT:    vmovaps %xmm0, 384(%rax)
7226; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7227; AVX-NEXT:    vmovaps %xmm0, 400(%rax)
7228; AVX-NEXT:    addq $616, %rsp # imm = 0x268
7229; AVX-NEXT:    vzeroupper
7230; AVX-NEXT:    retq
7231;
7232; AVX2-LABEL: store_i8_stride7_vf64:
7233; AVX2:       # %bb.0:
7234; AVX2-NEXT:    subq $824, %rsp # imm = 0x338
7235; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
7236; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm0
7237; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7238; AVX2-NEXT:    vmovdqa 32(%rsi), %ymm2
7239; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7240; AVX2-NEXT:    vmovdqa 32(%rdx), %ymm6
7241; AVX2-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7242; AVX2-NEXT:    vmovdqa 32(%rcx), %ymm7
7243; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7244; AVX2-NEXT:    vmovdqa 32(%r8), %ymm5
7245; AVX2-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7246; AVX2-NEXT:    vmovdqa 32(%r9), %ymm4
7247; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7248; AVX2-NEXT:    vmovdqa 32(%rax), %ymm3
7249; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7250; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[23],zero,ymm0[27,20,21,26],zero,ymm0[24],zero,ymm0[26,27,26,27],zero,ymm0[25]
7251; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero,ymm2[27],zero
7252; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
7253; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
7254; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero
7255; AVX2-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero,ymm6[27]
7256; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
7257; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
7258; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0]
7259; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
7260; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,24,25,26,27,24,25,30,31]
7261; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
7262; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255]
7263; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
7264; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,25,24,23,u,u,u,u,u,u,u,u,u]
7265; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
7266; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255]
7267; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
7268; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
7269; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
7270; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255]
7271; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
7272; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7273; AVX2-NEXT:    vmovdqa (%r8), %ymm0
7274; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7275; AVX2-NEXT:    vmovdqa (%r9), %ymm1
7276; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7277; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,ymm0[27,28,29,30],zero,ymm0[28],zero,ymm0[26,27,30,31],zero,ymm0[29]
7278; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero
7279; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
7280; AVX2-NEXT:    vmovdqa (%rax), %ymm1
7281; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7282; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
7283; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
7284; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
7285; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0]
7286; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
7287; AVX2-NEXT:    vmovdqa (%rdx), %ymm1
7288; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7289; AVX2-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
7290; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,3,3,4,6,7,7]
7291; AVX2-NEXT:    vmovdqa (%rcx), %ymm2
7292; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7293; AVX2-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[11,u,u,u,u,14,u,12,u,u,u,u,15,u,13,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u]
7294; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [255,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0]
7295; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
7296; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7297; AVX2-NEXT:    vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
7298; AVX2-NEXT:    vmovdqa (%rdi), %ymm2
7299; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7300; AVX2-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14]
7301; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,2,3,3,6,6,7,7]
7302; AVX2-NEXT:    vmovdqa (%rsi), %ymm3
7303; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7304; AVX2-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,14,u,12,u,u,u,u,15,u,13,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u]
7305; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0]
7306; AVX2-NEXT:    # ymm4 = mem[0,1,0,1]
7307; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7308; AVX2-NEXT:    vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
7309; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
7310; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3]
7311; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u]
7312; AVX2-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
7313; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0]
7314; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
7315; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7316; AVX2-NEXT:    vmovdqa 32(%rsi), %xmm13
7317; AVX2-NEXT:    vmovdqa 32(%rdi), %xmm14
7318; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
7319; AVX2-NEXT:    vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7320; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
7321; AVX2-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
7322; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
7323; AVX2-NEXT:    vmovdqa 32(%rcx), %xmm11
7324; AVX2-NEXT:    vmovdqa 32(%rdx), %xmm9
7325; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3],xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7]
7326; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
7327; AVX2-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
7328; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
7329; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0]
7330; AVX2-NEXT:    vpblendvb %ymm5, %ymm0, %ymm2, %ymm10
7331; AVX2-NEXT:    vmovdqa (%rsi), %xmm0
7332; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7333; AVX2-NEXT:    vmovdqa (%rdi), %xmm6
7334; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
7335; AVX2-NEXT:    vmovdqa %xmm6, (%rsp) # 16-byte Spill
7336; AVX2-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
7337; AVX2-NEXT:    vmovdqa (%rcx), %xmm1
7338; AVX2-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7339; AVX2-NEXT:    vmovdqa (%rdx), %xmm7
7340; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3],xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7]
7341; AVX2-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7342; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
7343; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
7344; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
7345; AVX2-NEXT:    vpblendvb %ymm5, %ymm0, %ymm1, %ymm4
7346; AVX2-NEXT:    vmovdqa 32(%r9), %xmm0
7347; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7348; AVX2-NEXT:    vmovdqa 32(%r8), %xmm8
7349; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3],xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7]
7350; AVX2-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
7351; AVX2-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
7352; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
7353; AVX2-NEXT:    vmovdqa 32(%rax), %xmm2
7354; AVX2-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7355; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm2[1,1,0,0,4,5,6,7]
7356; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,0]
7357; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,1,0]
7358; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u]
7359; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm3, %ymm1
7360; AVX2-NEXT:    vmovdqa (%r9), %xmm3
7361; AVX2-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7362; AVX2-NEXT:    vmovdqa (%r8), %xmm5
7363; AVX2-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7364; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
7365; AVX2-NEXT:    vpshufb %xmm0, %xmm12, %xmm0
7366; AVX2-NEXT:    vpermq {{.*#+}} ymm12 = ymm0[0,1,0,1]
7367; AVX2-NEXT:    vmovdqa (%rax), %xmm0
7368; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7369; AVX2-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm0[1,1,0,0,4,5,6,7]
7370; AVX2-NEXT:    vpshufd {{.*#+}} xmm15 = xmm15[0,1,2,0]
7371; AVX2-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,0,1,0]
7372; AVX2-NEXT:    vpblendvb %ymm2, %ymm12, %ymm15, %ymm2
7373; AVX2-NEXT:    vmovdqa {{.*#+}} ymm12 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
7374; AVX2-NEXT:    vpblendvb %ymm12, %ymm10, %ymm1, %ymm0
7375; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7376; AVX2-NEXT:    vpblendvb %ymm12, %ymm4, %ymm2, %ymm0
7377; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7378; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
7379; AVX2-NEXT:    vpshufb %xmm1, %xmm13, %xmm2
7380; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
7381; AVX2-NEXT:    vpshufb %xmm4, %xmm14, %xmm12
7382; AVX2-NEXT:    vpor %xmm2, %xmm12, %xmm2
7383; AVX2-NEXT:    vmovdqa {{.*#+}} xmm15 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
7384; AVX2-NEXT:    vpshufb %xmm15, %xmm11, %xmm12
7385; AVX2-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
7386; AVX2-NEXT:    vpshufb %xmm0, %xmm9, %xmm14
7387; AVX2-NEXT:    vpor %xmm12, %xmm14, %xmm12
7388; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
7389; AVX2-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1]
7390; AVX2-NEXT:    vmovdqa {{.*#+}} ymm14 = [u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255]
7391; AVX2-NEXT:    vpblendvb %ymm14, %ymm2, %ymm12, %ymm2
7392; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7393; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7394; AVX2-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
7395; AVX2-NEXT:    vpshufb %xmm4, %xmm6, %xmm2
7396; AVX2-NEXT:    vpor %xmm1, %xmm2, %xmm1
7397; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
7398; AVX2-NEXT:    vpshufb %xmm15, %xmm6, %xmm2
7399; AVX2-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
7400; AVX2-NEXT:    vpor %xmm2, %xmm0, %xmm0
7401; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
7402; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
7403; AVX2-NEXT:    vpblendvb %ymm14, %ymm1, %ymm0, %ymm12
7404; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
7405; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
7406; AVX2-NEXT:    vpshufb %xmm1, %xmm10, %xmm2
7407; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128]
7408; AVX2-NEXT:    vmovdqa %xmm8, %xmm3
7409; AVX2-NEXT:    vpshufb %xmm4, %xmm8, %xmm14
7410; AVX2-NEXT:    vpor %xmm2, %xmm14, %xmm2
7411; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
7412; AVX2-NEXT:    vmovdqa {{.*#+}} xmm14 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
7413; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
7414; AVX2-NEXT:    vpshufb %xmm14, %xmm5, %xmm15
7415; AVX2-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,0,1,0]
7416; AVX2-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u]
7417; AVX2-NEXT:    vpblendvb %ymm0, %ymm2, %ymm15, %ymm2
7418; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
7419; AVX2-NEXT:    vpshufb %xmm1, %xmm8, %xmm1
7420; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
7421; AVX2-NEXT:    vpshufb %xmm4, %xmm15, %xmm4
7422; AVX2-NEXT:    vpor %xmm1, %xmm4, %xmm1
7423; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
7424; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
7425; AVX2-NEXT:    vpshufb %xmm14, %xmm7, %xmm4
7426; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,1,0]
7427; AVX2-NEXT:    vpblendvb %ymm0, %ymm1, %ymm4, %ymm0
7428; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255]
7429; AVX2-NEXT:    vpblendvb %ymm1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
7430; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7431; AVX2-NEXT:    vpblendvb %ymm1, %ymm12, %ymm0, %ymm0
7432; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7433; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15]
7434; AVX2-NEXT:    vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload
7435; AVX2-NEXT:    # xmm1 = xmm13[8],mem[8],xmm13[9],mem[9],xmm13[10],mem[10],xmm13[11],mem[11],xmm13[12],mem[12],xmm13[13],mem[13],xmm13[14],mem[14],xmm13[15],mem[15]
7436; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
7437; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
7438; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
7439; AVX2-NEXT:    vmovdqa {{.*#+}} xmm9 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
7440; AVX2-NEXT:    vpshufb %xmm9, %xmm1, %xmm1
7441; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
7442; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u]
7443; AVX2-NEXT:    vpblendvb %ymm4, %ymm0, %ymm1, %ymm0
7444; AVX2-NEXT:    vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm1 # 16-byte Folded Reload
7445; AVX2-NEXT:    # xmm1 = xmm6[8],mem[8],xmm6[9],mem[9],xmm6[10],mem[10],xmm6[11],mem[11],xmm6[12],mem[12],xmm6[13],mem[13],xmm6[14],mem[14],xmm6[15],mem[15]
7446; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
7447; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7448; AVX2-NEXT:    vpunpckhbw (%rsp), %xmm2, %xmm2 # 16-byte Folded Reload
7449; AVX2-NEXT:    # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15]
7450; AVX2-NEXT:    vpshufb %xmm9, %xmm2, %xmm2
7451; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
7452; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
7453; AVX2-NEXT:    vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
7454; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15]
7455; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
7456; AVX2-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
7457; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
7458; AVX2-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5,5,6]
7459; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3]
7460; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1]
7461; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255]
7462; AVX2-NEXT:    vpblendvb %ymm5, %ymm2, %ymm4, %ymm2
7463; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm15[8],xmm8[8],xmm15[9],xmm8[9],xmm15[10],xmm8[10],xmm15[11],xmm8[11],xmm15[12],xmm8[12],xmm15[13],xmm8[13],xmm15[14],xmm8[14],xmm15[15],xmm8[15]
7464; AVX2-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
7465; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1]
7466; AVX2-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,5,5,6]
7467; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3]
7468; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1]
7469; AVX2-NEXT:    vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
7470; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0]
7471; AVX2-NEXT:    vpblendvb %ymm4, %ymm0, %ymm2, %ymm0
7472; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7473; AVX2-NEXT:    vpblendvb %ymm4, %ymm1, %ymm3, %ymm0
7474; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7475; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
7476; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
7477; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7478; AVX2-NEXT:    vpshufb %ymm3, %ymm0, %ymm2
7479; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128]
7480; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7481; AVX2-NEXT:    vpshufb %ymm4, %ymm0, %ymm5
7482; AVX2-NEXT:    vpor %ymm2, %ymm5, %ymm2
7483; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3]
7484; AVX2-NEXT:    vpshuflw $150, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
7485; AVX2-NEXT:    # ymm5 = mem[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
7486; AVX2-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[0,1,1,3,4,5,5,7]
7487; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,3,2]
7488; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255]
7489; AVX2-NEXT:    vpblendvb %ymm6, %ymm2, %ymm5, %ymm0
7490; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7491; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
7492; AVX2-NEXT:    vpshufb %ymm3, %ymm8, %ymm3
7493; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
7494; AVX2-NEXT:    vpshufb %ymm4, %ymm13, %ymm4
7495; AVX2-NEXT:    vpor %ymm3, %ymm4, %ymm3
7496; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3]
7497; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
7498; AVX2-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm14[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
7499; AVX2-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,5,7]
7500; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,3,2]
7501; AVX2-NEXT:    vpblendvb %ymm6, %ymm3, %ymm4, %ymm0
7502; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7503; AVX2-NEXT:    vpshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
7504; AVX2-NEXT:    # ymm4 = mem[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
7505; AVX2-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[0,0,1,1,4,4,5,5]
7506; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm5 = [5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6]
7507; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
7508; AVX2-NEXT:    vpshufb %ymm5, %ymm11, %ymm6
7509; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0]
7510; AVX2-NEXT:    # ymm7 = mem[0,1,0,1]
7511; AVX2-NEXT:    vpblendvb %ymm7, %ymm4, %ymm6, %ymm4
7512; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7513; AVX2-NEXT:    vpshufb %ymm5, %ymm1, %ymm5
7514; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7515; AVX2-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm0[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
7516; AVX2-NEXT:    vpshufd {{.*#+}} ymm6 = ymm6[0,0,1,1,4,4,5,5]
7517; AVX2-NEXT:    vpblendvb %ymm7, %ymm6, %ymm5, %ymm5
7518; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20]
7519; AVX2-NEXT:    # ymm6 = mem[0,1,0,1]
7520; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
7521; AVX2-NEXT:    vpshufb %ymm6, %ymm2, %ymm7
7522; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128]
7523; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
7524; AVX2-NEXT:    vpshufb %ymm3, %ymm10, %ymm9
7525; AVX2-NEXT:    vpor %ymm7, %ymm9, %ymm7
7526; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3]
7527; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3]
7528; AVX2-NEXT:    vmovdqa {{.*#+}} ymm9 = [255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u]
7529; AVX2-NEXT:    vpblendvb %ymm9, %ymm7, %ymm4, %ymm4
7530; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
7531; AVX2-NEXT:    vpshufb %ymm6, %ymm15, %ymm6
7532; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
7533; AVX2-NEXT:    vpshufb %ymm3, %ymm12, %ymm7
7534; AVX2-NEXT:    vpor %ymm6, %ymm7, %ymm6
7535; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3]
7536; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3]
7537; AVX2-NEXT:    vpblendvb %ymm9, %ymm6, %ymm5, %ymm5
7538; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0]
7539; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7540; AVX2-NEXT:    vpblendvb %ymm6, %ymm4, %ymm3, %ymm3
7541; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7542; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7543; AVX2-NEXT:    vpblendvb %ymm6, %ymm5, %ymm3, %ymm3
7544; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7545; AVX2-NEXT:    vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[23],zero,ymm0[27,20,21,26],zero,ymm0[24],zero,ymm0[26,27,26,27],zero,ymm0[25]
7546; AVX2-NEXT:    vmovdqa %ymm1, %ymm0
7547; AVX2-NEXT:    vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero,zero,ymm1[27],zero
7548; AVX2-NEXT:    vpor %ymm4, %ymm5, %ymm4
7549; AVX2-NEXT:    vpshufb {{.*#+}} ymm5 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm15[25],zero,ymm15[23],zero,zero,zero,zero,ymm15[26],zero,ymm15[24],zero,zero,zero,zero
7550; AVX2-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[25],zero,ymm12[23],zero,zero,zero,zero,ymm12[26],zero,ymm12[24],zero,zero,zero,zero,ymm12[27]
7551; AVX2-NEXT:    vpor %ymm5, %ymm6, %ymm5
7552; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3]
7553; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3]
7554; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0]
7555; AVX2-NEXT:    vpblendvb %ymm3, %ymm4, %ymm5, %ymm4
7556; AVX2-NEXT:    vpshufb {{.*#+}} ymm5 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[25],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero
7557; AVX2-NEXT:    vpshufb {{.*#+}} ymm6 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm13[25],zero,ymm13[23],zero,zero,zero,zero,ymm13[26],zero,ymm13[24],zero,zero,zero
7558; AVX2-NEXT:    vpor %ymm5, %ymm6, %ymm5
7559; AVX2-NEXT:    vpshufb {{.*#+}} ymm6 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
7560; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3]
7561; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3]
7562; AVX2-NEXT:    vmovdqa {{.*#+}} ymm7 = [0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u]
7563; AVX2-NEXT:    vpblendvb %ymm7, %ymm5, %ymm6, %ymm5
7564; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255]
7565; AVX2-NEXT:    vpblendvb %ymm6, %ymm4, %ymm5, %ymm1
7566; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7567; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18]
7568; AVX2-NEXT:    vpshufb %ymm6, %ymm11, %ymm5
7569; AVX2-NEXT:    vmovdqa {{.*#+}} ymm7 = [0,128,14,128,128,128,128,1,128,15,128,128,128,128,2,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128,18,128]
7570; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
7571; AVX2-NEXT:    vpshufb %ymm7, %ymm4, %ymm8
7572; AVX2-NEXT:    vpor %ymm5, %ymm8, %ymm5
7573; AVX2-NEXT:    vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128]
7574; AVX2-NEXT:    vpshufb %ymm8, %ymm2, %ymm9
7575; AVX2-NEXT:    vmovdqa %ymm2, %ymm3
7576; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,0,128,14,128,128,128,128,1,128,15,128,128,128,128,128,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128]
7577; AVX2-NEXT:    vpshufb %ymm1, %ymm10, %ymm11
7578; AVX2-NEXT:    vmovdqa %ymm10, %ymm2
7579; AVX2-NEXT:    vpor %ymm9, %ymm11, %ymm9
7580; AVX2-NEXT:    vmovdqa {{.*#+}} ymm11 = [u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255]
7581; AVX2-NEXT:    vpblendvb %ymm11, %ymm5, %ymm9, %ymm5
7582; AVX2-NEXT:    vpshufb %ymm6, %ymm0, %ymm6
7583; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7584; AVX2-NEXT:    vpshufb %ymm7, %ymm0, %ymm7
7585; AVX2-NEXT:    vpor %ymm6, %ymm7, %ymm6
7586; AVX2-NEXT:    vpshufb %ymm8, %ymm15, %ymm7
7587; AVX2-NEXT:    vpshufb %ymm1, %ymm12, %ymm8
7588; AVX2-NEXT:    vpor %ymm7, %ymm8, %ymm7
7589; AVX2-NEXT:    vpblendvb %ymm11, %ymm6, %ymm7, %ymm6
7590; AVX2-NEXT:    vmovdqa {{.*#+}} ymm7 = [128,1,2,3,0,128,14,128,0,1,0,1,128,15,128,15,128,17,18,19,16,128,30,128,16,17,16,17,128,31,128,31]
7591; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7592; AVX2-NEXT:    vpshufb %ymm7, %ymm0, %ymm8
7593; AVX2-NEXT:    vmovdqa {{.*#+}} ymm9 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128]
7594; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
7595; AVX2-NEXT:    vpshufb %ymm9, %ymm15, %ymm10
7596; AVX2-NEXT:    vpor %ymm8, %ymm10, %ymm8
7597; AVX2-NEXT:    vmovdqa {{.*#+}} ymm10 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31]
7598; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7599; AVX2-NEXT:    vpshufb %ymm10, %ymm1, %ymm11
7600; AVX2-NEXT:    vmovdqa {{.*#+}} ymm12 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u]
7601; AVX2-NEXT:    vpblendvb %ymm12, %ymm8, %ymm11, %ymm8
7602; AVX2-NEXT:    vpshufb %ymm7, %ymm13, %ymm7
7603; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
7604; AVX2-NEXT:    vpshufb %ymm9, %ymm11, %ymm9
7605; AVX2-NEXT:    vpor %ymm7, %ymm9, %ymm7
7606; AVX2-NEXT:    vpshufb %ymm10, %ymm14, %ymm9
7607; AVX2-NEXT:    vpblendvb %ymm12, %ymm7, %ymm9, %ymm7
7608; AVX2-NEXT:    vmovdqa {{.*#+}} ymm9 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255]
7609; AVX2-NEXT:    vpblendvb %ymm9, %ymm5, %ymm8, %ymm5
7610; AVX2-NEXT:    vpblendvb %ymm9, %ymm6, %ymm7, %ymm6
7611; AVX2-NEXT:    vpshufb {{.*#+}} ymm7 = ymm3[11,u,u,u,u,14,u,12,u,u,u,u,15,u,13,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u]
7612; AVX2-NEXT:    vpshufhw {{.*#+}} ymm8 = ymm2[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
7613; AVX2-NEXT:    vpshufd {{.*#+}} ymm8 = ymm8[0,2,3,3,4,6,7,7]
7614; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
7615; AVX2-NEXT:    vpblendvb %ymm2, %ymm7, %ymm8, %ymm7
7616; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
7617; AVX2-NEXT:    vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,14,u,12,u,u,u,u,15,u,13,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u]
7618; AVX2-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm4[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14]
7619; AVX2-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,2,3,3,6,6,7,7]
7620; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
7621; AVX2-NEXT:    vpblendvb %ymm2, %ymm9, %ymm8, %ymm8
7622; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3]
7623; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
7624; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u]
7625; AVX2-NEXT:    vpblendvb %ymm2, %ymm7, %ymm8, %ymm7
7626; AVX2-NEXT:    vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,26,27,30,31,30,31,28,29,28,29,28,29,28,29]
7627; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,3,2,2]
7628; AVX2-NEXT:    vmovdqa {{.*#+}} ymm9 = [255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u]
7629; AVX2-NEXT:    vpblendvb %ymm9, %ymm7, %ymm8, %ymm7
7630; AVX2-NEXT:    vpshufb {{.*#+}} ymm8 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,28,27,u,u,u,31,30,u,u,u,u,u,u,u,u]
7631; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2]
7632; AVX2-NEXT:    vmovdqa {{.*#+}} ymm9 = [255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u]
7633; AVX2-NEXT:    vpblendvb %ymm9, %ymm7, %ymm8, %ymm7
7634; AVX2-NEXT:    vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
7635; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
7636; AVX2-NEXT:    vmovdqa {{.*#+}} ymm9 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0]
7637; AVX2-NEXT:    vpblendvb %ymm9, %ymm7, %ymm8, %ymm7
7638; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
7639; AVX2-NEXT:    vmovdqa %ymm6, 96(%rax)
7640; AVX2-NEXT:    vmovdqa %ymm5, 320(%rax)
7641; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7642; AVX2-NEXT:    vmovaps %ymm0, 160(%rax)
7643; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7644; AVX2-NEXT:    vmovaps %ymm0, 128(%rax)
7645; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7646; AVX2-NEXT:    vmovaps %ymm0, 64(%rax)
7647; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7648; AVX2-NEXT:    vmovaps %ymm0, 32(%rax)
7649; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7650; AVX2-NEXT:    vmovaps %ymm0, (%rax)
7651; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7652; AVX2-NEXT:    vmovaps %ymm0, 224(%rax)
7653; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7654; AVX2-NEXT:    vmovaps %ymm0, 352(%rax)
7655; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7656; AVX2-NEXT:    vmovaps %ymm0, 288(%rax)
7657; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7658; AVX2-NEXT:    vmovaps %ymm0, 256(%rax)
7659; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7660; AVX2-NEXT:    vmovaps %ymm0, 192(%rax)
7661; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7662; AVX2-NEXT:    vmovaps %ymm0, 384(%rax)
7663; AVX2-NEXT:    vmovdqa %ymm7, 416(%rax)
7664; AVX2-NEXT:    addq $824, %rsp # imm = 0x338
7665; AVX2-NEXT:    vzeroupper
7666; AVX2-NEXT:    retq
7667;
7668; AVX2-FP-LABEL: store_i8_stride7_vf64:
7669; AVX2-FP:       # %bb.0:
7670; AVX2-FP-NEXT:    subq $616, %rsp # imm = 0x268
7671; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
7672; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm8
7673; AVX2-FP-NEXT:    vmovdqa 32(%rsi), %ymm9
7674; AVX2-FP-NEXT:    vmovdqa 32(%rdx), %ymm6
7675; AVX2-FP-NEXT:    vmovdqa 32(%rcx), %ymm2
7676; AVX2-FP-NEXT:    vmovdqa 32(%r8), %ymm4
7677; AVX2-FP-NEXT:    vmovdqa 32(%r9), %ymm5
7678; AVX2-FP-NEXT:    vmovdqa 32(%rax), %ymm3
7679; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[23],zero,ymm8[27,20,21,26],zero,ymm8[24],zero,ymm8[26,27,26,27],zero,ymm8[25]
7680; AVX2-FP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7681; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm9[23],zero,zero,zero,zero,ymm9[26],zero,ymm9[24],zero,zero,zero,zero,ymm9[27],zero
7682; AVX2-FP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7683; AVX2-FP-NEXT:    vpor %ymm0, %ymm1, %ymm0
7684; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
7685; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero
7686; AVX2-FP-NEXT:    vmovdqa %ymm2, %ymm7
7687; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7688; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero,ymm6[27]
7689; AVX2-FP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7690; AVX2-FP-NEXT:    vpor %ymm1, %ymm2, %ymm1
7691; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
7692; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0]
7693; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
7694; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,24,25,26,27,24,25,30,31]
7695; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7696; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
7697; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255]
7698; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
7699; AVX2-FP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7700; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,25,24,23,u,u,u,u,u,u,u,u,u]
7701; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
7702; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255]
7703; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
7704; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
7705; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7706; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
7707; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255]
7708; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
7709; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7710; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[17,18,19,30],zero,ymm6[28],zero,ymm6[28,29,30,31],zero,ymm6[29],zero,ymm6[31]
7711; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm7[30],zero,ymm7[28],zero,zero,zero,zero,ymm7[31],zero,ymm7[29],zero
7712; AVX2-FP-NEXT:    vpor %ymm0, %ymm1, %ymm0
7713; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm9[30],zero,ymm9[28],zero,zero,zero,zero,ymm9[31],zero,ymm9[29],zero,zero,zero
7714; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero,ymm8[29],zero,zero,zero,zero
7715; AVX2-FP-NEXT:    vpor %ymm1, %ymm2, %ymm1
7716; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
7717; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
7718; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u]
7719; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
7720; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,26,27,30,31,30,31,28,29,28,29,28,29,28,29]
7721; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2]
7722; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u]
7723; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
7724; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,28,27,u,u,u,31,30,u,u,u,u,u,u,u,u]
7725; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
7726; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u]
7727; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
7728; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
7729; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
7730; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0]
7731; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
7732; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7733; AVX2-FP-NEXT:    vmovdqa 32(%rsi), %xmm15
7734; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %xmm7
7735; AVX2-FP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3],xmm7[4],xmm15[4],xmm7[5],xmm15[5],xmm7[6],xmm15[6],xmm7[7],xmm15[7]
7736; AVX2-FP-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7737; AVX2-FP-NEXT:    vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7738; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
7739; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
7740; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
7741; AVX2-FP-NEXT:    vmovdqa 32(%rcx), %xmm9
7742; AVX2-FP-NEXT:    vmovdqa 32(%rdx), %xmm11
7743; AVX2-FP-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7]
7744; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
7745; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
7746; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
7747; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0]
7748; AVX2-FP-NEXT:    vpblendvb %ymm5, %ymm0, %ymm2, %ymm0
7749; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7750; AVX2-FP-NEXT:    vmovdqa (%rsi), %xmm0
7751; AVX2-FP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7752; AVX2-FP-NEXT:    vmovdqa (%rdi), %xmm4
7753; AVX2-FP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
7754; AVX2-FP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7755; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
7756; AVX2-FP-NEXT:    vmovdqa (%rcx), %xmm1
7757; AVX2-FP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7758; AVX2-FP-NEXT:    vmovdqa (%rdx), %xmm10
7759; AVX2-FP-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7]
7760; AVX2-FP-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7761; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
7762; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
7763; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
7764; AVX2-FP-NEXT:    vpblendvb %ymm5, %ymm0, %ymm1, %ymm3
7765; AVX2-FP-NEXT:    vmovdqa 32(%rax), %xmm0
7766; AVX2-FP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7767; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm6 = [2,3,2,3,0,1,0,1,8,9,10,11,2,3,2,3]
7768; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm0, %xmm1
7769; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,1,0]
7770; AVX2-FP-NEXT:    vmovdqa 32(%r9), %xmm0
7771; AVX2-FP-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
7772; AVX2-FP-NEXT:    vmovdqa 32(%r8), %xmm14
7773; AVX2-FP-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
7774; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
7775; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm5, %xmm5
7776; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1]
7777; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u]
7778; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm5, %ymm1, %ymm1
7779; AVX2-FP-NEXT:    vmovdqa (%r9), %xmm5
7780; AVX2-FP-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7781; AVX2-FP-NEXT:    vmovdqa (%r8), %xmm8
7782; AVX2-FP-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7783; AVX2-FP-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7]
7784; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm12, %xmm12
7785; AVX2-FP-NEXT:    vmovdqa (%rax), %xmm13
7786; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm13, %xmm6
7787; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,1,0]
7788; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1]
7789; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm12, %ymm6, %ymm2
7790; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
7791; AVX2-FP-NEXT:    vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
7792; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7793; AVX2-FP-NEXT:    vpblendvb %ymm6, %ymm3, %ymm2, %ymm0
7794; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7795; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
7796; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm15, %xmm2
7797; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm3 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
7798; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm7, %xmm6
7799; AVX2-FP-NEXT:    vpor %xmm2, %xmm6, %xmm2
7800; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm6 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
7801; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm9, %xmm12
7802; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
7803; AVX2-FP-NEXT:    vmovdqa %xmm11, %xmm7
7804; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm11, %xmm15
7805; AVX2-FP-NEXT:    vpor %xmm12, %xmm15, %xmm12
7806; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
7807; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1]
7808; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm15 = [u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255]
7809; AVX2-FP-NEXT:    vpblendvb %ymm15, %ymm2, %ymm12, %ymm2
7810; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7811; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
7812; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm12, %xmm1
7813; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm4, %xmm2
7814; AVX2-FP-NEXT:    vpor %xmm1, %xmm2, %xmm1
7815; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
7816; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm5, %xmm2
7817; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm10, %xmm0
7818; AVX2-FP-NEXT:    vpor %xmm2, %xmm0, %xmm0
7819; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
7820; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
7821; AVX2-FP-NEXT:    vpblendvb %ymm15, %ymm1, %ymm0, %ymm11
7822; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
7823; AVX2-FP-NEXT:    vmovdqa (%rsp), %xmm8 # 16-byte Reload
7824; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm8, %xmm2
7825; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128]
7826; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm14, %xmm6
7827; AVX2-FP-NEXT:    vpor %xmm2, %xmm6, %xmm2
7828; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
7829; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm6 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
7830; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
7831; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm4, %xmm15
7832; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,0,1,0]
7833; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u]
7834; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm2, %ymm15, %ymm2
7835; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
7836; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm10, %xmm1
7837; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
7838; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm15, %xmm3
7839; AVX2-FP-NEXT:    vpor %xmm1, %xmm3, %xmm1
7840; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
7841; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm13, %xmm3
7842; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,1,0]
7843; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm1, %ymm3, %ymm0
7844; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255]
7845; AVX2-FP-NEXT:    vpblendvb %ymm1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
7846; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7847; AVX2-FP-NEXT:    vpblendvb %ymm1, %ymm11, %ymm0, %ymm0
7848; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7849; AVX2-FP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15]
7850; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7851; AVX2-FP-NEXT:    vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
7852; AVX2-FP-NEXT:    # xmm1 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15]
7853; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
7854; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
7855; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
7856; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm3 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
7857; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
7858; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
7859; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u]
7860; AVX2-FP-NEXT:    vpblendvb %ymm6, %ymm0, %ymm1, %ymm0
7861; AVX2-FP-NEXT:    vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload
7862; AVX2-FP-NEXT:    # xmm1 = xmm5[8],mem[8],xmm5[9],mem[9],xmm5[10],mem[10],xmm5[11],mem[11],xmm5[12],mem[12],xmm5[13],mem[13],xmm5[14],mem[14],xmm5[15],mem[15]
7863; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
7864; AVX2-FP-NEXT:    vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm2 # 16-byte Folded Reload
7865; AVX2-FP-NEXT:    # xmm2 = xmm12[8],mem[8],xmm12[9],mem[9],xmm12[10],mem[10],xmm12[11],mem[11],xmm12[12],mem[12],xmm12[13],mem[13],xmm12[14],mem[14],xmm12[15],mem[15]
7866; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
7867; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
7868; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
7869; AVX2-FP-NEXT:    vpblendvb %ymm6, %ymm1, %ymm2, %ymm1
7870; AVX2-FP-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm14[8],xmm8[8],xmm14[9],xmm8[9],xmm14[10],xmm8[10],xmm14[11],xmm8[11],xmm14[12],xmm8[12],xmm14[13],xmm8[13],xmm14[14],xmm8[14],xmm14[15],xmm8[15]
7871; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,9,10,11,8,9,10,11,10,11,12,13,10,11,12,13]
7872; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm4, %xmm4
7873; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1]
7874; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm6 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
7875; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
7876; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
7877; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm7 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255]
7878; AVX2-FP-NEXT:    vpblendvb %ymm7, %ymm2, %ymm4, %ymm2
7879; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm13, %xmm3
7880; AVX2-FP-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm15[8],xmm10[8],xmm15[9],xmm10[9],xmm15[10],xmm10[10],xmm15[11],xmm10[11],xmm15[12],xmm10[12],xmm15[13],xmm10[13],xmm15[14],xmm10[14],xmm15[15],xmm10[15]
7881; AVX2-FP-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
7882; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1]
7883; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1]
7884; AVX2-FP-NEXT:    vpblendvb %ymm7, %ymm4, %ymm3, %ymm3
7885; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0]
7886; AVX2-FP-NEXT:    vpblendvb %ymm4, %ymm0, %ymm2, %ymm0
7887; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7888; AVX2-FP-NEXT:    vpblendvb %ymm4, %ymm1, %ymm3, %ymm0
7889; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7890; AVX2-FP-NEXT:    vmovdqa (%rdx), %ymm3
7891; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[17,18,19,30],zero,ymm3[28],zero,ymm3[28,29,30,31],zero,ymm3[29],zero,ymm3[31]
7892; AVX2-FP-NEXT:    vmovdqa (%rcx), %ymm5
7893; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29],zero
7894; AVX2-FP-NEXT:    vpor %ymm0, %ymm1, %ymm2
7895; AVX2-FP-NEXT:    vmovdqa (%rsi), %ymm11
7896; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29],zero,zero,zero
7897; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm13
7898; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[30],zero,ymm13[28],zero,zero,zero,zero,ymm13[31],zero,ymm13[29],zero,zero,zero,zero
7899; AVX2-FP-NEXT:    vpor %ymm4, %ymm6, %ymm4
7900; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3]
7901; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3]
7902; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u]
7903; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm2, %ymm4, %ymm6
7904; AVX2-FP-NEXT:    vmovdqa (%r8), %ymm14
7905; AVX2-FP-NEXT:    vmovdqa (%r9), %ymm1
7906; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm14[27],zero,ymm14[27,28,29,30],zero,ymm14[28],zero,ymm14[26,27,30,31],zero,ymm14[29]
7907; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero
7908; AVX2-FP-NEXT:    vmovdqu %ymm1, (%rsp) # 32-byte Spill
7909; AVX2-FP-NEXT:    vpor %ymm2, %ymm7, %ymm7
7910; AVX2-FP-NEXT:    vmovdqa (%rax), %ymm10
7911; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
7912; AVX2-FP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7913; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
7914; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3]
7915; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm9 = [u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0]
7916; AVX2-FP-NEXT:    vpblendvb %ymm9, %ymm7, %ymm8, %ymm7
7917; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm8 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0]
7918; AVX2-FP-NEXT:    vpblendvb %ymm8, %ymm6, %ymm7, %ymm0
7919; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7920; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[23],zero,ymm13[27,20,21,26],zero,ymm13[24],zero,ymm13[26,27,26,27],zero,ymm13[25]
7921; AVX2-FP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7922; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm11[23],zero,zero,zero,zero,ymm11[26],zero,ymm11[24],zero,zero,zero,zero,ymm11[27],zero
7923; AVX2-FP-NEXT:    vmovdqa %ymm11, %ymm2
7924; AVX2-FP-NEXT:    vpor %ymm6, %ymm8, %ymm6
7925; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm5[25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero,zero
7926; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7927; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27]
7928; AVX2-FP-NEXT:    vpor %ymm8, %ymm9, %ymm8
7929; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3]
7930; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
7931; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm0 = [u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0]
7932; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm6, %ymm8, %ymm6
7933; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero
7934; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm14[25],zero,ymm14[23],zero,zero,zero,zero,ymm14[26],zero,ymm14[24],zero,zero,zero
7935; AVX2-FP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7936; AVX2-FP-NEXT:    vpor %ymm8, %ymm9, %ymm8
7937; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
7938; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3]
7939; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
7940; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm10 = [0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u]
7941; AVX2-FP-NEXT:    vpblendvb %ymm10, %ymm8, %ymm9, %ymm8
7942; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm9 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255]
7943; AVX2-FP-NEXT:    vpblendvb %ymm9, %ymm6, %ymm8, %ymm0
7944; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7945; AVX2-FP-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20]
7946; AVX2-FP-NEXT:    # ymm9 = mem[0,1,0,1]
7947; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7948; AVX2-FP-NEXT:    vpshufb %ymm9, %ymm0, %ymm8
7949; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128]
7950; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7951; AVX2-FP-NEXT:    vpshufb %ymm10, %ymm0, %ymm11
7952; AVX2-FP-NEXT:    vpor %ymm8, %ymm11, %ymm8
7953; AVX2-FP-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128]
7954; AVX2-FP-NEXT:    # ymm11 = mem[0,1,0,1]
7955; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
7956; AVX2-FP-NEXT:    vpshufb %ymm11, %ymm7, %ymm12
7957; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128]
7958; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7959; AVX2-FP-NEXT:    vpshufb %ymm0, %ymm1, %ymm15
7960; AVX2-FP-NEXT:    vpor %ymm12, %ymm15, %ymm12
7961; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
7962; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3]
7963; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm15 = [255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u]
7964; AVX2-FP-NEXT:    vpblendvb %ymm15, %ymm8, %ymm12, %ymm4
7965; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7966; AVX2-FP-NEXT:    vpshufb %ymm9, %ymm5, %ymm9
7967; AVX2-FP-NEXT:    vpshufb %ymm10, %ymm3, %ymm10
7968; AVX2-FP-NEXT:    vpor %ymm9, %ymm10, %ymm9
7969; AVX2-FP-NEXT:    vmovdqa %ymm2, %ymm3
7970; AVX2-FP-NEXT:    vpshufb %ymm11, %ymm2, %ymm10
7971; AVX2-FP-NEXT:    vpshufb %ymm0, %ymm13, %ymm11
7972; AVX2-FP-NEXT:    vpor %ymm10, %ymm11, %ymm10
7973; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3]
7974; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3]
7975; AVX2-FP-NEXT:    vpblendvb %ymm15, %ymm9, %ymm10, %ymm9
7976; AVX2-FP-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
7977; AVX2-FP-NEXT:    # ymm10 = mem[0,1,0,1]
7978; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
7979; AVX2-FP-NEXT:    vpshufb %ymm10, %ymm8, %ymm11
7980; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128]
7981; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
7982; AVX2-FP-NEXT:    vpshufb %ymm12, %ymm4, %ymm13
7983; AVX2-FP-NEXT:    vpor %ymm11, %ymm13, %ymm11
7984; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3]
7985; AVX2-FP-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31,20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31]
7986; AVX2-FP-NEXT:    # ymm13 = mem[0,1,0,1]
7987; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
7988; AVX2-FP-NEXT:    vpshufb %ymm13, %ymm6, %ymm15
7989; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,3,2]
7990; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm0 = [u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255]
7991; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm11, %ymm15, %ymm11
7992; AVX2-FP-NEXT:    vmovdqu (%rsp), %ymm15 # 32-byte Reload
7993; AVX2-FP-NEXT:    vpshufb %ymm10, %ymm15, %ymm10
7994; AVX2-FP-NEXT:    vpshufb %ymm12, %ymm14, %ymm12
7995; AVX2-FP-NEXT:    vpor %ymm10, %ymm12, %ymm10
7996; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3]
7997; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
7998; AVX2-FP-NEXT:    vpshufb %ymm13, %ymm2, %ymm12
7999; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,2,3,2]
8000; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm10, %ymm12, %ymm0
8001; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm10 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0]
8002; AVX2-FP-NEXT:    vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
8003; AVX2-FP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8004; AVX2-FP-NEXT:    vpblendvb %ymm10, %ymm9, %ymm0, %ymm9
8005; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18]
8006; AVX2-FP-NEXT:    vpshufb %ymm0, %ymm7, %ymm10
8007; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm11 = [0,128,14,128,128,128,128,1,128,15,128,128,128,128,2,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128,18,128]
8008; AVX2-FP-NEXT:    vpshufb %ymm11, %ymm1, %ymm12
8009; AVX2-FP-NEXT:    vpor %ymm10, %ymm12, %ymm10
8010; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128]
8011; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8012; AVX2-FP-NEXT:    vpshufb %ymm12, %ymm1, %ymm13
8013; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm14 = [128,128,0,128,14,128,128,128,128,1,128,15,128,128,128,128,128,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128]
8014; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8015; AVX2-FP-NEXT:    vpshufb %ymm14, %ymm1, %ymm7
8016; AVX2-FP-NEXT:    vpor %ymm7, %ymm13, %ymm7
8017; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm13 = [u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255]
8018; AVX2-FP-NEXT:    vpblendvb %ymm13, %ymm10, %ymm7, %ymm10
8019; AVX2-FP-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
8020; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8021; AVX2-FP-NEXT:    vpshufb %ymm11, %ymm1, %ymm1
8022; AVX2-FP-NEXT:    vpor %ymm0, %ymm1, %ymm0
8023; AVX2-FP-NEXT:    vpshufb %ymm12, %ymm5, %ymm1
8024; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
8025; AVX2-FP-NEXT:    vpshufb %ymm14, %ymm3, %ymm3
8026; AVX2-FP-NEXT:    vpor %ymm1, %ymm3, %ymm1
8027; AVX2-FP-NEXT:    vpblendvb %ymm13, %ymm0, %ymm1, %ymm0
8028; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,1,2,3,0,128,14,128,0,1,0,1,128,15,128,15,128,17,18,19,16,128,30,128,16,17,16,17,128,31,128,31]
8029; AVX2-FP-NEXT:    vpshufb %ymm1, %ymm4, %ymm3
8030; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm5 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128]
8031; AVX2-FP-NEXT:    vpshufb %ymm5, %ymm8, %ymm7
8032; AVX2-FP-NEXT:    vpor %ymm3, %ymm7, %ymm3
8033; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm7 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31]
8034; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm6, %ymm11
8035; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm12 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u]
8036; AVX2-FP-NEXT:    vpblendvb %ymm12, %ymm3, %ymm11, %ymm3
8037; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
8038; AVX2-FP-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
8039; AVX2-FP-NEXT:    vpshufb %ymm5, %ymm15, %ymm4
8040; AVX2-FP-NEXT:    vpor %ymm1, %ymm4, %ymm1
8041; AVX2-FP-NEXT:    vpshufb %ymm7, %ymm2, %ymm2
8042; AVX2-FP-NEXT:    vpblendvb %ymm12, %ymm1, %ymm2, %ymm1
8043; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255]
8044; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm10, %ymm3, %ymm3
8045; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
8046; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
8047; AVX2-FP-NEXT:    vmovdqa %ymm0, 96(%rax)
8048; AVX2-FP-NEXT:    vmovdqa %ymm3, 320(%rax)
8049; AVX2-FP-NEXT:    vmovdqa %ymm9, 128(%rax)
8050; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8051; AVX2-FP-NEXT:    vmovaps %ymm0, 352(%rax)
8052; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8053; AVX2-FP-NEXT:    vmovaps %ymm0, 160(%rax)
8054; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8055; AVX2-FP-NEXT:    vmovaps %ymm0, 192(%rax)
8056; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8057; AVX2-FP-NEXT:    vmovaps %ymm0, 64(%rax)
8058; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8059; AVX2-FP-NEXT:    vmovaps %ymm0, 32(%rax)
8060; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8061; AVX2-FP-NEXT:    vmovaps %ymm0, (%rax)
8062; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8063; AVX2-FP-NEXT:    vmovaps %ymm0, 224(%rax)
8064; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8065; AVX2-FP-NEXT:    vmovaps %ymm0, 288(%rax)
8066; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8067; AVX2-FP-NEXT:    vmovaps %ymm0, 256(%rax)
8068; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8069; AVX2-FP-NEXT:    vmovaps %ymm0, 416(%rax)
8070; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8071; AVX2-FP-NEXT:    vmovaps %ymm0, 384(%rax)
8072; AVX2-FP-NEXT:    addq $616, %rsp # imm = 0x268
8073; AVX2-FP-NEXT:    vzeroupper
8074; AVX2-FP-NEXT:    retq
8075;
8076; AVX2-FCP-LABEL: store_i8_stride7_vf64:
8077; AVX2-FCP:       # %bb.0:
8078; AVX2-FCP-NEXT:    subq $616, %rsp # imm = 0x268
8079; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
8080; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm8
8081; AVX2-FCP-NEXT:    vmovdqa 32(%rsi), %ymm9
8082; AVX2-FCP-NEXT:    vmovdqa 32(%rdx), %ymm6
8083; AVX2-FCP-NEXT:    vmovdqa 32(%rcx), %ymm2
8084; AVX2-FCP-NEXT:    vmovdqa 32(%r8), %ymm4
8085; AVX2-FCP-NEXT:    vmovdqa 32(%r9), %ymm5
8086; AVX2-FCP-NEXT:    vmovdqa 32(%rax), %ymm3
8087; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[23],zero,ymm8[27,20,21,26],zero,ymm8[24],zero,ymm8[26,27,26,27],zero,ymm8[25]
8088; AVX2-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8089; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm9[23],zero,zero,zero,zero,ymm9[26],zero,ymm9[24],zero,zero,zero,zero,ymm9[27],zero
8090; AVX2-FCP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8091; AVX2-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
8092; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
8093; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero
8094; AVX2-FCP-NEXT:    vmovdqa %ymm2, %ymm7
8095; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8096; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero,ymm6[27]
8097; AVX2-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8098; AVX2-FCP-NEXT:    vpor %ymm1, %ymm2, %ymm1
8099; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
8100; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0]
8101; AVX2-FCP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
8102; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,24,25,26,27,24,25,30,31]
8103; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8104; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
8105; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255]
8106; AVX2-FCP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
8107; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8108; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,25,24,23,u,u,u,u,u,u,u,u,u]
8109; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
8110; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255]
8111; AVX2-FCP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
8112; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
8113; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8114; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
8115; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255]
8116; AVX2-FCP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
8117; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8118; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[17,18,19,30],zero,ymm6[28],zero,ymm6[28,29,30,31],zero,ymm6[29],zero,ymm6[31]
8119; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm7[30],zero,ymm7[28],zero,zero,zero,zero,ymm7[31],zero,ymm7[29],zero
8120; AVX2-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
8121; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm9[30],zero,ymm9[28],zero,zero,zero,zero,ymm9[31],zero,ymm9[29],zero,zero,zero
8122; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero,ymm8[29],zero,zero,zero,zero
8123; AVX2-FCP-NEXT:    vpor %ymm1, %ymm2, %ymm1
8124; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
8125; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
8126; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u]
8127; AVX2-FCP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
8128; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,26,27,30,31,30,31,28,29,28,29,28,29,28,29]
8129; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2]
8130; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u]
8131; AVX2-FCP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
8132; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,28,27,u,u,u,31,30,u,u,u,u,u,u,u,u]
8133; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
8134; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u]
8135; AVX2-FCP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
8136; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
8137; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
8138; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0]
8139; AVX2-FCP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
8140; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8141; AVX2-FCP-NEXT:    vmovdqa 32(%rsi), %xmm14
8142; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %xmm0
8143; AVX2-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8144; AVX2-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
8145; AVX2-FCP-NEXT:    vmovdqa %xmm14, (%rsp) # 16-byte Spill
8146; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
8147; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
8148; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
8149; AVX2-FCP-NEXT:    vmovdqa 32(%rcx), %xmm10
8150; AVX2-FCP-NEXT:    vmovdqa 32(%rdx), %xmm7
8151; AVX2-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
8152; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
8153; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
8154; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
8155; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0]
8156; AVX2-FCP-NEXT:    vpblendvb %ymm4, %ymm0, %ymm2, %ymm0
8157; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8158; AVX2-FCP-NEXT:    vmovdqa (%rsi), %xmm11
8159; AVX2-FCP-NEXT:    vmovdqa (%rdi), %xmm12
8160; AVX2-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
8161; AVX2-FCP-NEXT:    vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8162; AVX2-FCP-NEXT:    vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8163; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
8164; AVX2-FCP-NEXT:    vmovdqa (%rcx), %xmm2
8165; AVX2-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8166; AVX2-FCP-NEXT:    vmovdqa (%rdx), %xmm1
8167; AVX2-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8168; AVX2-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
8169; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
8170; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
8171; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
8172; AVX2-FCP-NEXT:    vpblendvb %ymm4, %ymm0, %ymm1, %ymm6
8173; AVX2-FCP-NEXT:    vmovdqa 32(%rax), %xmm0
8174; AVX2-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8175; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[1,1,0,0,4,5,6,7]
8176; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,1,0,1,2,0,0,1]
8177; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm3, %ymm4
8178; AVX2-FCP-NEXT:    vmovdqa 32(%r9), %xmm8
8179; AVX2-FCP-NEXT:    vmovdqa 32(%r8), %xmm13
8180; AVX2-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3],xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7]
8181; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
8182; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm5, %xmm5
8183; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1]
8184; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u]
8185; AVX2-FCP-NEXT:    vpblendvb %ymm1, %ymm5, %ymm4, %ymm0
8186; AVX2-FCP-NEXT:    vmovdqa (%rax), %xmm4
8187; AVX2-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8188; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[1,1,0,0,4,5,6,7]
8189; AVX2-FCP-NEXT:    vpermd %ymm4, %ymm3, %ymm3
8190; AVX2-FCP-NEXT:    vmovdqa (%r9), %xmm9
8191; AVX2-FCP-NEXT:    vmovdqa (%r8), %xmm5
8192; AVX2-FCP-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8193; AVX2-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7]
8194; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm15, %xmm2
8195; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
8196; AVX2-FCP-NEXT:    vpblendvb %ymm1, %ymm2, %ymm3, %ymm1
8197; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
8198; AVX2-FCP-NEXT:    vpblendvb %ymm2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
8199; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8200; AVX2-FCP-NEXT:    vpblendvb %ymm2, %ymm6, %ymm1, %ymm0
8201; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8202; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
8203; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm14, %xmm0
8204; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
8205; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8206; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
8207; AVX2-FCP-NEXT:    vpor %xmm0, %xmm3, %xmm0
8208; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
8209; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm10, %xmm6
8210; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
8211; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm7, %xmm14
8212; AVX2-FCP-NEXT:    vpor %xmm6, %xmm14, %xmm6
8213; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
8214; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1]
8215; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255]
8216; AVX2-FCP-NEXT:    vpblendvb %ymm14, %ymm0, %ymm6, %ymm0
8217; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8218; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm11, %xmm1
8219; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm12, %xmm2
8220; AVX2-FCP-NEXT:    vpor %xmm1, %xmm2, %xmm1
8221; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
8222; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm5, %xmm2
8223; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8224; AVX2-FCP-NEXT:    vpshufb %xmm15, %xmm4, %xmm3
8225; AVX2-FCP-NEXT:    vpor %xmm2, %xmm3, %xmm2
8226; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
8227; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
8228; AVX2-FCP-NEXT:    vpblendvb %ymm14, %ymm1, %ymm2, %ymm1
8229; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
8230; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm8, %xmm3
8231; AVX2-FCP-NEXT:    vmovdqa %xmm8, %xmm11
8232; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128]
8233; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm13, %xmm14
8234; AVX2-FCP-NEXT:    vpor %xmm3, %xmm14, %xmm3
8235; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1]
8236; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm14 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
8237; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
8238; AVX2-FCP-NEXT:    vpshufb %xmm14, %xmm8, %xmm15
8239; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,0,1,0]
8240; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u]
8241; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm3, %ymm15, %ymm3
8242; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm9, %xmm2
8243; AVX2-FCP-NEXT:    vmovdqa %xmm9, %xmm12
8244; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
8245; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm9, %xmm6
8246; AVX2-FCP-NEXT:    vpor %xmm2, %xmm6, %xmm2
8247; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
8248; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8249; AVX2-FCP-NEXT:    vpshufb %xmm14, %xmm15, %xmm6
8250; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,1,0]
8251; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm2, %ymm6, %ymm0
8252; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255]
8253; AVX2-FCP-NEXT:    vpblendvb %ymm2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
8254; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8255; AVX2-FCP-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
8256; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8257; AVX2-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm7[8],xmm10[9],xmm7[9],xmm10[10],xmm7[10],xmm10[11],xmm7[11],xmm10[12],xmm7[12],xmm10[13],xmm7[13],xmm10[14],xmm7[14],xmm10[15],xmm7[15]
8258; AVX2-FCP-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
8259; AVX2-FCP-NEXT:    vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
8260; AVX2-FCP-NEXT:    # xmm1 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15]
8261; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
8262; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
8263; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
8264; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
8265; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
8266; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
8267; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u]
8268; AVX2-FCP-NEXT:    vpblendvb %ymm6, %ymm0, %ymm1, %ymm0
8269; AVX2-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
8270; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
8271; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8272; AVX2-FCP-NEXT:    vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
8273; AVX2-FCP-NEXT:    # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15]
8274; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
8275; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
8276; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
8277; AVX2-FCP-NEXT:    vpblendvb %ymm6, %ymm1, %ymm2, %ymm1
8278; AVX2-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15]
8279; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,5,5,6]
8280; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [2,2,3,3,2,2,3,3]
8281; AVX2-FCP-NEXT:    # ymm6 = mem[0,1,0,1]
8282; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm6, %ymm3
8283; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
8284; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm2, %xmm2
8285; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
8286; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255]
8287; AVX2-FCP-NEXT:    vpblendvb %ymm5, %ymm2, %ymm3, %ymm2
8288; AVX2-FCP-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,4,5,5,6]
8289; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm6, %ymm3
8290; AVX2-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15]
8291; AVX2-FCP-NEXT:    vpshufb %xmm7, %xmm4, %xmm4
8292; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1]
8293; AVX2-FCP-NEXT:    vpblendvb %ymm5, %ymm4, %ymm3, %ymm3
8294; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0]
8295; AVX2-FCP-NEXT:    vpblendvb %ymm4, %ymm0, %ymm2, %ymm0
8296; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8297; AVX2-FCP-NEXT:    vpblendvb %ymm4, %ymm1, %ymm3, %ymm0
8298; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8299; AVX2-FCP-NEXT:    vmovdqa (%rdx), %ymm3
8300; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[17,18,19,30],zero,ymm3[28],zero,ymm3[28,29,30,31],zero,ymm3[29],zero,ymm3[31]
8301; AVX2-FCP-NEXT:    vmovdqa (%rcx), %ymm5
8302; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29],zero
8303; AVX2-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm2
8304; AVX2-FCP-NEXT:    vmovdqa (%rsi), %ymm11
8305; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29],zero,zero,zero
8306; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm13
8307; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[30],zero,ymm13[28],zero,zero,zero,zero,ymm13[31],zero,ymm13[29],zero,zero,zero,zero
8308; AVX2-FCP-NEXT:    vpor %ymm4, %ymm6, %ymm4
8309; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3]
8310; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3]
8311; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u]
8312; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm2, %ymm4, %ymm6
8313; AVX2-FCP-NEXT:    vmovdqa (%r8), %ymm14
8314; AVX2-FCP-NEXT:    vmovdqa (%r9), %ymm1
8315; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm14[27],zero,ymm14[27,28,29,30],zero,ymm14[28],zero,ymm14[26,27,30,31],zero,ymm14[29]
8316; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero
8317; AVX2-FCP-NEXT:    vmovdqu %ymm1, (%rsp) # 32-byte Spill
8318; AVX2-FCP-NEXT:    vpor %ymm2, %ymm7, %ymm7
8319; AVX2-FCP-NEXT:    vmovdqa (%rax), %ymm10
8320; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
8321; AVX2-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8322; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
8323; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3]
8324; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0]
8325; AVX2-FCP-NEXT:    vpblendvb %ymm9, %ymm7, %ymm8, %ymm7
8326; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0]
8327; AVX2-FCP-NEXT:    vpblendvb %ymm8, %ymm6, %ymm7, %ymm0
8328; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8329; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[23],zero,ymm13[27,20,21,26],zero,ymm13[24],zero,ymm13[26,27,26,27],zero,ymm13[25]
8330; AVX2-FCP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8331; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm11[23],zero,zero,zero,zero,ymm11[26],zero,ymm11[24],zero,zero,zero,zero,ymm11[27],zero
8332; AVX2-FCP-NEXT:    vmovdqa %ymm11, %ymm2
8333; AVX2-FCP-NEXT:    vpor %ymm6, %ymm8, %ymm6
8334; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm5[25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero,zero
8335; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8336; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27]
8337; AVX2-FCP-NEXT:    vpor %ymm8, %ymm9, %ymm8
8338; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3]
8339; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
8340; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0]
8341; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm6, %ymm8, %ymm6
8342; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero
8343; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm14[25],zero,ymm14[23],zero,zero,zero,zero,ymm14[26],zero,ymm14[24],zero,zero,zero
8344; AVX2-FCP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8345; AVX2-FCP-NEXT:    vpor %ymm8, %ymm9, %ymm8
8346; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
8347; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3]
8348; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
8349; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u]
8350; AVX2-FCP-NEXT:    vpblendvb %ymm10, %ymm8, %ymm9, %ymm8
8351; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255]
8352; AVX2-FCP-NEXT:    vpblendvb %ymm9, %ymm6, %ymm8, %ymm0
8353; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8354; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20]
8355; AVX2-FCP-NEXT:    # ymm9 = mem[0,1,0,1]
8356; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8357; AVX2-FCP-NEXT:    vpshufb %ymm9, %ymm0, %ymm8
8358; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128]
8359; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8360; AVX2-FCP-NEXT:    vpshufb %ymm10, %ymm0, %ymm11
8361; AVX2-FCP-NEXT:    vpor %ymm8, %ymm11, %ymm8
8362; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128]
8363; AVX2-FCP-NEXT:    # ymm11 = mem[0,1,0,1]
8364; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
8365; AVX2-FCP-NEXT:    vpshufb %ymm11, %ymm7, %ymm12
8366; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128]
8367; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8368; AVX2-FCP-NEXT:    vpshufb %ymm0, %ymm1, %ymm15
8369; AVX2-FCP-NEXT:    vpor %ymm12, %ymm15, %ymm12
8370; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
8371; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3]
8372; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm15 = [255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u]
8373; AVX2-FCP-NEXT:    vpblendvb %ymm15, %ymm8, %ymm12, %ymm4
8374; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8375; AVX2-FCP-NEXT:    vpshufb %ymm9, %ymm5, %ymm9
8376; AVX2-FCP-NEXT:    vpshufb %ymm10, %ymm3, %ymm10
8377; AVX2-FCP-NEXT:    vpor %ymm9, %ymm10, %ymm9
8378; AVX2-FCP-NEXT:    vmovdqa %ymm2, %ymm3
8379; AVX2-FCP-NEXT:    vpshufb %ymm11, %ymm2, %ymm10
8380; AVX2-FCP-NEXT:    vpshufb %ymm0, %ymm13, %ymm11
8381; AVX2-FCP-NEXT:    vpor %ymm10, %ymm11, %ymm10
8382; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3]
8383; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3]
8384; AVX2-FCP-NEXT:    vpblendvb %ymm15, %ymm9, %ymm10, %ymm9
8385; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
8386; AVX2-FCP-NEXT:    # ymm10 = mem[0,1,0,1]
8387; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
8388; AVX2-FCP-NEXT:    vpshufb %ymm10, %ymm8, %ymm11
8389; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128]
8390; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
8391; AVX2-FCP-NEXT:    vpshufb %ymm12, %ymm2, %ymm13
8392; AVX2-FCP-NEXT:    vpor %ymm11, %ymm13, %ymm11
8393; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3]
8394; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
8395; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} ymm13 = ymm6[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
8396; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm15 = [4,5,4,5,5,7,4,5]
8397; AVX2-FCP-NEXT:    vpermd %ymm13, %ymm15, %ymm13
8398; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255]
8399; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm11, %ymm13, %ymm11
8400; AVX2-FCP-NEXT:    vmovdqu (%rsp), %ymm4 # 32-byte Reload
8401; AVX2-FCP-NEXT:    vpshufb %ymm10, %ymm4, %ymm10
8402; AVX2-FCP-NEXT:    vpshufb %ymm12, %ymm14, %ymm12
8403; AVX2-FCP-NEXT:    vpor %ymm10, %ymm12, %ymm10
8404; AVX2-FCP-NEXT:    vpshuflw $150, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
8405; AVX2-FCP-NEXT:    # ymm12 = mem[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
8406; AVX2-FCP-NEXT:    vpermd %ymm12, %ymm15, %ymm12
8407; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3]
8408; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm10, %ymm12, %ymm0
8409; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0]
8410; AVX2-FCP-NEXT:    vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm14 # 32-byte Folded Reload
8411; AVX2-FCP-NEXT:    vpblendvb %ymm10, %ymm9, %ymm0, %ymm9
8412; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18]
8413; AVX2-FCP-NEXT:    vpshufb %ymm0, %ymm7, %ymm10
8414; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [0,128,14,128,128,128,128,1,128,15,128,128,128,128,2,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128,18,128]
8415; AVX2-FCP-NEXT:    vpshufb %ymm11, %ymm1, %ymm12
8416; AVX2-FCP-NEXT:    vpor %ymm10, %ymm12, %ymm10
8417; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128]
8418; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8419; AVX2-FCP-NEXT:    vpshufb %ymm12, %ymm1, %ymm13
8420; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm15 = [128,128,0,128,14,128,128,128,128,1,128,15,128,128,128,128,128,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128]
8421; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8422; AVX2-FCP-NEXT:    vpshufb %ymm15, %ymm1, %ymm7
8423; AVX2-FCP-NEXT:    vpor %ymm7, %ymm13, %ymm7
8424; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255]
8425; AVX2-FCP-NEXT:    vpblendvb %ymm13, %ymm10, %ymm7, %ymm10
8426; AVX2-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
8427; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8428; AVX2-FCP-NEXT:    vpshufb %ymm11, %ymm1, %ymm1
8429; AVX2-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
8430; AVX2-FCP-NEXT:    vpshufb %ymm12, %ymm5, %ymm1
8431; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
8432; AVX2-FCP-NEXT:    vpshufb %ymm15, %ymm3, %ymm3
8433; AVX2-FCP-NEXT:    vpor %ymm1, %ymm3, %ymm1
8434; AVX2-FCP-NEXT:    vpblendvb %ymm13, %ymm0, %ymm1, %ymm0
8435; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,1,2,3,0,128,14,128,0,1,0,1,128,15,128,15,128,17,18,19,16,128,30,128,16,17,16,17,128,31,128,31]
8436; AVX2-FCP-NEXT:    vpshufb %ymm1, %ymm2, %ymm3
8437; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128]
8438; AVX2-FCP-NEXT:    vpshufb %ymm5, %ymm8, %ymm7
8439; AVX2-FCP-NEXT:    vpor %ymm3, %ymm7, %ymm3
8440; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31]
8441; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm6, %ymm11
8442; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u]
8443; AVX2-FCP-NEXT:    vpblendvb %ymm12, %ymm3, %ymm11, %ymm3
8444; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
8445; AVX2-FCP-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
8446; AVX2-FCP-NEXT:    vpshufb %ymm5, %ymm4, %ymm4
8447; AVX2-FCP-NEXT:    vpor %ymm1, %ymm4, %ymm1
8448; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
8449; AVX2-FCP-NEXT:    vpshufb %ymm7, %ymm2, %ymm2
8450; AVX2-FCP-NEXT:    vpblendvb %ymm12, %ymm1, %ymm2, %ymm1
8451; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255]
8452; AVX2-FCP-NEXT:    vpblendvb %ymm2, %ymm10, %ymm3, %ymm3
8453; AVX2-FCP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
8454; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
8455; AVX2-FCP-NEXT:    vmovdqa %ymm0, 96(%rax)
8456; AVX2-FCP-NEXT:    vmovdqa %ymm3, 320(%rax)
8457; AVX2-FCP-NEXT:    vmovdqa %ymm9, 128(%rax)
8458; AVX2-FCP-NEXT:    vmovdqa %ymm14, 352(%rax)
8459; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8460; AVX2-FCP-NEXT:    vmovaps %ymm0, 160(%rax)
8461; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8462; AVX2-FCP-NEXT:    vmovaps %ymm0, 192(%rax)
8463; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8464; AVX2-FCP-NEXT:    vmovaps %ymm0, 64(%rax)
8465; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8466; AVX2-FCP-NEXT:    vmovaps %ymm0, 32(%rax)
8467; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8468; AVX2-FCP-NEXT:    vmovaps %ymm0, (%rax)
8469; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8470; AVX2-FCP-NEXT:    vmovaps %ymm0, 224(%rax)
8471; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8472; AVX2-FCP-NEXT:    vmovaps %ymm0, 288(%rax)
8473; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8474; AVX2-FCP-NEXT:    vmovaps %ymm0, 256(%rax)
8475; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8476; AVX2-FCP-NEXT:    vmovaps %ymm0, 416(%rax)
8477; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8478; AVX2-FCP-NEXT:    vmovaps %ymm0, 384(%rax)
8479; AVX2-FCP-NEXT:    addq $616, %rsp # imm = 0x268
8480; AVX2-FCP-NEXT:    vzeroupper
8481; AVX2-FCP-NEXT:    retq
8482;
8483; AVX512-LABEL: store_i8_stride7_vf64:
8484; AVX512:       # %bb.0:
8485; AVX512-NEXT:    subq $1384, %rsp # imm = 0x568
8486; AVX512-NEXT:    vmovdqa (%rsi), %ymm7
8487; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm7[14],zero,zero,zero,zero,zero,zero,ymm7[15],zero,zero,zero,zero,zero,zero,ymm7[16],zero,zero,zero,zero,zero,zero,ymm7[17],zero,zero,zero,zero,zero,zero,ymm7[18]
8488; AVX512-NEXT:    vmovdqa (%rdi), %ymm2
8489; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[0,1,14],zero,ymm2[12,13,0,1,14,15],zero,ymm2[3,12,13,2,3,16],zero,ymm2[30,31,28,29,16,17],zero,ymm2[31,18,19,28,29,18],zero
8490; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm20
8491; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
8492; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8493; AVX512-NEXT:    vmovdqa (%rcx), %ymm15
8494; AVX512-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128]
8495; AVX512-NEXT:    vpshufb %ymm2, %ymm15, %ymm0
8496; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm23
8497; AVX512-NEXT:    vmovdqa (%rdx), %ymm2
8498; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
8499; AVX512-NEXT:    vpshufb %ymm3, %ymm2, %ymm1
8500; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm26
8501; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm18
8502; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
8503; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8504; AVX512-NEXT:    vmovdqa (%r8), %ymm14
8505; AVX512-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128]
8506; AVX512-NEXT:    vpshufb %ymm2, %ymm14, %ymm0
8507; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm27
8508; AVX512-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8509; AVX512-NEXT:    vmovdqa (%r9), %ymm8
8510; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0,13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0]
8511; AVX512-NEXT:    # ymm3 = mem[0,1,0,1]
8512; AVX512-NEXT:    vpshufb %ymm3, %ymm8, %ymm1
8513; AVX512-NEXT:    vmovdqa64 %ymm3, %ymm17
8514; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
8515; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8516; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm10
8517; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128]
8518; AVX512-NEXT:    # ymm6 = mem[0,1,0,1]
8519; AVX512-NEXT:    vpshufb %ymm6, %ymm10, %ymm0
8520; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm9
8521; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm9[23],zero,zero,zero,zero,ymm9[26],zero,ymm9[24],zero,zero,zero,zero,ymm9[27],zero,ymm9[25]
8522; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
8523; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8524; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [12,13,14,128,12,128,14,15,14,15,128,13,128,15,12,13,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29]
8525; AVX512-NEXT:    vpshufb %ymm1, %ymm9, %ymm0
8526; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm19
8527; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm10[23,u,u,u],zero,ymm10[26],zero,ymm10[24,u,u,u],zero,ymm10[27],zero
8528; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
8529; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8530; AVX512-NEXT:    vmovdqa 32(%rdx), %ymm5
8531; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128]
8532; AVX512-NEXT:    # ymm1 = mem[0,1,0,1]
8533; AVX512-NEXT:    vpshufb %ymm1, %ymm5, %ymm2
8534; AVX512-NEXT:    vmovdqa 32(%rcx), %ymm4
8535; AVX512-NEXT:    vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm4[25],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero,zero,zero
8536; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
8537; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8538; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0]
8539; AVX512-NEXT:    # ymm0 = mem[0,1,0,1]
8540; AVX512-NEXT:    vpshufb %ymm0, %ymm4, %ymm2
8541; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm21
8542; AVX512-NEXT:    vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm5[23],zero,ymm5[21,22,23,26],zero,ymm5[24],zero,ymm5[28,29,26,27]
8543; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
8544; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8545; AVX512-NEXT:    vmovdqa 32(%r8), %ymm3
8546; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29]
8547; AVX512-NEXT:    # ymm0 = mem[0,1,0,1]
8548; AVX512-NEXT:    vpshufb %ymm0, %ymm3, %ymm11
8549; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm25
8550; AVX512-NEXT:    vmovdqa 32(%r9), %ymm2
8551; AVX512-NEXT:    vpshufb {{.*#+}} ymm12 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero
8552; AVX512-NEXT:    vinserti64x4 $1, %ymm11, %zmm12, %zmm11
8553; AVX512-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8554; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128]
8555; AVX512-NEXT:    # ymm11 = mem[0,1,0,1]
8556; AVX512-NEXT:    vpshufb %ymm11, %ymm2, %ymm12
8557; AVX512-NEXT:    vpshufb {{.*#+}} ymm13 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm3[23],zero,ymm3[23,24,25,26],zero,ymm3[24],zero,ymm3[30,31]
8558; AVX512-NEXT:    vinserti64x4 $1, %ymm12, %zmm13, %zmm12
8559; AVX512-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8560; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
8561; AVX512-NEXT:    vmovdqa 32(%rax), %ymm0
8562; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8563; AVX512-NEXT:    vpshufb {{.*#+}} ymm12 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
8564; AVX512-NEXT:    vpshufb {{.*#+}} ymm13 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
8565; AVX512-NEXT:    vinserti64x4 $1, %ymm12, %zmm13, %zmm12
8566; AVX512-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8567; AVX512-NEXT:    vpshufb %ymm6, %ymm7, %ymm6
8568; AVX512-NEXT:    vmovdqa64 %ymm7, %ymm22
8569; AVX512-NEXT:    vmovdqa64 %ymm20, %ymm7
8570; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm0
8571; AVX512-NEXT:    vpshufb %ymm0, %ymm7, %ymm7
8572; AVX512-NEXT:    vpor %ymm6, %ymm7, %ymm0
8573; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8574; AVX512-NEXT:    vmovdqa64 %ymm18, %ymm12
8575; AVX512-NEXT:    vpshufb %ymm1, %ymm12, %ymm6
8576; AVX512-NEXT:    vmovdqa64 %ymm21, %ymm0
8577; AVX512-NEXT:    vpshufb %ymm0, %ymm15, %ymm7
8578; AVX512-NEXT:    vmovdqa64 %ymm15, %ymm19
8579; AVX512-NEXT:    vpor %ymm6, %ymm7, %ymm0
8580; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8581; AVX512-NEXT:    vmovdqa64 %ymm25, %ymm0
8582; AVX512-NEXT:    vpshufb %ymm0, %ymm14, %ymm6
8583; AVX512-NEXT:    vpshufb %ymm11, %ymm8, %ymm7
8584; AVX512-NEXT:    vmovdqa64 %ymm8, %ymm16
8585; AVX512-NEXT:    vpor %ymm6, %ymm7, %ymm0
8586; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8587; AVX512-NEXT:    vmovdqa 32(%rdx), %xmm0
8588; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8589; AVX512-NEXT:    vmovdqa 32(%rcx), %xmm1
8590; AVX512-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8591; AVX512-NEXT:    vmovdqa {{.*#+}} xmm11 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
8592; AVX512-NEXT:    vpshufb %xmm11, %xmm1, %xmm6
8593; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
8594; AVX512-NEXT:    vpshufb %xmm1, %xmm0, %xmm7
8595; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm21
8596; AVX512-NEXT:    vpor %xmm6, %xmm7, %xmm0
8597; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8598; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm0
8599; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm15
8600; AVX512-NEXT:    vmovdqa {{.*#+}} xmm6 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
8601; AVX512-NEXT:    vpshufb %xmm6, %xmm15, %xmm7
8602; AVX512-NEXT:    vmovdqa {{.*#+}} xmm13 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
8603; AVX512-NEXT:    vpshufb %xmm13, %xmm0, %xmm8
8604; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm30
8605; AVX512-NEXT:    vpor %xmm7, %xmm8, %xmm0
8606; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8607; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm28 = [0,0,0,0,2,3,0,1,0,18,0,19,18,0,19,0]
8608; AVX512-NEXT:    vmovdqa 32(%rax), %xmm0
8609; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8610; AVX512-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,5,5,6]
8611; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
8612; AVX512-NEXT:    vpermi2d %zmm7, %zmm8, %zmm28
8613; AVX512-NEXT:    vmovdqa 32(%r9), %xmm0
8614; AVX512-NEXT:    vmovdqa 32(%r8), %xmm14
8615; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
8616; AVX512-NEXT:    vpshufb %xmm1, %xmm0, %xmm7
8617; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm24
8618; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm18
8619; AVX512-NEXT:    vmovdqa {{.*#+}} xmm0 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128]
8620; AVX512-NEXT:    vpshufb %xmm0, %xmm14, %xmm8
8621; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm29
8622; AVX512-NEXT:    vporq %xmm7, %xmm8, %xmm31
8623; AVX512-NEXT:    vmovdqa64 %ymm23, %ymm0
8624; AVX512-NEXT:    vpshufb %ymm0, %ymm4, %ymm7
8625; AVX512-NEXT:    vmovdqa64 %ymm26, %ymm0
8626; AVX512-NEXT:    vpshufb %ymm0, %ymm5, %ymm8
8627; AVX512-NEXT:    vpor %ymm7, %ymm8, %ymm0
8628; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8629; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20]
8630; AVX512-NEXT:    # ymm1 = mem[0,1,0,1]
8631; AVX512-NEXT:    vpshufb %ymm1, %ymm4, %ymm0
8632; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm23
8633; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128]
8634; AVX512-NEXT:    # ymm7 = mem[0,1,0,1]
8635; AVX512-NEXT:    vpshufb %ymm7, %ymm5, %ymm1
8636; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
8637; AVX512-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8638; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm10[14],zero,zero,zero,zero,zero,zero,ymm10[15],zero,zero,zero,zero,zero,zero,ymm10[16],zero,zero,zero,zero,zero,zero,ymm10[17],zero,zero,zero,zero,zero,zero,ymm10[18]
8639; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm9[0,1,14],zero,ymm9[12,13,0,1,14,15],zero,ymm9[3,12,13,2,3,16],zero,ymm9[30,31,28,29,16,17],zero,ymm9[31,18,19,28,29,18],zero
8640; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
8641; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8642; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128]
8643; AVX512-NEXT:    # ymm0 = mem[0,1,0,1]
8644; AVX512-NEXT:    vpshufb %ymm0, %ymm10, %ymm1
8645; AVX512-NEXT:    vmovdqa {{.*#+}} ymm8 = [2,3,4,5,128,3,128,5,4,5,6,128,4,128,6,7,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23]
8646; AVX512-NEXT:    vpshufb %ymm8, %ymm9, %ymm4
8647; AVX512-NEXT:    vpor %ymm1, %ymm4, %ymm1
8648; AVX512-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8649; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
8650; AVX512-NEXT:    # ymm1 = mem[0,1,0,1]
8651; AVX512-NEXT:    vpshufb %ymm1, %ymm2, %ymm4
8652; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128]
8653; AVX512-NEXT:    # ymm9 = mem[0,1,0,1]
8654; AVX512-NEXT:    vpshufb %ymm9, %ymm3, %ymm5
8655; AVX512-NEXT:    vpor %ymm4, %ymm5, %ymm4
8656; AVX512-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8657; AVX512-NEXT:    vmovdqa64 %ymm27, %ymm4
8658; AVX512-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
8659; AVX512-NEXT:    vmovdqa64 %ymm17, %ymm4
8660; AVX512-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
8661; AVX512-NEXT:    vpor %ymm3, %ymm2, %ymm2
8662; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8663; AVX512-NEXT:    vmovdqa (%rsi), %xmm3
8664; AVX512-NEXT:    vpshufb %xmm6, %xmm3, %xmm2
8665; AVX512-NEXT:    vmovdqa64 %xmm3, %xmm25
8666; AVX512-NEXT:    vmovdqa (%rdi), %xmm4
8667; AVX512-NEXT:    vpshufb %xmm13, %xmm4, %xmm3
8668; AVX512-NEXT:    vmovdqa64 %xmm4, %xmm27
8669; AVX512-NEXT:    vpor %xmm2, %xmm3, %xmm2
8670; AVX512-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8671; AVX512-NEXT:    vmovdqa (%rcx), %xmm3
8672; AVX512-NEXT:    vpshufb %xmm11, %xmm3, %xmm2
8673; AVX512-NEXT:    vmovdqa64 %xmm3, %xmm17
8674; AVX512-NEXT:    vmovdqa (%rdx), %xmm13
8675; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm3
8676; AVX512-NEXT:    vpshufb %xmm3, %xmm13, %xmm3
8677; AVX512-NEXT:    vpor %xmm2, %xmm3, %xmm2
8678; AVX512-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8679; AVX512-NEXT:    vmovdqa (%r9), %xmm11
8680; AVX512-NEXT:    vmovdqa64 %xmm24, %xmm2
8681; AVX512-NEXT:    vpshufb %xmm2, %xmm11, %xmm2
8682; AVX512-NEXT:    vmovdqa (%r8), %xmm10
8683; AVX512-NEXT:    vmovdqa64 %xmm29, %xmm3
8684; AVX512-NEXT:    vpshufb %xmm3, %xmm10, %xmm3
8685; AVX512-NEXT:    vpor %xmm2, %xmm3, %xmm2
8686; AVX512-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8687; AVX512-NEXT:    vmovdqa64 %ymm20, %ymm4
8688; AVX512-NEXT:    vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero,zero,zero,ymm4[27],zero,ymm4[25]
8689; AVX512-NEXT:    vmovdqa64 %ymm22, %ymm3
8690; AVX512-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
8691; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
8692; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8693; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[23,u,u,u],zero,ymm3[26],zero,ymm3[24,u,u,u],zero,ymm3[27],zero
8694; AVX512-NEXT:    vpshufb %ymm8, %ymm4, %ymm2
8695; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm29
8696; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm2
8697; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero
8698; AVX512-NEXT:    vmovdqa64 %ymm23, %ymm3
8699; AVX512-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
8700; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm26
8701; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm12[23],zero,ymm12[21,22,23,26],zero,ymm12[24],zero,ymm12[28,29,26,27]
8702; AVX512-NEXT:    vpshufb %ymm7, %ymm12, %ymm2
8703; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm23
8704; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm2
8705; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero
8706; AVX512-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
8707; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
8708; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8709; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8710; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm1[23],zero,ymm1[23,24,25,26],zero,ymm1[24],zero,ymm1[30,31]
8711; AVX512-NEXT:    vpshufb %ymm9, %ymm1, %ymm1
8712; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
8713; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8714; AVX512-NEXT:    vmovdqa (%rax), %ymm8
8715; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
8716; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm16 = [0,5,4,0,5,0,4,0,20,21,0,23,0,21,0,23]
8717; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm8[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
8718; AVX512-NEXT:    vpermi2d %zmm1, %zmm2, %zmm16
8719; AVX512-NEXT:    vmovdqa (%rax), %xmm5
8720; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,5,5,6]
8721; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
8722; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
8723; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
8724; AVX512-NEXT:    vmovdqa {{.*#+}} ymm9 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128]
8725; AVX512-NEXT:    vpshufb %ymm9, %ymm8, %ymm3
8726; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm19
8727; AVX512-NEXT:    vmovdqa64 %xmm30, %xmm4
8728; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm15[0],xmm4[1],xmm15[1],xmm4[2],xmm15[2],xmm4[3],xmm15[3],xmm4[4],xmm15[4],xmm4[5],xmm15[5],xmm4[6],xmm15[6],xmm4[7],xmm15[7]
8729; AVX512-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
8730; AVX512-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
8731; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm24
8732; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8733; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm6 = zmm0[2,3,2,3],zmm3[0,1,0,1]
8734; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8735; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
8736; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
8737; AVX512-NEXT:    vmovdqa {{.*#+}} xmm0 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
8738; AVX512-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
8739; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm21
8740; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8741; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm22 = zmm0[2,3,2,3],zmm3[0,1,0,1]
8742; AVX512-NEXT:    vmovdqa64 %xmm18, %xmm2
8743; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3],xmm14[4],xmm2[4],xmm14[5],xmm2[5],xmm14[6],xmm2[6],xmm14[7],xmm2[7]
8744; AVX512-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
8745; AVX512-NEXT:    vpshufb %xmm0, %xmm12, %xmm12
8746; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm20
8747; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8748; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm30 = zmm0[2,3,2,3],zmm12[0,1,0,1]
8749; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
8750; AVX512-NEXT:    vmovdqa64 %ymm0, %ymm18
8751; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15]
8752; AVX512-NEXT:    vmovdqa64 %xmm17, %xmm3
8753; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm3[8],xmm13[8],xmm3[9],xmm13[9],xmm3[10],xmm13[10],xmm3[11],xmm13[11],xmm3[12],xmm13[12],xmm3[13],xmm13[13],xmm3[14],xmm13[14],xmm3[15],xmm13[15]
8754; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
8755; AVX512-NEXT:    vpshufb %xmm1, %xmm8, %xmm12
8756; AVX512-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
8757; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8758; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm8
8759; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm4[8],xmm15[9],xmm4[9],xmm15[10],xmm4[10],xmm15[11],xmm4[11],xmm15[12],xmm4[12],xmm15[13],xmm4[13],xmm15[14],xmm4[14],xmm15[15],xmm4[15]
8760; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm4
8761; AVX512-NEXT:    vmovdqa64 %xmm27, %xmm7
8762; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm15 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15]
8763; AVX512-NEXT:    vmovdqa {{.*#+}} xmm0 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
8764; AVX512-NEXT:    vpshufb %xmm0, %xmm15, %xmm15
8765; AVX512-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
8766; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8767; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm17
8768; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm14[8],xmm2[8],xmm14[9],xmm2[9],xmm14[10],xmm2[10],xmm14[11],xmm2[11],xmm14[12],xmm2[12],xmm14[13],xmm2[13],xmm14[14],xmm2[14],xmm14[15],xmm2[15]
8769; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm14 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15]
8770; AVX512-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
8771; AVX512-NEXT:    vpshufb %xmm0, %xmm14, %xmm14
8772; AVX512-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
8773; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm31, %zmm31
8774; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8775; AVX512-NEXT:    vpshufb %ymm9, %ymm0, %ymm1
8776; AVX512-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
8777; AVX512-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,5,7]
8778; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,3,2]
8779; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9
8780; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm1, %zmm9
8781; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
8782; AVX512-NEXT:    vpternlogq {{.*#+}} zmm22 = zmm6 ^ (zmm1 & (zmm22 ^ zmm6))
8783; AVX512-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
8784; AVX512-NEXT:    # ymm6 = mem[2,3,2,3]
8785; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8786; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm6
8787; AVX512-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload
8788; AVX512-NEXT:    # ymm27 = mem[2,3,2,3]
8789; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8790; AVX512-NEXT:    vinserti64x4 $1, %ymm27, %zmm0, %zmm27
8791; AVX512-NEXT:    vpternlogq {{.*#+}} zmm27 = zmm6 ^ (zmm1 & (zmm27 ^ zmm6))
8792; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
8793; AVX512-NEXT:    vmovdqa64 %xmm24, %xmm0
8794; AVX512-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
8795; AVX512-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload
8796; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3],xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7]
8797; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm0
8798; AVX512-NEXT:    vpshufb %xmm0, %xmm6, %xmm6
8799; AVX512-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 16-byte Folded Reload
8800; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7]
8801; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm0
8802; AVX512-NEXT:    vpshufb %xmm0, %xmm10, %xmm10
8803; AVX512-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 16-byte Folded Reload
8804; AVX512-NEXT:    vpshufb {{.*#+}} xmm11 = xmm5[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
8805; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[1,1,0,0,4,5,6,7]
8806; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,0]
8807; AVX512-NEXT:    vinserti64x4 $1, %ymm11, %zmm5, %zmm5
8808; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
8809; AVX512-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload
8810; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
8811; AVX512-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 64-byte Folded Reload
8812; AVX512-NEXT:    vpermq {{.*#+}} zmm11 = zmm11[2,3,2,3,6,7,6,7]
8813; AVX512-NEXT:    vpermq {{.*#+}} zmm13 = zmm13[2,3,2,3,6,7,6,7]
8814; AVX512-NEXT:    vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm11))
8815; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
8816; AVX512-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload
8817; AVX512-NEXT:    vpermq {{.*#+}} zmm11 = zmm11[2,3,2,3,6,7,6,7]
8818; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm21 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255]
8819; AVX512-NEXT:    vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm21 & (zmm11 ^ zmm13))
8820; AVX512-NEXT:    vpermq {{.*#+}} zmm8 = zmm8[0,1,0,1,4,5,4,5]
8821; AVX512-NEXT:    vpermq {{.*#+}} zmm7 = zmm17[0,1,0,1,4,5,4,5]
8822; AVX512-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm8 ^ (zmm21 & (zmm7 ^ zmm8))
8823; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm15[0,1,0,1]
8824; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1]
8825; AVX512-NEXT:    vpermq {{.*#+}} ymm13 = ymm14[0,1,0,1]
8826; AVX512-NEXT:    vpermq {{.*#+}} ymm14 = ymm18[2,3,2,3]
8827; AVX512-NEXT:    vpshuflw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
8828; AVX512-NEXT:    # xmm15 = mem[1,1,0,0,4,5,6,7]
8829; AVX512-NEXT:    vpshufd {{.*#+}} xmm15 = xmm15[0,1,2,0]
8830; AVX512-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm20 # 64-byte Folded Reload
8831; AVX512-NEXT:    vporq %zmm26, %zmm23, %zmm17
8832; AVX512-NEXT:    vpermq {{.*#+}} zmm18 = zmm20[2,3,2,3,6,7,6,7]
8833; AVX512-NEXT:    vpermq {{.*#+}} zmm17 = zmm17[2,3,2,3,6,7,6,7]
8834; AVX512-NEXT:    vpternlogq {{.*#+}} zmm17 = zmm18 ^ (zmm21 & (zmm17 ^ zmm18))
8835; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 32-byte Folded Reload
8836; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 32-byte Folded Reload
8837; AVX512-NEXT:    vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm8))
8838; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm8 # 32-byte Folded Reload
8839; AVX512-NEXT:    vpternlogq {{.*#+}} zmm19 = zmm19 | (zmm8 & mem)
8840; AVX512-NEXT:    vpternlogq {{.*#+}} zmm19 = zmm19 ^ (mem & (zmm19 ^ zmm12))
8841; AVX512-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload
8842; AVX512-NEXT:    # zmm8 = mem[2,3,2,3,6,7,6,7]
8843; AVX512-NEXT:    vpternlogd {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm11))
8844; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm15[0,0,1,0]
8845; AVX512-NEXT:    vinserti64x4 $1, %ymm11, %zmm14, %zmm11
8846; AVX512-NEXT:    vpternlogd {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm30))
8847; AVX512-NEXT:    vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm22))
8848; AVX512-NEXT:    vpermq {{.*#+}} zmm0 = zmm31[0,1,0,1,4,5,4,5]
8849; AVX512-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm28 ^ (mem & (zmm0 ^ zmm28))
8850; AVX512-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm7))
8851; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
8852; AVX512-NEXT:    vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload
8853; AVX512-NEXT:    # zmm3 = zmm3[0,1,2,3],mem[2,3,2,3]
8854; AVX512-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 | (zmm3 & mem)
8855; AVX512-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm27))
8856; AVX512-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[0,1,0,1,4,5,4,5]
8857; AVX512-NEXT:    vpermq {{.*#+}} zmm3 = zmm6[0,1,0,1,4,5,4,5]
8858; AVX512-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm1))
8859; AVX512-NEXT:    vpermq {{.*#+}} zmm1 = zmm10[0,1,0,1,4,5,4,5]
8860; AVX512-NEXT:    vpermq {{.*#+}} zmm5 = zmm5[0,0,1,0,4,4,5,4]
8861; AVX512-NEXT:    vpternlogd {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm1))
8862; AVX512-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm3))
8863; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8864; AVX512-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload
8865; AVX512-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7]
8866; AVX512-NEXT:    vpternlogd {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm1))
8867; AVX512-NEXT:    vpternlogq {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm17))
8868; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
8869; AVX512-NEXT:    vmovdqa64 %zmm16, 128(%rax)
8870; AVX512-NEXT:    vmovdqa64 %zmm5, (%rax)
8871; AVX512-NEXT:    vmovdqa64 %zmm9, 320(%rax)
8872; AVX512-NEXT:    vmovdqa64 %zmm0, 256(%rax)
8873; AVX512-NEXT:    vmovdqa64 %zmm11, 192(%rax)
8874; AVX512-NEXT:    vmovdqa64 %zmm8, 384(%rax)
8875; AVX512-NEXT:    vmovdqa64 %zmm19, 64(%rax)
8876; AVX512-NEXT:    addq $1384, %rsp # imm = 0x568
8877; AVX512-NEXT:    vzeroupper
8878; AVX512-NEXT:    retq
8879;
8880; AVX512-FCP-LABEL: store_i8_stride7_vf64:
8881; AVX512-FCP:       # %bb.0:
8882; AVX512-FCP-NEXT:    subq $1432, %rsp # imm = 0x598
8883; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm3
8884; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero,zero
8885; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
8886; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8887; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25]
8888; AVX512-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
8889; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm1
8890; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm25
8891; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
8892; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8893; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm2[28],zero,ymm2[30,31,30,31],zero,ymm2[29],zero,ymm2[31,28,29]
8894; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128,25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128]
8895; AVX512-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
8896; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm3, %ymm1
8897; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm16
8898; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm27
8899; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
8900; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8901; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm2
8902; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero
8903; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %ymm15
8904; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128]
8905; AVX512-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
8906; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm15, %ymm1
8907; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm29
8908; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
8909; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8910; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u],zero,ymm15[30],zero,ymm15[28,u,u,u],zero,ymm15[31],zero,ymm15[29,u]
8911; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27]
8912; AVX512-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
8913; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm2, %ymm1
8914; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm31
8915; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm26
8916; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
8917; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8918; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %ymm2
8919; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29]
8920; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %ymm3
8921; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128]
8922; AVX512-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
8923; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm3, %ymm1
8924; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm21
8925; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
8926; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8927; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm3[27,u,u,u],zero,ymm3[30],zero,ymm3[28,u,u,u],zero,ymm3[31],zero
8928; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm23
8929; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31]
8930; AVX512-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
8931; AVX512-FCP-NEXT:    vpshufb %ymm3, %ymm2, %ymm1
8932; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm20
8933; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm19
8934; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
8935; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8936; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
8937; AVX512-FCP-NEXT:    vmovdqa 32(%rax), %ymm1
8938; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8939; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
8940; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
8941; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
8942; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8943; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm6
8944; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero,zero,zero,ymm6[18]
8945; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm5
8946; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm5[0,1,14],zero,ymm5[12,13,0,1,14,15],zero,ymm5[3,12,13,2,3,16],zero,ymm5[30,31,28,29,16,17],zero,ymm5[31,18,19,28,29,18],zero
8947; AVX512-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
8948; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8949; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm3
8950; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero
8951; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm2
8952; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[0,1,0,1,14],zero,ymm2[14,15,0,1,14,15],zero,ymm2[13,14,15,16,17,16],zero,ymm2[30,31,30,31,16,17],zero,ymm2[31,28,29,30,31]
8953; AVX512-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
8954; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8955; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm1
8956; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero
8957; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm14
8958; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm14[13,u,u,u,u,u],zero,ymm14[14,u,u,u,u,u],zero,ymm14[15,u,u,u,u,u],zero,ymm14[16,u,u,u,u,u],zero,ymm14[17,u,u,u]
8959; AVX512-FCP-NEXT:    vpor %ymm4, %ymm7, %ymm4
8960; AVX512-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8961; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %xmm9
8962; AVX512-FCP-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8963; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %xmm0
8964; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8965; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
8966; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm0, %xmm7
8967; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
8968; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm9, %xmm9
8969; AVX512-FCP-NEXT:    vpor %xmm7, %xmm9, %xmm0
8970; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8971; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm0
8972; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm9
8973; AVX512-FCP-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8974; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
8975; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm9, %xmm9
8976; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
8977; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm0, %xmm11
8978; AVX512-FCP-NEXT:    vmovdqa64 %xmm0, %xmm28
8979; AVX512-FCP-NEXT:    vpor %xmm9, %xmm11, %xmm0
8980; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8981; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %xmm9
8982; AVX512-FCP-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8983; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %xmm0
8984; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8985; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
8986; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm9, %xmm9
8987; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128]
8988; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm0, %xmm12
8989; AVX512-FCP-NEXT:    vpor %xmm9, %xmm12, %xmm0
8990; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8991; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm0
8992; AVX512-FCP-NEXT:    vpshufb %xmm7, %xmm0, %xmm7
8993; AVX512-FCP-NEXT:    vmovdqa64 %xmm0, %xmm24
8994; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm0
8995; AVX512-FCP-NEXT:    vpshufb %xmm10, %xmm0, %xmm10
8996; AVX512-FCP-NEXT:    vmovdqa64 %xmm0, %xmm17
8997; AVX512-FCP-NEXT:    vpor %xmm7, %xmm10, %xmm0
8998; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8999; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm10
9000; AVX512-FCP-NEXT:    vpshufb %xmm4, %xmm10, %xmm4
9001; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm9
9002; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm9, %xmm8
9003; AVX512-FCP-NEXT:    vpor %xmm4, %xmm8, %xmm0
9004; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9005; AVX512-FCP-NEXT:    vmovdqa (%r9), %xmm8
9006; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm8, %xmm11
9007; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm7
9008; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm7, %xmm13
9009; AVX512-FCP-NEXT:    vporq %xmm11, %xmm13, %xmm30
9010; AVX512-FCP-NEXT:    vmovdqa64 %ymm25, %ymm0
9011; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm5, %ymm11
9012; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128]
9013; AVX512-FCP-NEXT:    # ymm12 = mem[0,1,0,1]
9014; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm6, %ymm13
9015; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm0
9016; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9017; AVX512-FCP-NEXT:    vmovdqa64 %ymm16, %ymm0
9018; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm6, %ymm11
9019; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23]
9020; AVX512-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
9021; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm5, %ymm13
9022; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm22
9023; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm0
9024; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9025; AVX512-FCP-NEXT:    vmovdqa64 %ymm29, %ymm0
9026; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm11
9027; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20]
9028; AVX512-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
9029; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm13
9030; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm29
9031; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm0
9032; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9033; AVX512-FCP-NEXT:    vmovdqa64 %ymm31, %ymm0
9034; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm11
9035; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128]
9036; AVX512-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
9037; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm13
9038; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm25
9039; AVX512-FCP-NEXT:    vmovdqa64 %ymm21, %ymm0
9040; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm14, %ymm11
9041; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
9042; AVX512-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
9043; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm14, %ymm13
9044; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm18
9045; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm0
9046; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9047; AVX512-FCP-NEXT:    vmovdqa64 %ymm20, %ymm0
9048; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm1, %ymm11
9049; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128]
9050; AVX512-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
9051; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm1, %ymm13
9052; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm16
9053; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm0
9054; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9055; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29],zero,zero,zero
9056; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm5[28],zero,ymm5[30,31,30,31],zero,ymm5[29],zero,ymm5[31,28,29]
9057; AVX512-FCP-NEXT:    vporq %ymm6, %ymm5, %ymm20
9058; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero
9059; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u],zero,ymm3[30],zero,ymm3[28,u,u,u],zero,ymm3[31],zero,ymm3[29,u]
9060; AVX512-FCP-NEXT:    vpor %ymm2, %ymm3, %ymm0
9061; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9062; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29]
9063; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm14[27,u,u,u],zero,ymm14[30],zero,ymm14[28,u,u,u],zero,ymm14[31],zero
9064; AVX512-FCP-NEXT:    vporq %ymm1, %ymm0, %ymm21
9065; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,ymm15[14],zero,zero,zero,zero,zero,zero,ymm15[15],zero,zero,zero,zero,zero,zero,ymm15[16],zero,zero,zero,zero,zero,zero,ymm15[17],zero,zero,zero,zero,zero
9066; AVX512-FCP-NEXT:    vmovdqa64 %ymm26, %ymm2
9067; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[0,1,0,1,14],zero,ymm2[14,15,0,1,14,15],zero,ymm2[13,14,15,16,17,16],zero,ymm2[30,31,30,31,16,17],zero,ymm2[31,28,29,30,31]
9068; AVX512-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
9069; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9070; AVX512-FCP-NEXT:    vmovdqa64 %ymm29, %ymm0
9071; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm15, %ymm0
9072; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm1
9073; AVX512-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
9074; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9075; AVX512-FCP-NEXT:    vmovdqa64 %ymm27, %ymm3
9076; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero,zero,ymm3[18]
9077; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
9078; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[0,1,14],zero,ymm2[12,13,0,1,14,15],zero,ymm2[3,12,13,2,3,16],zero,ymm2[30,31,28,29,16,17],zero,ymm2[31,18,19,28,29,18],zero
9079; AVX512-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
9080; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9081; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm3, %ymm0
9082; AVX512-FCP-NEXT:    vmovdqa64 %ymm22, %ymm1
9083; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
9084; AVX512-FCP-NEXT:    vporq %ymm0, %ymm1, %ymm29
9085; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm2
9086; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, %ymm0
9087; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
9088; AVX512-FCP-NEXT:    vmovdqa64 %ymm19, %ymm3
9089; AVX512-FCP-NEXT:    vmovdqa64 %ymm16, %ymm1
9090; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
9091; AVX512-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
9092; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9093; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero
9094; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[13,u,u,u,u,u],zero,ymm2[14,u,u,u,u,u],zero,ymm2[15,u,u,u,u,u],zero,ymm2[16,u,u,u,u,u],zero,ymm2[17,u,u,u]
9095; AVX512-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
9096; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9097; AVX512-FCP-NEXT:    vmovdqa (%rax), %xmm1
9098; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [2,2,3,3,2,2,3,3]
9099; AVX512-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
9100; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,5,6]
9101; AVX512-FCP-NEXT:    vmovdqa64 %xmm1, %xmm23
9102; AVX512-FCP-NEXT:    vpermd %ymm0, %ymm3, %ymm0
9103; AVX512-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
9104; AVX512-FCP-NEXT:    vmovdqa (%rax), %ymm15
9105; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,ymm15[13],zero,zero,zero,zero,zero,zero,ymm15[14],zero,zero,zero,zero,zero,zero,ymm15[15],zero,zero,zero,zero,zero,zero,ymm15[16],zero,zero,zero,zero,zero,zero,ymm15[17],zero,zero
9106; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm18
9107; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
9108; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
9109; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
9110; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm27
9111; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
9112; AVX512-FCP-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm16 # 16-byte Reload
9113; AVX512-FCP-NEXT:    vmovdqa64 %xmm16, %xmm1
9114; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm1[8],xmm12[9],xmm1[9],xmm12[10],xmm1[10],xmm12[11],xmm1[11],xmm12[12],xmm1[12],xmm12[13],xmm1[13],xmm12[14],xmm1[14],xmm12[15],xmm1[15]
9115; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
9116; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9117; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm19
9118; AVX512-FCP-NEXT:    vmovdqa64 %xmm24, %xmm4
9119; AVX512-FCP-NEXT:    vmovdqa64 %xmm17, %xmm1
9120; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
9121; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
9122; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
9123; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm24
9124; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
9125; AVX512-FCP-NEXT:    vmovdqa64 %xmm28, %xmm5
9126; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm5[8],xmm13[9],xmm5[9],xmm13[10],xmm5[10],xmm13[11],xmm5[11],xmm13[12],xmm5[12],xmm13[13],xmm5[13],xmm13[14],xmm5[14],xmm13[15],xmm5[15]
9127; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
9128; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
9129; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm2, %zmm17
9130; AVX512-FCP-NEXT:    vmovdqa 32(%rax), %xmm6
9131; AVX512-FCP-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm6[0,1,2,3,4,5,5,6]
9132; AVX512-FCP-NEXT:    vpermd %ymm14, %ymm3, %ymm31
9133; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15]
9134; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm14 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
9135; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm3, %xmm0
9136; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm22
9137; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9138; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9139; AVX512-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm11 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
9140; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm11, %xmm11
9141; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
9142; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm11, %zmm3, %zmm28
9143; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
9144; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm14 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
9145; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm11, %xmm11
9146; AVX512-FCP-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm26 # 16-byte Folded Reload
9147; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
9148; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
9149; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm9, %xmm9
9150; AVX512-FCP-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload
9151; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
9152; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
9153; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm7, %xmm7
9154; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm30, %zmm7, %zmm7
9155; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3],xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7]
9156; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm8, %xmm8
9157; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm14 = zmm20[2,3,2,3],zmm8[0,1,0,1]
9158; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
9159; AVX512-FCP-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload
9160; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
9161; AVX512-FCP-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm20 # 64-byte Folded Reload
9162; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm8 = zmm8[2,3,2,3,6,7,6,7]
9163; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm20 = zmm20[2,3,2,3,6,7,6,7]
9164; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm20 = zmm20 ^ (mem & (zmm20 ^ zmm8))
9165; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
9166; AVX512-FCP-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload
9167; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm30 = zmm8[2,3,2,3,6,7,6,7]
9168; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255]
9169; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm8 & (zmm30 ^ zmm20))
9170; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm5 = zmm19[0,1,0,1,4,5,4,5]
9171; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm20 = zmm17[0,1,0,1,4,5,4,5]
9172; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm20 = zmm5 ^ (zmm8 & (zmm20 ^ zmm5))
9173; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9174; AVX512-FCP-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload
9175; AVX512-FCP-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm5 # 64-byte Folded Reload
9176; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm25 = zmm1[2,3,2,3,6,7,6,7]
9177; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm1 = zmm5[2,3,2,3,6,7,6,7]
9178; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm25 ^ (zmm8 & (zmm1 ^ zmm25))
9179; AVX512-FCP-NEXT:    vmovdqa64 %xmm16, %xmm4
9180; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7]
9181; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm5, %xmm5
9182; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
9183; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm10 = zmm3[2,3,2,3],zmm5[0,1,0,1]
9184; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
9185; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm5, %xmm5
9186; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
9187; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm14 ^ (zmm8 & (zmm10 ^ zmm14))
9188; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm21 = zmm21[2,3,2,3],zmm5[0,1,0,1]
9189; AVX512-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
9190; AVX512-FCP-NEXT:    # ymm11 = mem[2,3,2,3]
9191; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9192; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm0, %zmm11
9193; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm14 = ymm29[2,3,2,3]
9194; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9195; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm0, %zmm14
9196; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm14 = zmm11 ^ (zmm8 & (zmm14 ^ zmm11))
9197; AVX512-FCP-NEXT:    vmovdqa64 %xmm23, %xmm0
9198; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm0[1,1,0,0,4,5,6,7]
9199; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [0,1,0,1,2,0,0,1]
9200; AVX512-FCP-NEXT:    vpermd %ymm8, %ymm11, %ymm16
9201; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
9202; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm12
9203; AVX512-FCP-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
9204; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm6[1,1,0,0,4,5,6,7]
9205; AVX512-FCP-NEXT:    vpermd %ymm0, %ymm11, %ymm2
9206; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
9207; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
9208; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm15[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
9209; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [4,5,4,5,5,7,4,5]
9210; AVX512-FCP-NEXT:    vpermd %ymm15, %ymm17, %ymm15
9211; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
9212; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = zero,ymm13[13],zero,zero,zero,zero,zero,zero,ymm13[14],zero,zero,zero,zero,zero,zero,ymm13[15],zero,zero,zero,zero,zero,zero,ymm13[16],zero,zero,zero,zero,zero,zero,ymm13[17],zero,zero
9213; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm13 = ymm13[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
9214; AVX512-FCP-NEXT:    vpermd %ymm13, %ymm17, %ymm13
9215; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm24[0,1,0,1]
9216; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm17 = ymm27[0,1,0,1]
9217; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm22[0,1,0,1]
9218; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,1,0]
9219; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm12[0,0,1,0]
9220; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3]
9221; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
9222; AVX512-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm13
9223; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm8, %zmm8
9224; AVX512-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Folded Reload
9225; AVX512-FCP-NEXT:    # zmm13 = mem[2,3,2,3,6,7,6,7]
9226; AVX512-FCP-NEXT:    vpternlogd {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm30))
9227; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload
9228; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 32-byte Folded Reload
9229; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm17 = zmm17 ^ (mem & (zmm17 ^ zmm6))
9230; AVX512-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload
9231; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm18 = zmm18 | (zmm3 & mem)
9232; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm18 = zmm18 ^ (mem & (zmm18 ^ zmm17))
9233; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm31, %zmm5, %zmm3
9234; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm5 = zmm28[0,1,0,1,4,5,4,5]
9235; AVX512-FCP-NEXT:    vpternlogd {{.*#+}} zmm5 = zmm3 ^ (mem & (zmm5 ^ zmm3))
9236; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm20))
9237; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm3 = zmm26[0,1,0,1,4,5,4,5]
9238; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm6 = zmm9[0,1,0,1,4,5,4,5]
9239; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm3))
9240; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm16, %zmm3
9241; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm4 = zmm7[0,1,0,1,4,5,4,5]
9242; AVX512-FCP-NEXT:    vpternlogd {{.*#+}} zmm4 = zmm3 ^ (mem & (zmm4 ^ zmm3))
9243; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm6))
9244; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
9245; AVX512-FCP-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload
9246; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm15, %zmm6
9247; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm3 = zmm3[2,3,2,3,6,7,6,7]
9248; AVX512-FCP-NEXT:    vpternlogd {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm3))
9249; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm1))
9250; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
9251; AVX512-FCP-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm21))
9252; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm10))
9253; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9254; AVX512-FCP-NEXT:    vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload
9255; AVX512-FCP-NEXT:    # zmm1 = zmm1[0,1,2,3],mem[2,3,2,3]
9256; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm1 & mem)
9257; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm14))
9258; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
9259; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, 320(%rax)
9260; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 192(%rax)
9261; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, 128(%rax)
9262; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, (%rax)
9263; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, 256(%rax)
9264; AVX512-FCP-NEXT:    vmovdqa64 %zmm18, 64(%rax)
9265; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, 384(%rax)
9266; AVX512-FCP-NEXT:    addq $1432, %rsp # imm = 0x598
9267; AVX512-FCP-NEXT:    vzeroupper
9268; AVX512-FCP-NEXT:    retq
9269;
9270; AVX512DQ-LABEL: store_i8_stride7_vf64:
9271; AVX512DQ:       # %bb.0:
9272; AVX512DQ-NEXT:    subq $1384, %rsp # imm = 0x568
9273; AVX512DQ-NEXT:    vmovdqa (%rsi), %ymm7
9274; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm7[14],zero,zero,zero,zero,zero,zero,ymm7[15],zero,zero,zero,zero,zero,zero,ymm7[16],zero,zero,zero,zero,zero,zero,ymm7[17],zero,zero,zero,zero,zero,zero,ymm7[18]
9275; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm2
9276; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[0,1,14],zero,ymm2[12,13,0,1,14,15],zero,ymm2[3,12,13,2,3,16],zero,ymm2[30,31,28,29,16,17],zero,ymm2[31,18,19,28,29,18],zero
9277; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm20
9278; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
9279; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9280; AVX512DQ-NEXT:    vmovdqa (%rcx), %ymm15
9281; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128]
9282; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm15, %ymm0
9283; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm23
9284; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm2
9285; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
9286; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm2, %ymm1
9287; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm26
9288; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm18
9289; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
9290; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9291; AVX512DQ-NEXT:    vmovdqa (%r8), %ymm14
9292; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128]
9293; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm14, %ymm0
9294; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm27
9295; AVX512DQ-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9296; AVX512DQ-NEXT:    vmovdqa (%r9), %ymm8
9297; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0,13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0]
9298; AVX512DQ-NEXT:    # ymm3 = mem[0,1,0,1]
9299; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm8, %ymm1
9300; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm17
9301; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
9302; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9303; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %ymm10
9304; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128]
9305; AVX512DQ-NEXT:    # ymm6 = mem[0,1,0,1]
9306; AVX512DQ-NEXT:    vpshufb %ymm6, %ymm10, %ymm0
9307; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm9
9308; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm9[23],zero,zero,zero,zero,ymm9[26],zero,ymm9[24],zero,zero,zero,zero,ymm9[27],zero,ymm9[25]
9309; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
9310; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9311; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [12,13,14,128,12,128,14,15,14,15,128,13,128,15,12,13,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29]
9312; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm9, %ymm0
9313; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm19
9314; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm10[23,u,u,u],zero,ymm10[26],zero,ymm10[24,u,u,u],zero,ymm10[27],zero
9315; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
9316; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9317; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %ymm5
9318; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128]
9319; AVX512DQ-NEXT:    # ymm1 = mem[0,1,0,1]
9320; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm5, %ymm2
9321; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %ymm4
9322; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm4[25],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero,zero,zero
9323; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
9324; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9325; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0]
9326; AVX512DQ-NEXT:    # ymm0 = mem[0,1,0,1]
9327; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm4, %ymm2
9328; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm21
9329; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm5[23],zero,ymm5[21,22,23,26],zero,ymm5[24],zero,ymm5[28,29,26,27]
9330; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
9331; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9332; AVX512DQ-NEXT:    vmovdqa 32(%r8), %ymm3
9333; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29]
9334; AVX512DQ-NEXT:    # ymm0 = mem[0,1,0,1]
9335; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm3, %ymm11
9336; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm25
9337; AVX512DQ-NEXT:    vmovdqa 32(%r9), %ymm2
9338; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm12 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero
9339; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm11, %zmm12, %zmm11
9340; AVX512DQ-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9341; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128]
9342; AVX512DQ-NEXT:    # ymm11 = mem[0,1,0,1]
9343; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm2, %ymm12
9344; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm13 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm3[23],zero,ymm3[23,24,25,26],zero,ymm3[24],zero,ymm3[30,31]
9345; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm12, %zmm13, %zmm12
9346; AVX512DQ-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9347; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
9348; AVX512DQ-NEXT:    vmovdqa 32(%rax), %ymm0
9349; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9350; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm12 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
9351; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm13 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
9352; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm12, %zmm13, %zmm12
9353; AVX512DQ-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9354; AVX512DQ-NEXT:    vpshufb %ymm6, %ymm7, %ymm6
9355; AVX512DQ-NEXT:    vmovdqa64 %ymm7, %ymm22
9356; AVX512DQ-NEXT:    vmovdqa64 %ymm20, %ymm7
9357; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm0
9358; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm7, %ymm7
9359; AVX512DQ-NEXT:    vpor %ymm6, %ymm7, %ymm0
9360; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9361; AVX512DQ-NEXT:    vmovdqa64 %ymm18, %ymm12
9362; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm12, %ymm6
9363; AVX512DQ-NEXT:    vmovdqa64 %ymm21, %ymm0
9364; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm15, %ymm7
9365; AVX512DQ-NEXT:    vmovdqa64 %ymm15, %ymm19
9366; AVX512DQ-NEXT:    vpor %ymm6, %ymm7, %ymm0
9367; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9368; AVX512DQ-NEXT:    vmovdqa64 %ymm25, %ymm0
9369; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm14, %ymm6
9370; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm8, %ymm7
9371; AVX512DQ-NEXT:    vmovdqa64 %ymm8, %ymm16
9372; AVX512DQ-NEXT:    vpor %ymm6, %ymm7, %ymm0
9373; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9374; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %xmm0
9375; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9376; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %xmm1
9377; AVX512DQ-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9378; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm11 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
9379; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm1, %xmm6
9380; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
9381; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm0, %xmm7
9382; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm21
9383; AVX512DQ-NEXT:    vpor %xmm6, %xmm7, %xmm0
9384; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9385; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm0
9386; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %xmm15
9387; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm6 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
9388; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm15, %xmm7
9389; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm13 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
9390; AVX512DQ-NEXT:    vpshufb %xmm13, %xmm0, %xmm8
9391; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm30
9392; AVX512DQ-NEXT:    vpor %xmm7, %xmm8, %xmm0
9393; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9394; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm28 = [0,0,0,0,2,3,0,1,0,18,0,19,18,0,19,0]
9395; AVX512DQ-NEXT:    vmovdqa 32(%rax), %xmm0
9396; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9397; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,5,5,6]
9398; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm8 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
9399; AVX512DQ-NEXT:    vpermi2d %zmm7, %zmm8, %zmm28
9400; AVX512DQ-NEXT:    vmovdqa 32(%r9), %xmm0
9401; AVX512DQ-NEXT:    vmovdqa 32(%r8), %xmm14
9402; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
9403; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm0, %xmm7
9404; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm24
9405; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm18
9406; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm0 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128]
9407; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm14, %xmm8
9408; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm29
9409; AVX512DQ-NEXT:    vporq %xmm7, %xmm8, %xmm31
9410; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm0
9411; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm4, %ymm7
9412; AVX512DQ-NEXT:    vmovdqa64 %ymm26, %ymm0
9413; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm5, %ymm8
9414; AVX512DQ-NEXT:    vpor %ymm7, %ymm8, %ymm0
9415; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9416; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20]
9417; AVX512DQ-NEXT:    # ymm1 = mem[0,1,0,1]
9418; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm4, %ymm0
9419; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm23
9420; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128]
9421; AVX512DQ-NEXT:    # ymm7 = mem[0,1,0,1]
9422; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm5, %ymm1
9423; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
9424; AVX512DQ-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9425; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm10[14],zero,zero,zero,zero,zero,zero,ymm10[15],zero,zero,zero,zero,zero,zero,ymm10[16],zero,zero,zero,zero,zero,zero,ymm10[17],zero,zero,zero,zero,zero,zero,ymm10[18]
9426; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm9[0,1,14],zero,ymm9[12,13,0,1,14,15],zero,ymm9[3,12,13,2,3,16],zero,ymm9[30,31,28,29,16,17],zero,ymm9[31,18,19,28,29,18],zero
9427; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
9428; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9429; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128]
9430; AVX512DQ-NEXT:    # ymm0 = mem[0,1,0,1]
9431; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm10, %ymm1
9432; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm8 = [2,3,4,5,128,3,128,5,4,5,6,128,4,128,6,7,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23]
9433; AVX512DQ-NEXT:    vpshufb %ymm8, %ymm9, %ymm4
9434; AVX512DQ-NEXT:    vpor %ymm1, %ymm4, %ymm1
9435; AVX512DQ-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9436; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
9437; AVX512DQ-NEXT:    # ymm1 = mem[0,1,0,1]
9438; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm2, %ymm4
9439; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128]
9440; AVX512DQ-NEXT:    # ymm9 = mem[0,1,0,1]
9441; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm3, %ymm5
9442; AVX512DQ-NEXT:    vpor %ymm4, %ymm5, %ymm4
9443; AVX512DQ-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9444; AVX512DQ-NEXT:    vmovdqa64 %ymm27, %ymm4
9445; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
9446; AVX512DQ-NEXT:    vmovdqa64 %ymm17, %ymm4
9447; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
9448; AVX512DQ-NEXT:    vpor %ymm3, %ymm2, %ymm2
9449; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9450; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm3
9451; AVX512DQ-NEXT:    vpshufb %xmm6, %xmm3, %xmm2
9452; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm25
9453; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm4
9454; AVX512DQ-NEXT:    vpshufb %xmm13, %xmm4, %xmm3
9455; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm27
9456; AVX512DQ-NEXT:    vpor %xmm2, %xmm3, %xmm2
9457; AVX512DQ-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9458; AVX512DQ-NEXT:    vmovdqa (%rcx), %xmm3
9459; AVX512DQ-NEXT:    vpshufb %xmm11, %xmm3, %xmm2
9460; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm17
9461; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm13
9462; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm3
9463; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm13, %xmm3
9464; AVX512DQ-NEXT:    vpor %xmm2, %xmm3, %xmm2
9465; AVX512DQ-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9466; AVX512DQ-NEXT:    vmovdqa (%r9), %xmm11
9467; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm2
9468; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm11, %xmm2
9469; AVX512DQ-NEXT:    vmovdqa (%r8), %xmm10
9470; AVX512DQ-NEXT:    vmovdqa64 %xmm29, %xmm3
9471; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm10, %xmm3
9472; AVX512DQ-NEXT:    vpor %xmm2, %xmm3, %xmm2
9473; AVX512DQ-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9474; AVX512DQ-NEXT:    vmovdqa64 %ymm20, %ymm4
9475; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero,zero,zero,ymm4[27],zero,ymm4[25]
9476; AVX512DQ-NEXT:    vmovdqa64 %ymm22, %ymm3
9477; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
9478; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
9479; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9480; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[23,u,u,u],zero,ymm3[26],zero,ymm3[24,u,u,u],zero,ymm3[27],zero
9481; AVX512DQ-NEXT:    vpshufb %ymm8, %ymm4, %ymm2
9482; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm29
9483; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm2
9484; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero
9485; AVX512DQ-NEXT:    vmovdqa64 %ymm23, %ymm3
9486; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
9487; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm26
9488; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm12[23],zero,ymm12[21,22,23,26],zero,ymm12[24],zero,ymm12[28,29,26,27]
9489; AVX512DQ-NEXT:    vpshufb %ymm7, %ymm12, %ymm2
9490; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm23
9491; AVX512DQ-NEXT:    vmovdqa64 %ymm16, %ymm2
9492; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero
9493; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
9494; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
9495; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9496; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
9497; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm1[23],zero,ymm1[23,24,25,26],zero,ymm1[24],zero,ymm1[30,31]
9498; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm1, %ymm1
9499; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
9500; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9501; AVX512DQ-NEXT:    vmovdqa (%rax), %ymm8
9502; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
9503; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm16 = [0,5,4,0,5,0,4,0,20,21,0,23,0,21,0,23]
9504; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm8[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
9505; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm2, %zmm16
9506; AVX512DQ-NEXT:    vmovdqa (%rax), %xmm5
9507; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,5,5,6]
9508; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
9509; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
9510; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
9511; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm9 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128]
9512; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm8, %ymm3
9513; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm19
9514; AVX512DQ-NEXT:    vmovdqa64 %xmm30, %xmm4
9515; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm15[0],xmm4[1],xmm15[1],xmm4[2],xmm15[2],xmm4[3],xmm15[3],xmm4[4],xmm15[4],xmm4[5],xmm15[5],xmm4[6],xmm15[6],xmm4[7],xmm15[7]
9516; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
9517; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
9518; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm24
9519; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9520; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm6 = zmm0[2,3,2,3],zmm3[0,1,0,1]
9521; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9522; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
9523; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
9524; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm0 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
9525; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
9526; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm21
9527; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9528; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm22 = zmm0[2,3,2,3],zmm3[0,1,0,1]
9529; AVX512DQ-NEXT:    vmovdqa64 %xmm18, %xmm2
9530; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3],xmm14[4],xmm2[4],xmm14[5],xmm2[5],xmm14[6],xmm2[6],xmm14[7],xmm2[7]
9531; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
9532; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm12, %xmm12
9533; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm20
9534; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9535; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm30 = zmm0[2,3,2,3],zmm12[0,1,0,1]
9536; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
9537; AVX512DQ-NEXT:    vmovdqa64 %ymm0, %ymm18
9538; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15]
9539; AVX512DQ-NEXT:    vmovdqa64 %xmm17, %xmm3
9540; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm3[8],xmm13[8],xmm3[9],xmm13[9],xmm3[10],xmm13[10],xmm3[11],xmm13[11],xmm3[12],xmm13[12],xmm3[13],xmm13[13],xmm3[14],xmm13[14],xmm3[15],xmm13[15]
9541; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
9542; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm8, %xmm12
9543; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
9544; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9545; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm8
9546; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm4[8],xmm15[9],xmm4[9],xmm15[10],xmm4[10],xmm15[11],xmm4[11],xmm15[12],xmm4[12],xmm15[13],xmm4[13],xmm15[14],xmm4[14],xmm15[15],xmm4[15]
9547; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm4
9548; AVX512DQ-NEXT:    vmovdqa64 %xmm27, %xmm7
9549; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm15 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15]
9550; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm0 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
9551; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm15, %xmm15
9552; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
9553; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9554; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm17
9555; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm14[8],xmm2[8],xmm14[9],xmm2[9],xmm14[10],xmm2[10],xmm14[11],xmm2[11],xmm14[12],xmm2[12],xmm14[13],xmm2[13],xmm14[14],xmm2[14],xmm14[15],xmm2[15]
9556; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} xmm14 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15]
9557; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm0 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
9558; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm14, %xmm14
9559; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
9560; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm31, %zmm31
9561; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9562; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm0, %ymm1
9563; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
9564; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,5,7]
9565; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,3,2]
9566; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9
9567; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm1, %zmm9
9568; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
9569; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm22 = zmm6 ^ (zmm1 & (zmm22 ^ zmm6))
9570; AVX512DQ-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
9571; AVX512DQ-NEXT:    # ymm6 = mem[2,3,2,3]
9572; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9573; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm6
9574; AVX512DQ-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload
9575; AVX512DQ-NEXT:    # ymm27 = mem[2,3,2,3]
9576; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9577; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm27, %zmm0, %zmm27
9578; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm27 = zmm6 ^ (zmm1 & (zmm27 ^ zmm6))
9579; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
9580; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm0
9581; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
9582; AVX512DQ-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload
9583; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3],xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7]
9584; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm0
9585; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm6, %xmm6
9586; AVX512DQ-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 16-byte Folded Reload
9587; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7]
9588; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm0
9589; AVX512DQ-NEXT:    vpshufb %xmm0, %xmm10, %xmm10
9590; AVX512DQ-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 16-byte Folded Reload
9591; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm11 = xmm5[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
9592; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[1,1,0,0,4,5,6,7]
9593; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,0]
9594; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm11, %zmm5, %zmm5
9595; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
9596; AVX512DQ-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload
9597; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
9598; AVX512DQ-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 64-byte Folded Reload
9599; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm11 = zmm11[2,3,2,3,6,7,6,7]
9600; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm13 = zmm13[2,3,2,3,6,7,6,7]
9601; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm11))
9602; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
9603; AVX512DQ-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload
9604; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm11 = zmm11[2,3,2,3,6,7,6,7]
9605; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm21 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255]
9606; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm21 & (zmm11 ^ zmm13))
9607; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm8 = zmm8[0,1,0,1,4,5,4,5]
9608; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm7 = zmm17[0,1,0,1,4,5,4,5]
9609; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm8 ^ (zmm21 & (zmm7 ^ zmm8))
9610; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm15[0,1,0,1]
9611; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1]
9612; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm13 = ymm14[0,1,0,1]
9613; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm14 = ymm18[2,3,2,3]
9614; AVX512DQ-NEXT:    vpshuflw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
9615; AVX512DQ-NEXT:    # xmm15 = mem[1,1,0,0,4,5,6,7]
9616; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm15 = xmm15[0,1,2,0]
9617; AVX512DQ-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm20 # 64-byte Folded Reload
9618; AVX512DQ-NEXT:    vporq %zmm26, %zmm23, %zmm17
9619; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm18 = zmm20[2,3,2,3,6,7,6,7]
9620; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm17 = zmm17[2,3,2,3,6,7,6,7]
9621; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm17 = zmm18 ^ (zmm21 & (zmm17 ^ zmm18))
9622; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 32-byte Folded Reload
9623; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 32-byte Folded Reload
9624; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm8))
9625; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm8 # 32-byte Folded Reload
9626; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm19 = zmm19 | (zmm8 & mem)
9627; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm19 = zmm19 ^ (mem & (zmm19 ^ zmm12))
9628; AVX512DQ-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload
9629; AVX512DQ-NEXT:    # zmm8 = mem[2,3,2,3,6,7,6,7]
9630; AVX512DQ-NEXT:    vpternlogd {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm11))
9631; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm11 = ymm15[0,0,1,0]
9632; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm11, %zmm14, %zmm11
9633; AVX512DQ-NEXT:    vpternlogd {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm30))
9634; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm22))
9635; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm0 = zmm31[0,1,0,1,4,5,4,5]
9636; AVX512DQ-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm28 ^ (mem & (zmm0 ^ zmm28))
9637; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm7))
9638; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
9639; AVX512DQ-NEXT:    vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload
9640; AVX512DQ-NEXT:    # zmm3 = zmm3[0,1,2,3],mem[2,3,2,3]
9641; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 | (zmm3 & mem)
9642; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm27))
9643; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[0,1,0,1,4,5,4,5]
9644; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm3 = zmm6[0,1,0,1,4,5,4,5]
9645; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm1))
9646; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm1 = zmm10[0,1,0,1,4,5,4,5]
9647; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm5 = zmm5[0,0,1,0,4,4,5,4]
9648; AVX512DQ-NEXT:    vpternlogd {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm1))
9649; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm3))
9650; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9651; AVX512DQ-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload
9652; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7]
9653; AVX512DQ-NEXT:    vpternlogd {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm1))
9654; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm17))
9655; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
9656; AVX512DQ-NEXT:    vmovdqa64 %zmm16, 128(%rax)
9657; AVX512DQ-NEXT:    vmovdqa64 %zmm5, (%rax)
9658; AVX512DQ-NEXT:    vmovdqa64 %zmm9, 320(%rax)
9659; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 256(%rax)
9660; AVX512DQ-NEXT:    vmovdqa64 %zmm11, 192(%rax)
9661; AVX512DQ-NEXT:    vmovdqa64 %zmm8, 384(%rax)
9662; AVX512DQ-NEXT:    vmovdqa64 %zmm19, 64(%rax)
9663; AVX512DQ-NEXT:    addq $1384, %rsp # imm = 0x568
9664; AVX512DQ-NEXT:    vzeroupper
9665; AVX512DQ-NEXT:    retq
9666;
9667; AVX512DQ-FCP-LABEL: store_i8_stride7_vf64:
9668; AVX512DQ-FCP:       # %bb.0:
9669; AVX512DQ-FCP-NEXT:    subq $1432, %rsp # imm = 0x598
9670; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %ymm3
9671; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero,zero
9672; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
9673; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9674; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25]
9675; AVX512DQ-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
9676; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm1
9677; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm25
9678; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
9679; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9680; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm2[28],zero,ymm2[30,31,30,31],zero,ymm2[29],zero,ymm2[31,28,29]
9681; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128,25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128]
9682; AVX512DQ-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
9683; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm3, %ymm1
9684; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm16
9685; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm27
9686; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
9687; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9688; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %ymm2
9689; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero
9690; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %ymm15
9691; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128]
9692; AVX512DQ-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
9693; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm15, %ymm1
9694; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm29
9695; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
9696; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9697; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u],zero,ymm15[30],zero,ymm15[28,u,u,u],zero,ymm15[31],zero,ymm15[29,u]
9698; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27]
9699; AVX512DQ-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
9700; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm2, %ymm1
9701; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm31
9702; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm26
9703; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
9704; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9705; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %ymm2
9706; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29]
9707; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %ymm3
9708; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128]
9709; AVX512DQ-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
9710; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm3, %ymm1
9711; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm4, %ymm21
9712; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
9713; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9714; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm3[27,u,u,u],zero,ymm3[30],zero,ymm3[28,u,u,u],zero,ymm3[31],zero
9715; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm23
9716; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31]
9717; AVX512DQ-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
9718; AVX512DQ-FCP-NEXT:    vpshufb %ymm3, %ymm2, %ymm1
9719; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm20
9720; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm19
9721; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
9722; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9723; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
9724; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rax), %ymm1
9725; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9726; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
9727; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
9728; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
9729; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9730; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm6
9731; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero,zero,zero,ymm6[18]
9732; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm5
9733; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm5[0,1,14],zero,ymm5[12,13,0,1,14,15],zero,ymm5[3,12,13,2,3,16],zero,ymm5[30,31,28,29,16,17],zero,ymm5[31,18,19,28,29,18],zero
9734; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
9735; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9736; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm3
9737; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero
9738; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm2
9739; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[0,1,0,1,14],zero,ymm2[14,15,0,1,14,15],zero,ymm2[13,14,15,16,17,16],zero,ymm2[30,31,30,31,16,17],zero,ymm2[31,28,29,30,31]
9740; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
9741; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9742; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm1
9743; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero
9744; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm14
9745; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm14[13,u,u,u,u,u],zero,ymm14[14,u,u,u,u,u],zero,ymm14[15,u,u,u,u,u],zero,ymm14[16,u,u,u,u,u],zero,ymm14[17,u,u,u]
9746; AVX512DQ-FCP-NEXT:    vpor %ymm4, %ymm7, %ymm4
9747; AVX512DQ-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9748; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %xmm9
9749; AVX512DQ-FCP-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9750; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %xmm0
9751; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9752; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
9753; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm0, %xmm7
9754; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
9755; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm9, %xmm9
9756; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm9, %xmm0
9757; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9758; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm0
9759; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm9
9760; AVX512DQ-FCP-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9761; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
9762; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm9, %xmm9
9763; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
9764; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm0, %xmm11
9765; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm0, %xmm28
9766; AVX512DQ-FCP-NEXT:    vpor %xmm9, %xmm11, %xmm0
9767; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9768; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %xmm9
9769; AVX512DQ-FCP-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9770; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %xmm0
9771; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9772; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
9773; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm9, %xmm9
9774; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128]
9775; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm0, %xmm12
9776; AVX512DQ-FCP-NEXT:    vpor %xmm9, %xmm12, %xmm0
9777; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9778; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm0
9779; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm0, %xmm7
9780; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm0, %xmm24
9781; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm0
9782; AVX512DQ-FCP-NEXT:    vpshufb %xmm10, %xmm0, %xmm10
9783; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm0, %xmm17
9784; AVX512DQ-FCP-NEXT:    vpor %xmm7, %xmm10, %xmm0
9785; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9786; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm10
9787; AVX512DQ-FCP-NEXT:    vpshufb %xmm4, %xmm10, %xmm4
9788; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm9
9789; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm9, %xmm8
9790; AVX512DQ-FCP-NEXT:    vpor %xmm4, %xmm8, %xmm0
9791; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9792; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %xmm8
9793; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm8, %xmm11
9794; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm7
9795; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm7, %xmm13
9796; AVX512DQ-FCP-NEXT:    vporq %xmm11, %xmm13, %xmm30
9797; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm25, %ymm0
9798; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm5, %ymm11
9799; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128]
9800; AVX512DQ-FCP-NEXT:    # ymm12 = mem[0,1,0,1]
9801; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm6, %ymm13
9802; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm0
9803; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9804; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm16, %ymm0
9805; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm6, %ymm11
9806; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23]
9807; AVX512DQ-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
9808; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm5, %ymm13
9809; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm22
9810; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm0
9811; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9812; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm29, %ymm0
9813; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm11
9814; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20]
9815; AVX512DQ-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
9816; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm13
9817; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm29
9818; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm0
9819; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9820; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm31, %ymm0
9821; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm11
9822; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128]
9823; AVX512DQ-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
9824; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm13
9825; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm25
9826; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm21, %ymm0
9827; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm14, %ymm11
9828; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
9829; AVX512DQ-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
9830; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm14, %ymm13
9831; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm18
9832; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm0
9833; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9834; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm20, %ymm0
9835; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm1, %ymm11
9836; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128]
9837; AVX512DQ-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
9838; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm1, %ymm13
9839; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm16
9840; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm13, %zmm0
9841; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9842; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29],zero,zero,zero
9843; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm5[28],zero,ymm5[30,31,30,31],zero,ymm5[29],zero,ymm5[31,28,29]
9844; AVX512DQ-FCP-NEXT:    vporq %ymm6, %ymm5, %ymm20
9845; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero
9846; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u],zero,ymm3[30],zero,ymm3[28,u,u,u],zero,ymm3[31],zero,ymm3[29,u]
9847; AVX512DQ-FCP-NEXT:    vpor %ymm2, %ymm3, %ymm0
9848; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9849; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29]
9850; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm14[27,u,u,u],zero,ymm14[30],zero,ymm14[28,u,u,u],zero,ymm14[31],zero
9851; AVX512DQ-FCP-NEXT:    vporq %ymm1, %ymm0, %ymm21
9852; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,ymm15[14],zero,zero,zero,zero,zero,zero,ymm15[15],zero,zero,zero,zero,zero,zero,ymm15[16],zero,zero,zero,zero,zero,zero,ymm15[17],zero,zero,zero,zero,zero
9853; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm26, %ymm2
9854; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[0,1,0,1,14],zero,ymm2[14,15,0,1,14,15],zero,ymm2[13,14,15,16,17,16],zero,ymm2[30,31,30,31,16,17],zero,ymm2[31,28,29,30,31]
9855; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
9856; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9857; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm29, %ymm0
9858; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm15, %ymm0
9859; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm1
9860; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
9861; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9862; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm27, %ymm3
9863; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero,zero,ymm3[18]
9864; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
9865; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[0,1,14],zero,ymm2[12,13,0,1,14,15],zero,ymm2[3,12,13,2,3,16],zero,ymm2[30,31,28,29,16,17],zero,ymm2[31,18,19,28,29,18],zero
9866; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
9867; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9868; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm3, %ymm0
9869; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm22, %ymm1
9870; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
9871; AVX512DQ-FCP-NEXT:    vporq %ymm0, %ymm1, %ymm29
9872; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm23, %ymm2
9873; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm18, %ymm0
9874; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
9875; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm3
9876; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm16, %ymm1
9877; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
9878; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
9879; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9880; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero
9881; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[13,u,u,u,u,u],zero,ymm2[14,u,u,u,u,u],zero,ymm2[15,u,u,u,u,u],zero,ymm2[16,u,u,u,u,u],zero,ymm2[17,u,u,u]
9882; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
9883; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9884; AVX512DQ-FCP-NEXT:    vmovdqa (%rax), %xmm1
9885; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [2,2,3,3,2,2,3,3]
9886; AVX512DQ-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
9887; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,5,6]
9888; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm1, %xmm23
9889; AVX512DQ-FCP-NEXT:    vpermd %ymm0, %ymm3, %ymm0
9890; AVX512DQ-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
9891; AVX512DQ-FCP-NEXT:    vmovdqa (%rax), %ymm15
9892; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,ymm15[13],zero,zero,zero,zero,zero,zero,ymm15[14],zero,zero,zero,zero,zero,zero,ymm15[15],zero,zero,zero,zero,zero,zero,ymm15[16],zero,zero,zero,zero,zero,zero,ymm15[17],zero,zero
9893; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm18
9894; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
9895; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
9896; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
9897; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm27
9898; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
9899; AVX512DQ-FCP-NEXT:    vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm16 # 16-byte Reload
9900; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm16, %xmm1
9901; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm1[8],xmm12[9],xmm1[9],xmm12[10],xmm1[10],xmm12[11],xmm1[11],xmm12[12],xmm1[12],xmm12[13],xmm1[13],xmm12[14],xmm1[14],xmm12[15],xmm1[15]
9902; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
9903; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9904; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm19
9905; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm24, %xmm4
9906; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm17, %xmm1
9907; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
9908; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
9909; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
9910; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm24
9911; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
9912; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm28, %xmm5
9913; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm5[8],xmm13[9],xmm5[9],xmm13[10],xmm5[10],xmm13[11],xmm5[11],xmm13[12],xmm5[12],xmm13[13],xmm5[13],xmm13[14],xmm5[14],xmm13[15],xmm5[15]
9914; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
9915; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
9916; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm2, %zmm17
9917; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rax), %xmm6
9918; AVX512DQ-FCP-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm6[0,1,2,3,4,5,5,6]
9919; AVX512DQ-FCP-NEXT:    vpermd %ymm14, %ymm3, %ymm31
9920; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15]
9921; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm14 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
9922; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm3, %xmm0
9923; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm0, %ymm22
9924; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9925; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9926; AVX512DQ-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm11 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
9927; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm11, %xmm11
9928; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
9929; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm11, %zmm3, %zmm28
9930; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
9931; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm14 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
9932; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm11, %xmm11
9933; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm26 # 16-byte Folded Reload
9934; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
9935; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
9936; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm9, %xmm9
9937; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload
9938; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
9939; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
9940; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm7, %xmm7
9941; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm30, %zmm7, %zmm7
9942; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3],xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7]
9943; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm8, %xmm8
9944; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm14 = zmm20[2,3,2,3],zmm8[0,1,0,1]
9945; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
9946; AVX512DQ-FCP-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload
9947; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
9948; AVX512DQ-FCP-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm20 # 64-byte Folded Reload
9949; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm8 = zmm8[2,3,2,3,6,7,6,7]
9950; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm20 = zmm20[2,3,2,3,6,7,6,7]
9951; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm20 = zmm20 ^ (mem & (zmm20 ^ zmm8))
9952; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
9953; AVX512DQ-FCP-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload
9954; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm30 = zmm8[2,3,2,3,6,7,6,7]
9955; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255]
9956; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm8 & (zmm30 ^ zmm20))
9957; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm5 = zmm19[0,1,0,1,4,5,4,5]
9958; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm20 = zmm17[0,1,0,1,4,5,4,5]
9959; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm20 = zmm5 ^ (zmm8 & (zmm20 ^ zmm5))
9960; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9961; AVX512DQ-FCP-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload
9962; AVX512DQ-FCP-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm5 # 64-byte Folded Reload
9963; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm25 = zmm1[2,3,2,3,6,7,6,7]
9964; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm1 = zmm5[2,3,2,3,6,7,6,7]
9965; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm25 ^ (zmm8 & (zmm1 ^ zmm25))
9966; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm16, %xmm4
9967; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7]
9968; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm5, %xmm5
9969; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
9970; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm10 = zmm3[2,3,2,3],zmm5[0,1,0,1]
9971; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
9972; AVX512DQ-FCP-NEXT:    vpshufb %xmm11, %xmm5, %xmm5
9973; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
9974; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm14 ^ (zmm8 & (zmm10 ^ zmm14))
9975; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm21 = zmm21[2,3,2,3],zmm5[0,1,0,1]
9976; AVX512DQ-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
9977; AVX512DQ-FCP-NEXT:    # ymm11 = mem[2,3,2,3]
9978; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9979; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm0, %zmm11
9980; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm14 = ymm29[2,3,2,3]
9981; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9982; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm0, %zmm14
9983; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm14 = zmm11 ^ (zmm8 & (zmm14 ^ zmm11))
9984; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm23, %xmm0
9985; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm0[1,1,0,0,4,5,6,7]
9986; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm11 = [0,1,0,1,2,0,0,1]
9987; AVX512DQ-FCP-NEXT:    vpermd %ymm8, %ymm11, %ymm16
9988; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
9989; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm12
9990; AVX512DQ-FCP-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
9991; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm6[1,1,0,0,4,5,6,7]
9992; AVX512DQ-FCP-NEXT:    vpermd %ymm0, %ymm11, %ymm2
9993; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm11 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
9994; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
9995; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm15[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
9996; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [4,5,4,5,5,7,4,5]
9997; AVX512DQ-FCP-NEXT:    vpermd %ymm15, %ymm17, %ymm15
9998; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
9999; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = zero,ymm13[13],zero,zero,zero,zero,zero,zero,ymm13[14],zero,zero,zero,zero,zero,zero,ymm13[15],zero,zero,zero,zero,zero,zero,ymm13[16],zero,zero,zero,zero,zero,zero,ymm13[17],zero,zero
10000; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm13 = ymm13[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
10001; AVX512DQ-FCP-NEXT:    vpermd %ymm13, %ymm17, %ymm13
10002; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm24[0,1,0,1]
10003; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm17 = ymm27[0,1,0,1]
10004; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm22[0,1,0,1]
10005; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,1,0]
10006; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm12[0,0,1,0]
10007; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3]
10008; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
10009; AVX512DQ-FCP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm13
10010; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm8, %zmm8
10011; AVX512DQ-FCP-NEXT:    vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Folded Reload
10012; AVX512DQ-FCP-NEXT:    # zmm13 = mem[2,3,2,3,6,7,6,7]
10013; AVX512DQ-FCP-NEXT:    vpternlogd {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm30))
10014; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload
10015; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 32-byte Folded Reload
10016; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm17 = zmm17 ^ (mem & (zmm17 ^ zmm6))
10017; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload
10018; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm18 = zmm18 | (zmm3 & mem)
10019; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm18 = zmm18 ^ (mem & (zmm18 ^ zmm17))
10020; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm31, %zmm5, %zmm3
10021; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm5 = zmm28[0,1,0,1,4,5,4,5]
10022; AVX512DQ-FCP-NEXT:    vpternlogd {{.*#+}} zmm5 = zmm3 ^ (mem & (zmm5 ^ zmm3))
10023; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm20))
10024; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm3 = zmm26[0,1,0,1,4,5,4,5]
10025; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm6 = zmm9[0,1,0,1,4,5,4,5]
10026; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm3))
10027; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm16, %zmm3
10028; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm4 = zmm7[0,1,0,1,4,5,4,5]
10029; AVX512DQ-FCP-NEXT:    vpternlogd {{.*#+}} zmm4 = zmm3 ^ (mem & (zmm4 ^ zmm3))
10030; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm6))
10031; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
10032; AVX512DQ-FCP-NEXT:    vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload
10033; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm15, %zmm6
10034; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm3 = zmm3[2,3,2,3,6,7,6,7]
10035; AVX512DQ-FCP-NEXT:    vpternlogd {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm3))
10036; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm1))
10037; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
10038; AVX512DQ-FCP-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm21))
10039; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm10))
10040; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
10041; AVX512DQ-FCP-NEXT:    vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload
10042; AVX512DQ-FCP-NEXT:    # zmm1 = zmm1[0,1,2,3],mem[2,3,2,3]
10043; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm1 & mem)
10044; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm14))
10045; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
10046; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, 320(%rax)
10047; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, 192(%rax)
10048; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, 128(%rax)
10049; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, (%rax)
10050; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, 256(%rax)
10051; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm18, 64(%rax)
10052; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, 384(%rax)
10053; AVX512DQ-FCP-NEXT:    addq $1432, %rsp # imm = 0x598
10054; AVX512DQ-FCP-NEXT:    vzeroupper
10055; AVX512DQ-FCP-NEXT:    retq
10056;
10057; AVX512BW-LABEL: store_i8_stride7_vf64:
10058; AVX512BW:       # %bb.0:
10059; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
10060; AVX512BW-NEXT:    vmovdqa (%rax), %ymm13
10061; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} ymm26 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31]
10062; AVX512BW-NEXT:    vpshufb %ymm26, %ymm13, %ymm0
10063; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6]
10064; AVX512BW-NEXT:    # ymm1 = mem[0,1,0,1]
10065; AVX512BW-NEXT:    vpermw %ymm13, %ymm1, %ymm1
10066; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
10067; AVX512BW-NEXT:    vmovdqa (%r9), %ymm9
10068; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} ymm17 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128]
10069; AVX512BW-NEXT:    vpshufb %ymm17, %ymm9, %ymm1
10070; AVX512BW-NEXT:    vmovdqa (%r8), %ymm10
10071; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} ymm21 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128]
10072; AVX512BW-NEXT:    vpshufb %ymm21, %ymm10, %ymm2
10073; AVX512BW-NEXT:    vpor %ymm1, %ymm2, %ymm2
10074; AVX512BW-NEXT:    vmovdqa (%r9), %xmm1
10075; AVX512BW-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10076; AVX512BW-NEXT:    vmovdqa (%r8), %xmm12
10077; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm12[8],xmm1[8],xmm12[9],xmm1[9],xmm12[10],xmm1[10],xmm12[11],xmm1[11],xmm12[12],xmm1[12],xmm12[13],xmm1[13],xmm12[14],xmm1[14],xmm12[15],xmm1[15]
10078; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
10079; AVX512BW-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1]
10080; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm4, %zmm8
10081; AVX512BW-NEXT:    movabsq $2323999253380730912, %r10 # imm = 0x2040810204081020
10082; AVX512BW-NEXT:    kmovq %r10, %k1
10083; AVX512BW-NEXT:    vmovdqu8 %zmm0, %zmm8 {%k1}
10084; AVX512BW-NEXT:    vmovdqa (%rdx), %ymm14
10085; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm0 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
10086; AVX512BW-NEXT:    vpshufb %ymm0, %ymm14, %ymm2
10087; AVX512BW-NEXT:    vmovdqa (%rcx), %ymm15
10088; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} ymm20 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128]
10089; AVX512BW-NEXT:    vpshufb %ymm20, %ymm15, %ymm4
10090; AVX512BW-NEXT:    vpor %ymm2, %ymm4, %ymm2
10091; AVX512BW-NEXT:    vmovdqa (%rdx), %xmm4
10092; AVX512BW-NEXT:    vmovdqa (%rcx), %xmm5
10093; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
10094; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
10095; AVX512BW-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1]
10096; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm6, %zmm22
10097; AVX512BW-NEXT:    vmovdqa64 (%rdi), %ymm18
10098; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} ymm24 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128]
10099; AVX512BW-NEXT:    vpshufb %ymm24, %ymm18, %ymm2
10100; AVX512BW-NEXT:    vmovdqa64 (%rsi), %ymm19
10101; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} ymm25 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18]
10102; AVX512BW-NEXT:    vpshufb %ymm25, %ymm19, %ymm6
10103; AVX512BW-NEXT:    vpor %ymm2, %ymm6, %ymm2
10104; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm6
10105; AVX512BW-NEXT:    vmovdqa (%rsi), %xmm7
10106; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm23 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15]
10107; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm23 = xmm23[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
10108; AVX512BW-NEXT:    vpermq {{.*#+}} ymm23 = ymm23[0,1,0,1]
10109; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm23, %zmm3
10110; AVX512BW-NEXT:    movabsq $435749860008887046, %r10 # imm = 0x60C183060C18306
10111; AVX512BW-NEXT:    kmovq %r10, %k1
10112; AVX512BW-NEXT:    vmovdqu8 %zmm22, %zmm3 {%k1}
10113; AVX512BW-NEXT:    movabsq $4066998693416279096, %r10 # imm = 0x3870E1C3870E1C38
10114; AVX512BW-NEXT:    kmovq %r10, %k1
10115; AVX512BW-NEXT:    vmovdqu8 %zmm8, %zmm3 {%k1}
10116; AVX512BW-NEXT:    vmovdqa64 32(%rdx), %ymm29
10117; AVX512BW-NEXT:    vpshufb %ymm0, %ymm29, %ymm0
10118; AVX512BW-NEXT:    vmovdqa64 32(%rcx), %ymm30
10119; AVX512BW-NEXT:    vpshufb %ymm20, %ymm30, %ymm8
10120; AVX512BW-NEXT:    vpor %ymm0, %ymm8, %ymm0
10121; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm20 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27]
10122; AVX512BW-NEXT:    vpshufb %ymm20, %ymm29, %ymm8
10123; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm22 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128]
10124; AVX512BW-NEXT:    vpshufb %ymm22, %ymm30, %ymm23
10125; AVX512BW-NEXT:    vporq %ymm8, %ymm23, %ymm8
10126; AVX512BW-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
10127; AVX512BW-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm0
10128; AVX512BW-NEXT:    vmovdqa64 32(%rsi), %ymm28
10129; AVX512BW-NEXT:    vmovdqa64 32(%rdi), %ymm16
10130; AVX512BW-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm16[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
10131; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm8 = ymm8[0,0,1,1,4,4,5,5]
10132; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} ymm23 = [5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6]
10133; AVX512BW-NEXT:    movl $676341840, %r10d # imm = 0x28502850
10134; AVX512BW-NEXT:    kmovd %r10d, %k1
10135; AVX512BW-NEXT:    vpshufb %ymm23, %ymm28, %ymm8 {%k1}
10136; AVX512BW-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
10137; AVX512BW-NEXT:    vpshufb %ymm24, %ymm16, %ymm24
10138; AVX512BW-NEXT:    vpshufb %ymm25, %ymm28, %ymm25
10139; AVX512BW-NEXT:    vporq %ymm24, %ymm25, %ymm24
10140; AVX512BW-NEXT:    vinserti64x4 $1, %ymm8, %zmm24, %zmm8
10141; AVX512BW-NEXT:    movabsq $3485998880071096368, %r10 # imm = 0x3060C183060C1830
10142; AVX512BW-NEXT:    kmovq %r10, %k2
10143; AVX512BW-NEXT:    vmovdqu8 %zmm0, %zmm8 {%k2}
10144; AVX512BW-NEXT:    vmovdqa64 32(%r9), %ymm31
10145; AVX512BW-NEXT:    vpshufb %ymm17, %ymm31, %ymm17
10146; AVX512BW-NEXT:    vmovdqa 32(%r8), %ymm1
10147; AVX512BW-NEXT:    vpshufb %ymm21, %ymm1, %ymm21
10148; AVX512BW-NEXT:    vporq %ymm17, %ymm21, %ymm17
10149; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm24 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31]
10150; AVX512BW-NEXT:    vpshufb %ymm24, %ymm1, %ymm21
10151; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm25 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128]
10152; AVX512BW-NEXT:    vpshufb %ymm25, %ymm31, %ymm27
10153; AVX512BW-NEXT:    vporq %ymm21, %ymm27, %ymm21
10154; AVX512BW-NEXT:    vpermq {{.*#+}} ymm21 = ymm21[2,3,2,3]
10155; AVX512BW-NEXT:    vinserti64x4 $1, %ymm21, %zmm17, %zmm21
10156; AVX512BW-NEXT:    vmovdqa64 32(%rax), %ymm17
10157; AVX512BW-NEXT:    vpshufb %ymm26, %ymm17, %ymm27
10158; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm26 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28]
10159; AVX512BW-NEXT:    vpermw %ymm17, %ymm26, %ymm11
10160; AVX512BW-NEXT:    vinserti64x4 $1, %ymm11, %zmm27, %zmm11
10161; AVX512BW-NEXT:    movabsq $145249953336295682, %r10 # imm = 0x204081020408102
10162; AVX512BW-NEXT:    kmovq %r10, %k3
10163; AVX512BW-NEXT:    vmovdqu8 %zmm11, %zmm21 {%k3}
10164; AVX512BW-NEXT:    movabsq $-4357498600088870461, %r10 # imm = 0xC3870E1C3870E1C3
10165; AVX512BW-NEXT:    kmovq %r10, %k3
10166; AVX512BW-NEXT:    vmovdqu8 %zmm21, %zmm8 {%k3}
10167; AVX512BW-NEXT:    vpshufhw {{.*#+}} ymm11 = ymm16[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14]
10168; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm11 = ymm11[2,2,3,3,6,6,7,7]
10169; AVX512BW-NEXT:    movl $338170920, %r10d # imm = 0x14281428
10170; AVX512BW-NEXT:    kmovd %r10d, %k4
10171; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} ymm27 = [13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14]
10172; AVX512BW-NEXT:    vpshufb %ymm27, %ymm28, %ymm11 {%k4}
10173; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128]
10174; AVX512BW-NEXT:    # ymm2 = mem[0,1,0,1]
10175; AVX512BW-NEXT:    vpshufb %ymm2, %ymm28, %ymm21
10176; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} ymm28 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25]
10177; AVX512BW-NEXT:    vpshufb %ymm28, %ymm16, %ymm16
10178; AVX512BW-NEXT:    vporq %ymm21, %ymm16, %ymm16
10179; AVX512BW-NEXT:    vinserti64x4 $1, %ymm11, %zmm16, %zmm16
10180; AVX512BW-NEXT:    vpshufhw {{.*#+}} ymm11 = ymm29[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
10181; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm21 = ymm11[0,2,3,3,4,6,7,7]
10182; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm11 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12]
10183; AVX512BW-NEXT:    vpshufb %ymm11, %ymm30, %ymm0
10184; AVX512BW-NEXT:    vmovdqu8 %ymm21, %ymm0 {%k1}
10185; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm21 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm29[24,25],zero,ymm29[23],zero,ymm29[21,22,23,26],zero,ymm29[24],zero,ymm29[28,29,26,27]
10186; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm29 = ymm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm30[25],zero,ymm30[23],zero,zero,zero,zero,ymm30[26],zero,ymm30[24],zero,zero,zero,zero
10187; AVX512BW-NEXT:    vporq %ymm21, %ymm29, %ymm21
10188; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm21, %zmm0
10189; AVX512BW-NEXT:    vpermq {{.*#+}} zmm16 = zmm16[2,3,2,3,6,7,6,7]
10190; AVX512BW-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7]
10191; AVX512BW-NEXT:    movabsq $1742999440035548184, %r10 # imm = 0x183060C183060C18
10192; AVX512BW-NEXT:    kmovq %r10, %k3
10193; AVX512BW-NEXT:    vmovdqu8 %zmm0, %zmm16 {%k3}
10194; AVX512BW-NEXT:    vmovdqa64 (%r9), %zmm0
10195; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm0[4,5,6,7]
10196; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,zmm1[23],zero,zmm1[23,24,25,26],zero,zmm1[24],zero,zmm1[30,31,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,61],zero,zmm1[59],zero,zero,zero,zero,zmm1[62],zero,zmm1[60],zero,zero,zero,zero,zmm1[63],zero
10197; AVX512BW-NEXT:    vmovdqa64 (%r8), %zmm29
10198; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm21 = zmm31[0,1,2,3],zmm29[4,5,6,7]
10199; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm21 = zmm21[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm21[25],zero,zmm21[23],zero,zero,zero,zero,zmm21[26],zero,zmm21[24],zero,zero,zmm21[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm21[59],zero,zero,zero,zero,zmm21[62],zero,zmm21[60],zero,zero,zero,zero,zmm21[63],zero,zmm21[61]
10200; AVX512BW-NEXT:    vporq %zmm1, %zmm21, %zmm1
10201; AVX512BW-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7]
10202; AVX512BW-NEXT:    movabsq $6971997760142192736, %r10 # imm = 0x60C183060C183060
10203; AVX512BW-NEXT:    kmovq %r10, %k3
10204; AVX512BW-NEXT:    vmovdqu8 %zmm1, %zmm16 {%k3}
10205; AVX512BW-NEXT:    vmovdqa64 (%rax), %zmm21
10206; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,62,61,62,63,63,62,62,63,62,61,62,63,63,62,62,63]
10207; AVX512BW-NEXT:    vpermi2w %zmm21, %zmm17, %zmm1
10208; AVX512BW-NEXT:    movabsq $-9150747060186627967, %rax # imm = 0x8102040810204081
10209; AVX512BW-NEXT:    kmovq %rax, %k5
10210; AVX512BW-NEXT:    vmovdqu8 %zmm1, %zmm16 {%k5}
10211; AVX512BW-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm18[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
10212; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5]
10213; AVX512BW-NEXT:    vpshufb %ymm23, %ymm19, %ymm1 {%k1}
10214; AVX512BW-NEXT:    vpshufb %ymm2, %ymm19, %ymm2
10215; AVX512BW-NEXT:    vpshufb %ymm28, %ymm18, %ymm23
10216; AVX512BW-NEXT:    vporq %ymm2, %ymm23, %ymm2
10217; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
10218; AVX512BW-NEXT:    vmovdqa64 (%rdx), %zmm2
10219; AVX512BW-NEXT:    vinserti64x4 $1, %ymm14, %zmm2, %zmm2
10220; AVX512BW-NEXT:    vpshufb %zmm20, %zmm2, %zmm2
10221; AVX512BW-NEXT:    vmovdqa64 (%rcx), %zmm20
10222; AVX512BW-NEXT:    vinserti64x4 $1, %ymm15, %zmm20, %zmm20
10223; AVX512BW-NEXT:    vpshufb %zmm22, %zmm20, %zmm20
10224; AVX512BW-NEXT:    vporq %zmm2, %zmm20, %zmm2
10225; AVX512BW-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7]
10226; AVX512BW-NEXT:    vpermq {{.*#+}} zmm20 = zmm2[2,3,2,3,6,7,6,7]
10227; AVX512BW-NEXT:    vmovdqu8 %zmm1, %zmm20 {%k3}
10228; AVX512BW-NEXT:    vmovdqa64 32(%r9), %xmm22
10229; AVX512BW-NEXT:    vinserti64x4 $1, %ymm10, %zmm29, %zmm1
10230; AVX512BW-NEXT:    vpshufb %zmm24, %zmm1, %zmm1
10231; AVX512BW-NEXT:    vmovdqa64 32(%r8), %xmm23
10232; AVX512BW-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm0
10233; AVX512BW-NEXT:    vpshufb %zmm25, %zmm0, %zmm2
10234; AVX512BW-NEXT:    vmovdqa 32(%rdx), %xmm0
10235; AVX512BW-NEXT:    vporq %zmm1, %zmm2, %zmm1
10236; AVX512BW-NEXT:    vmovdqa 32(%rcx), %xmm2
10237; AVX512BW-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7]
10238; AVX512BW-NEXT:    vinserti64x4 $1, %ymm13, %zmm21, %zmm13
10239; AVX512BW-NEXT:    vpermw %zmm13, %zmm26, %zmm24
10240; AVX512BW-NEXT:    movabsq $1161999626690365456, %rax # imm = 0x1020408102040810
10241; AVX512BW-NEXT:    kmovq %rax, %k5
10242; AVX512BW-NEXT:    vmovdqu8 %zmm24, %zmm1 {%k5}
10243; AVX512BW-NEXT:    vmovdqa64 32(%rdi), %xmm24
10244; AVX512BW-NEXT:    movabsq $2033499346708139548, %rax # imm = 0x1C3870E1C3870E1C
10245; AVX512BW-NEXT:    kmovq %rax, %k5
10246; AVX512BW-NEXT:    vmovdqu8 %zmm1, %zmm20 {%k5}
10247; AVX512BW-NEXT:    vmovdqa 32(%rsi), %xmm1
10248; AVX512BW-NEXT:    vpshufhw {{.*#+}} ymm18 = ymm18[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14]
10249; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm25 = ymm18[2,2,3,3,6,6,7,7]
10250; AVX512BW-NEXT:    vpshufb %ymm27, %ymm19, %ymm25 {%k4}
10251; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm19 = xmm24[0],xmm1[0],xmm24[1],xmm1[1],xmm24[2],xmm1[2],xmm24[3],xmm1[3],xmm24[4],xmm1[4],xmm24[5],xmm1[5],xmm24[6],xmm1[6],xmm24[7],xmm1[7]
10252; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} xmm18 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
10253; AVX512BW-NEXT:    vpshufb %xmm18, %xmm19, %xmm19
10254; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm25 = zmm25[2,3,2,3],zmm19[0,1,0,1]
10255; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm26 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
10256; AVX512BW-NEXT:    vpshufb %ymm11, %ymm15, %ymm11
10257; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} xmm19 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
10258; AVX512BW-NEXT:    vpshufb %xmm19, %xmm26, %xmm15
10259; AVX512BW-NEXT:    vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
10260; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm14 = ymm14[0,2,3,3,4,6,7,7]
10261; AVX512BW-NEXT:    vmovdqu8 %ymm14, %ymm11 {%k1}
10262; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm14 = zmm11[2,3,2,3],zmm15[0,1,0,1]
10263; AVX512BW-NEXT:    vmovdqu8 %zmm25, %zmm14 {%k2}
10264; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7]
10265; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm15 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
10266; AVX512BW-NEXT:    vpshufb %xmm15, %xmm11, %xmm11
10267; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm9[27],zero,zero,zero,zero,ymm9[30],zero,ymm9[28],zero,zero,zero,zero,ymm9[31],zero
10268; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm10[27],zero,zero,zero,zero,ymm10[30],zero,ymm10[28],zero,zero,zero,zero,ymm10[31],zero,ymm10[29]
10269; AVX512BW-NEXT:    vpor %ymm9, %ymm10, %ymm9
10270; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm9 = zmm9[2,3,2,3],zmm11[0,1,0,1]
10271; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm10 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16]
10272; AVX512BW-NEXT:    vpermw %zmm21, %zmm10, %zmm10
10273; AVX512BW-NEXT:    movabsq $580999813345182728, %rax # imm = 0x810204081020408
10274; AVX512BW-NEXT:    kmovq %rax, %k1
10275; AVX512BW-NEXT:    vmovdqu8 %zmm10, %zmm9 {%k1}
10276; AVX512BW-NEXT:    movabsq $1016749673354069774, %rax # imm = 0xE1C3870E1C3870E
10277; AVX512BW-NEXT:    kmovq %rax, %k1
10278; AVX512BW-NEXT:    vmovdqu8 %zmm9, %zmm14 {%k1}
10279; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm10 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
10280; AVX512BW-NEXT:    vpshufb %xmm10, %xmm2, %xmm9
10281; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm11 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
10282; AVX512BW-NEXT:    vpshufb %xmm11, %xmm0, %xmm25
10283; AVX512BW-NEXT:    vporq %xmm9, %xmm25, %xmm9
10284; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
10285; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
10286; AVX512BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm9, %zmm0
10287; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
10288; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm9
10289; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} xmm25 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
10290; AVX512BW-NEXT:    vpshufb %xmm25, %xmm24, %xmm26
10291; AVX512BW-NEXT:    vporq %xmm9, %xmm26, %xmm9
10292; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm24[8],xmm1[9],xmm24[9],xmm1[10],xmm24[10],xmm1[11],xmm24[11],xmm1[12],xmm24[12],xmm1[13],xmm24[13],xmm1[14],xmm24[14],xmm1[15],xmm24[15]
10293; AVX512BW-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
10294; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
10295; AVX512BW-NEXT:    vinserti32x4 $2, %xmm1, %zmm9, %zmm1
10296; AVX512BW-NEXT:    vpermq {{.*#+}} zmm9 = zmm1[0,1,0,1,4,5,4,5]
10297; AVX512BW-NEXT:    vmovdqu8 %zmm0, %zmm9 {%k3}
10298; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm0 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
10299; AVX512BW-NEXT:    vpshufb %xmm0, %xmm22, %xmm1
10300; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm24 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,52,53,52,53,53,54,53,54,52,53,52,53,53,54,53,54]
10301; AVX512BW-NEXT:    vpermi2w %zmm21, %zmm17, %zmm24
10302; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} xmm17 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128]
10303; AVX512BW-NEXT:    vpshufb %xmm17, %xmm23, %xmm21
10304; AVX512BW-NEXT:    vporq %xmm1, %xmm21, %xmm1
10305; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm21 = xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15]
10306; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm21 = xmm21[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
10307; AVX512BW-NEXT:    vinserti32x4 $2, %xmm21, %zmm1, %zmm1
10308; AVX512BW-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[0,1,0,1,4,5,4,5]
10309; AVX512BW-NEXT:    movabsq $290499906672591364, %rax # imm = 0x408102040810204
10310; AVX512BW-NEXT:    kmovq %rax, %k1
10311; AVX512BW-NEXT:    vmovdqu8 %zmm24, %zmm1 {%k1}
10312; AVX512BW-NEXT:    movabsq $-8714997200177740921, %rax # imm = 0x870E1C3870E1C387
10313; AVX512BW-NEXT:    kmovq %rax, %k1
10314; AVX512BW-NEXT:    vmovdqu8 %zmm1, %zmm9 {%k1}
10315; AVX512BW-NEXT:    vpshufb %xmm10, %xmm5, %xmm1
10316; AVX512BW-NEXT:    vpshufb %xmm11, %xmm4, %xmm10
10317; AVX512BW-NEXT:    vpor %xmm1, %xmm10, %xmm1
10318; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
10319; AVX512BW-NEXT:    vpshufb %xmm19, %xmm4, %xmm4
10320; AVX512BW-NEXT:    vinserti32x4 $2, %xmm1, %zmm4, %zmm1
10321; AVX512BW-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
10322; AVX512BW-NEXT:    vpshufb %xmm25, %xmm6, %xmm4
10323; AVX512BW-NEXT:    vpor %xmm2, %xmm4, %xmm2
10324; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
10325; AVX512BW-NEXT:    vpshufb %xmm18, %xmm4, %xmm4
10326; AVX512BW-NEXT:    vinserti32x4 $2, %xmm2, %zmm4, %zmm2
10327; AVX512BW-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[0,1,0,1,4,5,4,5]
10328; AVX512BW-NEXT:    vpermq {{.*#+}} zmm2 = zmm2[0,1,0,1,4,5,4,5]
10329; AVX512BW-NEXT:    movabsq $871499720017774092, %rax # imm = 0xC183060C183060C
10330; AVX512BW-NEXT:    kmovq %rax, %k1
10331; AVX512BW-NEXT:    vmovdqu8 %zmm1, %zmm2 {%k1}
10332; AVX512BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
10333; AVX512BW-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
10334; AVX512BW-NEXT:    vpshufb %xmm17, %xmm12, %xmm1
10335; AVX512BW-NEXT:    vpor %xmm0, %xmm1, %xmm0
10336; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3],xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7]
10337; AVX512BW-NEXT:    vpshufb %xmm15, %xmm1, %xmm1
10338; AVX512BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
10339; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20]
10340; AVX512BW-NEXT:    vpermw %zmm13, %zmm1, %zmm1
10341; AVX512BW-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
10342; AVX512BW-NEXT:    movabsq $4647998506761461824, %rax # imm = 0x4081020408102040
10343; AVX512BW-NEXT:    kmovq %rax, %k1
10344; AVX512BW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
10345; AVX512BW-NEXT:    movabsq $8133997386832558192, %rax # imm = 0x70E1C3870E1C3870
10346; AVX512BW-NEXT:    kmovq %rax, %k1
10347; AVX512BW-NEXT:    vmovdqu8 %zmm0, %zmm2 {%k1}
10348; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
10349; AVX512BW-NEXT:    vmovdqa64 %zmm2, (%rax)
10350; AVX512BW-NEXT:    vmovdqa64 %zmm8, 320(%rax)
10351; AVX512BW-NEXT:    vmovdqa64 %zmm9, 256(%rax)
10352; AVX512BW-NEXT:    vmovdqa64 %zmm14, 192(%rax)
10353; AVX512BW-NEXT:    vmovdqa64 %zmm20, 128(%rax)
10354; AVX512BW-NEXT:    vmovdqa64 %zmm3, 64(%rax)
10355; AVX512BW-NEXT:    vmovdqa64 %zmm16, 384(%rax)
10356; AVX512BW-NEXT:    vzeroupper
10357; AVX512BW-NEXT:    retq
10358;
10359; AVX512BW-FCP-LABEL: store_i8_stride7_vf64:
10360; AVX512BW-FCP:       # %bb.0:
10361; AVX512BW-FCP-NEXT:    subq $104, %rsp
10362; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
10363; AVX512BW-FCP-NEXT:    vmovdqa (%rax), %ymm2
10364; AVX512BW-FCP-NEXT:    vmovdqu %ymm2, (%rsp) # 32-byte Spill
10365; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm20 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31]
10366; AVX512BW-FCP-NEXT:    vpshufb %ymm20, %ymm2, %ymm0
10367; AVX512BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6]
10368; AVX512BW-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
10369; AVX512BW-FCP-NEXT:    vpermw %ymm2, %ymm1, %ymm1
10370; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm2
10371; AVX512BW-FCP-NEXT:    vmovdqa (%r9), %ymm1
10372; AVX512BW-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10373; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128]
10374; AVX512BW-FCP-NEXT:    vpshufb %ymm0, %ymm1, %ymm3
10375; AVX512BW-FCP-NEXT:    vmovdqa (%r8), %ymm4
10376; AVX512BW-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10377; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128]
10378; AVX512BW-FCP-NEXT:    vpshufb %ymm1, %ymm4, %ymm4
10379; AVX512BW-FCP-NEXT:    vpor %ymm3, %ymm4, %ymm3
10380; AVX512BW-FCP-NEXT:    vmovdqa64 (%r9), %xmm16
10381; AVX512BW-FCP-NEXT:    vmovdqa (%r8), %xmm15
10382; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm15[8],xmm16[8],xmm15[9],xmm16[9],xmm15[10],xmm16[10],xmm15[11],xmm16[11],xmm15[12],xmm16[12],xmm15[13],xmm16[13],xmm15[14],xmm16[14],xmm15[15],xmm16[15]
10383; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
10384; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1]
10385; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
10386; AVX512BW-FCP-NEXT:    movabsq $2323999253380730912, %r10 # imm = 0x2040810204081020
10387; AVX512BW-FCP-NEXT:    kmovq %r10, %k1
10388; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm2, %zmm3 {%k1}
10389; AVX512BW-FCP-NEXT:    vmovdqa (%rdx), %ymm4
10390; AVX512BW-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10391; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
10392; AVX512BW-FCP-NEXT:    vpshufb %ymm2, %ymm4, %ymm4
10393; AVX512BW-FCP-NEXT:    vmovdqa (%rcx), %ymm6
10394; AVX512BW-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10395; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128]
10396; AVX512BW-FCP-NEXT:    vpshufb %ymm5, %ymm6, %ymm6
10397; AVX512BW-FCP-NEXT:    vpor %ymm4, %ymm6, %ymm4
10398; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdx), %xmm17
10399; AVX512BW-FCP-NEXT:    vmovdqa64 (%rcx), %xmm19
10400; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm19[8],xmm17[8],xmm19[9],xmm17[9],xmm19[10],xmm17[10],xmm19[11],xmm17[11],xmm19[12],xmm17[12],xmm19[13],xmm17[13],xmm19[14],xmm17[14],xmm19[15],xmm17[15]
10401; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm29 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
10402; AVX512BW-FCP-NEXT:    vpshufb %xmm29, %xmm6, %xmm6
10403; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1]
10404; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm6, %zmm4
10405; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %ymm6
10406; AVX512BW-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10407; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128]
10408; AVX512BW-FCP-NEXT:    vpshufb %ymm9, %ymm6, %ymm6
10409; AVX512BW-FCP-NEXT:    vmovdqa (%rsi), %ymm7
10410; AVX512BW-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10411; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm23 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18]
10412; AVX512BW-FCP-NEXT:    vpshufb %ymm23, %ymm7, %ymm21
10413; AVX512BW-FCP-NEXT:    vporq %ymm6, %ymm21, %ymm6
10414; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %xmm21
10415; AVX512BW-FCP-NEXT:    vmovdqa64 (%rsi), %xmm22
10416; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm24 = xmm22[8],xmm21[8],xmm22[9],xmm21[9],xmm22[10],xmm21[10],xmm22[11],xmm21[11],xmm22[12],xmm21[12],xmm22[13],xmm21[13],xmm22[14],xmm21[14],xmm22[15],xmm21[15]
10417; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
10418; AVX512BW-FCP-NEXT:    vpshufb %xmm7, %xmm24, %xmm24
10419; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm24 = ymm24[0,1,0,1]
10420; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm24, %zmm14
10421; AVX512BW-FCP-NEXT:    movabsq $435749860008887046, %r10 # imm = 0x60C183060C18306
10422; AVX512BW-FCP-NEXT:    kmovq %r10, %k1
10423; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm4, %zmm14 {%k1}
10424; AVX512BW-FCP-NEXT:    movabsq $4066998693416279096, %r10 # imm = 0x3870E1C3870E1C38
10425; AVX512BW-FCP-NEXT:    kmovq %r10, %k1
10426; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm3, %zmm14 {%k1}
10427; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdx), %ymm3
10428; AVX512BW-FCP-NEXT:    vpshufb %ymm2, %ymm3, %ymm4
10429; AVX512BW-FCP-NEXT:    vmovdqa 32(%rcx), %ymm2
10430; AVX512BW-FCP-NEXT:    vpshufb %ymm5, %ymm2, %ymm5
10431; AVX512BW-FCP-NEXT:    vpor %ymm4, %ymm5, %ymm4
10432; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27]
10433; AVX512BW-FCP-NEXT:    vpshufb %ymm5, %ymm3, %ymm5
10434; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128]
10435; AVX512BW-FCP-NEXT:    vpshufb %ymm6, %ymm2, %ymm24
10436; AVX512BW-FCP-NEXT:    vporq %ymm5, %ymm24, %ymm5
10437; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3]
10438; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm4, %zmm4
10439; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm5
10440; AVX512BW-FCP-NEXT:    vpshufb %ymm9, %ymm5, %ymm9
10441; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rsi), %ymm25
10442; AVX512BW-FCP-NEXT:    vpshufb %ymm23, %ymm25, %ymm23
10443; AVX512BW-FCP-NEXT:    vporq %ymm9, %ymm23, %ymm9
10444; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128]
10445; AVX512BW-FCP-NEXT:    vpshufb %ymm6, %ymm5, %ymm23
10446; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25]
10447; AVX512BW-FCP-NEXT:    vpshufb %ymm6, %ymm25, %ymm24
10448; AVX512BW-FCP-NEXT:    vporq %ymm23, %ymm24, %ymm23
10449; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm23 = ymm23[2,3,2,3]
10450; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm23, %zmm9, %zmm9
10451; AVX512BW-FCP-NEXT:    movabsq $3485998880071096368, %r10 # imm = 0x3060C183060C1830
10452; AVX512BW-FCP-NEXT:    kmovq %r10, %k1
10453; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm4, %zmm9 {%k1}
10454; AVX512BW-FCP-NEXT:    vmovdqa 32(%r9), %ymm4
10455; AVX512BW-FCP-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
10456; AVX512BW-FCP-NEXT:    vmovdqa64 32(%r8), %ymm28
10457; AVX512BW-FCP-NEXT:    vpshufb %ymm1, %ymm28, %ymm1
10458; AVX512BW-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
10459; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31]
10460; AVX512BW-FCP-NEXT:    vpshufb %ymm1, %ymm28, %ymm1
10461; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128]
10462; AVX512BW-FCP-NEXT:    vpshufb %ymm6, %ymm4, %ymm23
10463; AVX512BW-FCP-NEXT:    vporq %ymm1, %ymm23, %ymm1
10464; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
10465; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm1
10466; AVX512BW-FCP-NEXT:    vmovdqa 32(%rax), %ymm6
10467; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm23 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10]
10468; AVX512BW-FCP-NEXT:    vpermw %ymm6, %ymm23, %ymm23
10469; AVX512BW-FCP-NEXT:    vpshufb %ymm20, %ymm6, %ymm20
10470; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm23, %zmm20, %zmm20
10471; AVX512BW-FCP-NEXT:    movabsq $145249953336295682, %r10 # imm = 0x204081020408102
10472; AVX512BW-FCP-NEXT:    kmovq %r10, %k2
10473; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm20, %zmm1 {%k2}
10474; AVX512BW-FCP-NEXT:    movabsq $-4357498600088870461, %r10 # imm = 0xC3870E1C3870E1C3
10475; AVX512BW-FCP-NEXT:    kmovq %r10, %k2
10476; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm1, %zmm9 {%k2}
10477; AVX512BW-FCP-NEXT:    vmovdqa64 (%rcx), %zmm23
10478; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,2,3],zmm23[4,5,6,7]
10479; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,zmm1[23],zero,zmm1[21,22,23,26],zero,zmm1[24],zero,zmm1[28,29,26,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,59],zero,zero,zero,zero,zmm1[62],zero,zmm1[60],zero,zero,zero,zero,zmm1[63],zero,zmm1[61],zero
10480; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdx), %zmm24
10481; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm24[4,5,6,7]
10482; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm2[25],zero,zmm2[23],zero,zero,zero,zero,zmm2[26],zero,zmm2[24],zero,zero,zero,zero,zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[62],zero,zmm2[60],zero,zero,zero,zero,zmm2[63],zero,zmm2[61],zero,zero
10483; AVX512BW-FCP-NEXT:    vporq %zmm1, %zmm2, %zmm1
10484; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm26
10485; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm25[0,1,2,3],zmm26[4,5,6,7]
10486; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,zmm2[23],zero,zero,zero,zero,zmm2[26],zero,zmm2[24],zero,zero,zero,zero,zmm2[27],zero,zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,60,61,62],zero,zmm2[60],zero,zmm2[62,63,62,63],zero,zmm2[61],zero,zmm2[63,60,61]
10487; AVX512BW-FCP-NEXT:    vmovdqa64 (%rsi), %zmm27
10488; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm5[0,1,2,3],zmm27[4,5,6,7]
10489; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm3[23],zero,zero,zero,zero,zmm3[26],zero,zmm3[24],zero,zero,zero,zero,zmm3[27],zero,zmm3[25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zmm3[62],zero,zmm3[60],zero,zero,zero,zero,zmm3[63],zero,zmm3[61],zero,zero,zero
10490; AVX512BW-FCP-NEXT:    vporq %zmm2, %zmm3, %zmm2
10491; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7]
10492; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm20 = zmm2[2,3,2,3,6,7,6,7]
10493; AVX512BW-FCP-NEXT:    movabsq $1742999440035548184, %r10 # imm = 0x183060C183060C18
10494; AVX512BW-FCP-NEXT:    kmovq %r10, %k2
10495; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm1, %zmm20 {%k2}
10496; AVX512BW-FCP-NEXT:    vmovdqa64 (%r9), %zmm25
10497; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm28[0,1,2,3],zmm25[4,5,6,7]
10498; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,zmm1[23],zero,zmm1[23,24,25,26],zero,zmm1[24],zero,zmm1[30,31,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,61],zero,zmm1[59],zero,zero,zero,zero,zmm1[62],zero,zmm1[60],zero,zero,zero,zero,zmm1[63],zero
10499; AVX512BW-FCP-NEXT:    vmovdqa64 (%r8), %zmm28
10500; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm4[0,1,2,3],zmm28[4,5,6,7]
10501; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[25],zero,zmm2[23],zero,zero,zero,zero,zmm2[26],zero,zmm2[24],zero,zero,zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm2[59],zero,zero,zero,zero,zmm2[62],zero,zmm2[60],zero,zero,zero,zero,zmm2[63],zero,zmm2[61]
10502; AVX512BW-FCP-NEXT:    vporq %zmm1, %zmm2, %zmm1
10503; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7]
10504; AVX512BW-FCP-NEXT:    movabsq $6971997760142192736, %r10 # imm = 0x60C183060C183060
10505; AVX512BW-FCP-NEXT:    kmovq %r10, %k2
10506; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm1, %zmm20 {%k2}
10507; AVX512BW-FCP-NEXT:    vmovdqa64 (%rax), %zmm31
10508; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,62,61,62,63,63,62,62,63,62,61,62,63,63,62,62,63]
10509; AVX512BW-FCP-NEXT:    vpermi2w %zmm31, %zmm6, %zmm1
10510; AVX512BW-FCP-NEXT:    movabsq $-9150747060186627967, %rax # imm = 0x8102040810204081
10511; AVX512BW-FCP-NEXT:    kmovq %rax, %k3
10512; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm1, %zmm20 {%k3}
10513; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdx), %xmm5
10514; AVX512BW-FCP-NEXT:    vmovdqa 32(%rcx), %xmm4
10515; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
10516; AVX512BW-FCP-NEXT:    vpshufb %xmm13, %xmm4, %xmm1
10517; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
10518; AVX512BW-FCP-NEXT:    vpshufb %xmm12, %xmm5, %xmm2
10519; AVX512BW-FCP-NEXT:    vpor %xmm1, %xmm2, %xmm1
10520; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
10521; AVX512BW-FCP-NEXT:    vpshufb %xmm29, %xmm2, %xmm2
10522; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm2, %zmm1, %zmm3
10523; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdi), %xmm2
10524; AVX512BW-FCP-NEXT:    vmovdqa 32(%rsi), %xmm1
10525; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
10526; AVX512BW-FCP-NEXT:    vpshufb %xmm11, %xmm1, %xmm29
10527; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
10528; AVX512BW-FCP-NEXT:    vpshufb %xmm8, %xmm2, %xmm30
10529; AVX512BW-FCP-NEXT:    vporq %xmm29, %xmm30, %xmm29
10530; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm30 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
10531; AVX512BW-FCP-NEXT:    vpshufb %xmm7, %xmm30, %xmm30
10532; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm30, %zmm29, %zmm29
10533; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm3 = zmm3[0,1,0,1,4,5,4,5]
10534; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm29 = zmm29[0,1,0,1,4,5,4,5]
10535; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm3, %zmm29 {%k2}
10536; AVX512BW-FCP-NEXT:    vmovdqa64 32(%r9), %xmm30
10537; AVX512BW-FCP-NEXT:    vmovdqa 32(%r8), %xmm3
10538; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
10539; AVX512BW-FCP-NEXT:    vpshufb %xmm10, %xmm30, %xmm0
10540; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128]
10541; AVX512BW-FCP-NEXT:    vpshufb %xmm7, %xmm3, %xmm18
10542; AVX512BW-FCP-NEXT:    vporq %xmm0, %xmm18, %xmm0
10543; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm18 = xmm3[8],xmm30[8],xmm3[9],xmm30[9],xmm3[10],xmm30[10],xmm3[11],xmm30[11],xmm3[12],xmm30[12],xmm3[13],xmm30[13],xmm3[14],xmm30[14],xmm3[15],xmm30[15]
10544; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
10545; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm18, %zmm0, %zmm0
10546; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm18 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,52,53,52,53,53,54,53,54,52,53,52,53,53,54,53,54]
10547; AVX512BW-FCP-NEXT:    vpermi2w %zmm31, %zmm6, %zmm18
10548; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
10549; AVX512BW-FCP-NEXT:    movabsq $290499906672591364, %rax # imm = 0x408102040810204
10550; AVX512BW-FCP-NEXT:    kmovq %rax, %k3
10551; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm18, %zmm0 {%k3}
10552; AVX512BW-FCP-NEXT:    movabsq $-8714997200177740921, %rax # imm = 0x870E1C3870E1C387
10553; AVX512BW-FCP-NEXT:    kmovq %rax, %k3
10554; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm29 {%k3}
10555; AVX512BW-FCP-NEXT:    vpshufb %xmm13, %xmm19, %xmm0
10556; AVX512BW-FCP-NEXT:    vpshufb %xmm12, %xmm17, %xmm6
10557; AVX512BW-FCP-NEXT:    vpor %xmm0, %xmm6, %xmm0
10558; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm17[0],xmm19[0],xmm17[1],xmm19[1],xmm17[2],xmm19[2],xmm17[3],xmm19[3],xmm17[4],xmm19[4],xmm17[5],xmm19[5],xmm17[6],xmm19[6],xmm17[7],xmm19[7]
10559; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
10560; AVX512BW-FCP-NEXT:    vpshufb %xmm12, %xmm6, %xmm6
10561; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm6, %zmm0
10562; AVX512BW-FCP-NEXT:    vpshufb %xmm11, %xmm22, %xmm6
10563; AVX512BW-FCP-NEXT:    vpshufb %xmm8, %xmm21, %xmm8
10564; AVX512BW-FCP-NEXT:    vpor %xmm6, %xmm8, %xmm6
10565; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm21[0],xmm22[0],xmm21[1],xmm22[1],xmm21[2],xmm22[2],xmm21[3],xmm22[3],xmm21[4],xmm22[4],xmm21[5],xmm22[5],xmm21[6],xmm22[6],xmm21[7],xmm22[7]
10566; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
10567; AVX512BW-FCP-NEXT:    vpshufb %xmm11, %xmm8, %xmm8
10568; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm6, %zmm8, %zmm6
10569; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm8 = zmm0[0,1,0,1,4,5,4,5]
10570; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm6[0,1,0,1,4,5,4,5]
10571; AVX512BW-FCP-NEXT:    movabsq $871499720017774092, %rax # imm = 0xC183060C183060C
10572; AVX512BW-FCP-NEXT:    kmovq %rax, %k3
10573; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm8, %zmm0 {%k3}
10574; AVX512BW-FCP-NEXT:    vpshufb %xmm10, %xmm16, %xmm6
10575; AVX512BW-FCP-NEXT:    vpshufb %xmm7, %xmm15, %xmm7
10576; AVX512BW-FCP-NEXT:    vpor %xmm6, %xmm7, %xmm6
10577; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm15[0],xmm16[0],xmm15[1],xmm16[1],xmm15[2],xmm16[2],xmm15[3],xmm16[3],xmm15[4],xmm16[4],xmm15[5],xmm16[5],xmm15[6],xmm16[6],xmm15[7],xmm16[7]
10578; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
10579; AVX512BW-FCP-NEXT:    vpshufb %xmm10, %xmm7, %xmm7
10580; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm6, %zmm7, %zmm6
10581; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm6 = zmm6[0,1,0,1,4,5,4,5]
10582; AVX512BW-FCP-NEXT:    vinserti64x4 $1, (%rsp), %zmm31, %zmm8 # 32-byte Folded Reload
10583; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm7 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20]
10584; AVX512BW-FCP-NEXT:    vpermw %zmm8, %zmm7, %zmm7
10585; AVX512BW-FCP-NEXT:    movabsq $4647998506761461824, %rax # imm = 0x4081020408102040
10586; AVX512BW-FCP-NEXT:    kmovq %rax, %k3
10587; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm7, %zmm6 {%k3}
10588; AVX512BW-FCP-NEXT:    movabsq $8133997386832558192, %rax # imm = 0x70E1C3870E1C3870
10589; AVX512BW-FCP-NEXT:    kmovq %rax, %k3
10590; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm6, %zmm0 {%k3}
10591; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
10592; AVX512BW-FCP-NEXT:    vpshufb %xmm11, %xmm1, %xmm1
10593; AVX512BW-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
10594; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm13[28],zero,ymm13[30,31,30,31],zero,ymm13[29],zero,ymm13[31,28,29]
10595; AVX512BW-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
10596; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm15[30],zero,ymm15[28],zero,zero,zero,zero,ymm15[31],zero,ymm15[29],zero,zero,zero
10597; AVX512BW-FCP-NEXT:    vpor %ymm2, %ymm6, %ymm2
10598; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm2[2,3,2,3],zmm1[0,1,0,1]
10599; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
10600; AVX512BW-FCP-NEXT:    vpshufb %xmm12, %xmm2, %xmm2
10601; AVX512BW-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
10602; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29],zero
10603; AVX512BW-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10604; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm7[30],zero,ymm7[28],zero,zero,zero,zero,ymm7[31],zero,ymm7[29],zero,zero
10605; AVX512BW-FCP-NEXT:    vpor %ymm4, %ymm5, %ymm4
10606; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm4[2,3,2,3],zmm2[0,1,0,1]
10607; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm1, %zmm2 {%k1}
10608; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm30[0],xmm3[1],xmm30[1],xmm3[2],xmm30[2],xmm3[3],xmm30[3],xmm3[4],xmm30[4],xmm3[5],xmm30[5],xmm3[6],xmm30[6],xmm3[7],xmm30[7]
10609; AVX512BW-FCP-NEXT:    vpshufb %xmm10, %xmm1, %xmm1
10610; AVX512BW-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
10611; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm5[27],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero
10612; AVX512BW-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
10613; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[27],zero,zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29]
10614; AVX512BW-FCP-NEXT:    vpor %ymm3, %ymm4, %ymm3
10615; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm3[2,3,2,3],zmm1[0,1,0,1]
10616; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm3 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16]
10617; AVX512BW-FCP-NEXT:    vpermw %zmm31, %zmm3, %zmm3
10618; AVX512BW-FCP-NEXT:    movabsq $580999813345182728, %rax # imm = 0x810204081020408
10619; AVX512BW-FCP-NEXT:    kmovq %rax, %k1
10620; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm3, %zmm1 {%k1}
10621; AVX512BW-FCP-NEXT:    movabsq $1016749673354069774, %rax # imm = 0xE1C3870E1C3870E
10622; AVX512BW-FCP-NEXT:    kmovq %rax, %k1
10623; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm1, %zmm2 {%k1}
10624; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm26, %zmm1
10625; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm1[18,19,20,21],zero,zmm1[19],zero,zmm1[21,20,21,22],zero,zmm1[20],zero,zmm1[22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm1[55],zero,zero,zero,zero,zmm1[58],zero,zmm1[56],zero,zero,zero,zero,zmm1[59],zero
10626; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm27, %zmm3
10627; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm3[21],zero,zmm3[19],zero,zero,zero,zero,zmm3[22],zero,zmm3[20],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm3[55],zero,zero,zero,zero,zmm3[58],zero,zmm3[56],zero,zero,zero,zero,zmm3[59],zero,zmm3[57]
10628; AVX512BW-FCP-NEXT:    vporq %zmm1, %zmm3, %zmm1
10629; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm24, %zmm3
10630; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm3[18],zero,zmm3[18,19,20,21],zero,zmm3[19],zero,zmm3[25,26,27,22],zero,zmm3[20],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm3[56,57],zero,zmm3[55],zero,zmm3[53,54,55,58],zero,zmm3[56],zero,zmm3[60,61,58,59]
10631; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm23, %zmm4
10632; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm4[18],zero,zero,zero,zero,zmm4[21],zero,zmm4[19],zero,zero,zero,zero,zmm4[22],zero,zmm4[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm4[57],zero,zmm4[55],zero,zero,zero,zero,zmm4[58],zero,zmm4[56],zero,zero,zero,zero
10633; AVX512BW-FCP-NEXT:    vporq %zmm3, %zmm4, %zmm3
10634; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7]
10635; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm3 = zmm3[2,3,2,3,6,7,6,7]
10636; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm1, %zmm3 {%k2}
10637; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm28, %zmm1
10638; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm1[20],zero,zmm1[18],zero,zmm1[20,21,20,21],zero,zmm1[19],zero,zmm1[19,20,21,22],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm1[56,57,56,57],zero,zmm1[55],zero,zmm1[55,56,57,58],zero,zmm1[56],zero,zmm1[62,63]
10639; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm25, %zmm4
10640; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm4[20],zero,zmm4[18],zero,zero,zero,zero,zmm4[21],zero,zmm4[19],zero,zero,zero,zero,zmm4[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm4[57],zero,zmm4[55],zero,zero,zero,zero,zmm4[58],zero,zmm4[56],zero,zero
10641; AVX512BW-FCP-NEXT:    vporq %zmm1, %zmm4, %zmm1
10642; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm4 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28]
10643; AVX512BW-FCP-NEXT:    vpermw %zmm8, %zmm4, %zmm4
10644; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7]
10645; AVX512BW-FCP-NEXT:    movabsq $1161999626690365456, %rax # imm = 0x1020408102040810
10646; AVX512BW-FCP-NEXT:    kmovq %rax, %k1
10647; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm4, %zmm1 {%k1}
10648; AVX512BW-FCP-NEXT:    movabsq $2033499346708139548, %rax # imm = 0x1C3870E1C3870E1C
10649; AVX512BW-FCP-NEXT:    kmovq %rax, %k1
10650; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm1, %zmm3 {%k1}
10651; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
10652; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, 128(%rax)
10653; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, 320(%rax)
10654; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, 192(%rax)
10655; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
10656; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm29, 256(%rax)
10657; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, 64(%rax)
10658; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm20, 384(%rax)
10659; AVX512BW-FCP-NEXT:    addq $104, %rsp
10660; AVX512BW-FCP-NEXT:    vzeroupper
10661; AVX512BW-FCP-NEXT:    retq
10662;
10663; AVX512DQ-BW-LABEL: store_i8_stride7_vf64:
10664; AVX512DQ-BW:       # %bb.0:
10665; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
10666; AVX512DQ-BW-NEXT:    vmovdqa (%rax), %ymm13
10667; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} ymm26 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31]
10668; AVX512DQ-BW-NEXT:    vpshufb %ymm26, %ymm13, %ymm0
10669; AVX512DQ-BW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6]
10670; AVX512DQ-BW-NEXT:    # ymm1 = mem[0,1,0,1]
10671; AVX512DQ-BW-NEXT:    vpermw %ymm13, %ymm1, %ymm1
10672; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
10673; AVX512DQ-BW-NEXT:    vmovdqa (%r9), %ymm9
10674; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} ymm17 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128]
10675; AVX512DQ-BW-NEXT:    vpshufb %ymm17, %ymm9, %ymm1
10676; AVX512DQ-BW-NEXT:    vmovdqa (%r8), %ymm10
10677; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} ymm21 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128]
10678; AVX512DQ-BW-NEXT:    vpshufb %ymm21, %ymm10, %ymm2
10679; AVX512DQ-BW-NEXT:    vpor %ymm1, %ymm2, %ymm2
10680; AVX512DQ-BW-NEXT:    vmovdqa (%r9), %xmm1
10681; AVX512DQ-BW-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10682; AVX512DQ-BW-NEXT:    vmovdqa (%r8), %xmm12
10683; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm12[8],xmm1[8],xmm12[9],xmm1[9],xmm12[10],xmm1[10],xmm12[11],xmm1[11],xmm12[12],xmm1[12],xmm12[13],xmm1[13],xmm12[14],xmm1[14],xmm12[15],xmm1[15]
10684; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
10685; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1]
10686; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm4, %zmm8
10687; AVX512DQ-BW-NEXT:    movabsq $2323999253380730912, %r10 # imm = 0x2040810204081020
10688; AVX512DQ-BW-NEXT:    kmovq %r10, %k1
10689; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm0, %zmm8 {%k1}
10690; AVX512DQ-BW-NEXT:    vmovdqa (%rdx), %ymm14
10691; AVX512DQ-BW-NEXT:    vmovdqa {{.*#+}} ymm0 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
10692; AVX512DQ-BW-NEXT:    vpshufb %ymm0, %ymm14, %ymm2
10693; AVX512DQ-BW-NEXT:    vmovdqa (%rcx), %ymm15
10694; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} ymm20 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128]
10695; AVX512DQ-BW-NEXT:    vpshufb %ymm20, %ymm15, %ymm4
10696; AVX512DQ-BW-NEXT:    vpor %ymm2, %ymm4, %ymm2
10697; AVX512DQ-BW-NEXT:    vmovdqa (%rdx), %xmm4
10698; AVX512DQ-BW-NEXT:    vmovdqa (%rcx), %xmm5
10699; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
10700; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
10701; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1]
10702; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm6, %zmm22
10703; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %ymm18
10704; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} ymm24 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128]
10705; AVX512DQ-BW-NEXT:    vpshufb %ymm24, %ymm18, %ymm2
10706; AVX512DQ-BW-NEXT:    vmovdqa64 (%rsi), %ymm19
10707; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} ymm25 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18]
10708; AVX512DQ-BW-NEXT:    vpshufb %ymm25, %ymm19, %ymm6
10709; AVX512DQ-BW-NEXT:    vpor %ymm2, %ymm6, %ymm2
10710; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %xmm6
10711; AVX512DQ-BW-NEXT:    vmovdqa (%rsi), %xmm7
10712; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm23 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15]
10713; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm23 = xmm23[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
10714; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm23 = ymm23[0,1,0,1]
10715; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm23, %zmm3
10716; AVX512DQ-BW-NEXT:    movabsq $435749860008887046, %r10 # imm = 0x60C183060C18306
10717; AVX512DQ-BW-NEXT:    kmovq %r10, %k1
10718; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm22, %zmm3 {%k1}
10719; AVX512DQ-BW-NEXT:    movabsq $4066998693416279096, %r10 # imm = 0x3870E1C3870E1C38
10720; AVX512DQ-BW-NEXT:    kmovq %r10, %k1
10721; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm8, %zmm3 {%k1}
10722; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rdx), %ymm29
10723; AVX512DQ-BW-NEXT:    vpshufb %ymm0, %ymm29, %ymm0
10724; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rcx), %ymm30
10725; AVX512DQ-BW-NEXT:    vpshufb %ymm20, %ymm30, %ymm8
10726; AVX512DQ-BW-NEXT:    vpor %ymm0, %ymm8, %ymm0
10727; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm20 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27]
10728; AVX512DQ-BW-NEXT:    vpshufb %ymm20, %ymm29, %ymm8
10729; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm22 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128]
10730; AVX512DQ-BW-NEXT:    vpshufb %ymm22, %ymm30, %ymm23
10731; AVX512DQ-BW-NEXT:    vporq %ymm8, %ymm23, %ymm8
10732; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
10733; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm0
10734; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rsi), %ymm28
10735; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rdi), %ymm16
10736; AVX512DQ-BW-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm16[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
10737; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm8 = ymm8[0,0,1,1,4,4,5,5]
10738; AVX512DQ-BW-NEXT:    vpbroadcastd {{.*#+}} ymm23 = [5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6]
10739; AVX512DQ-BW-NEXT:    movl $676341840, %r10d # imm = 0x28502850
10740; AVX512DQ-BW-NEXT:    kmovd %r10d, %k1
10741; AVX512DQ-BW-NEXT:    vpshufb %ymm23, %ymm28, %ymm8 {%k1}
10742; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
10743; AVX512DQ-BW-NEXT:    vpshufb %ymm24, %ymm16, %ymm24
10744; AVX512DQ-BW-NEXT:    vpshufb %ymm25, %ymm28, %ymm25
10745; AVX512DQ-BW-NEXT:    vporq %ymm24, %ymm25, %ymm24
10746; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm8, %zmm24, %zmm8
10747; AVX512DQ-BW-NEXT:    movabsq $3485998880071096368, %r10 # imm = 0x3060C183060C1830
10748; AVX512DQ-BW-NEXT:    kmovq %r10, %k2
10749; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm0, %zmm8 {%k2}
10750; AVX512DQ-BW-NEXT:    vmovdqa64 32(%r9), %ymm31
10751; AVX512DQ-BW-NEXT:    vpshufb %ymm17, %ymm31, %ymm17
10752; AVX512DQ-BW-NEXT:    vmovdqa 32(%r8), %ymm1
10753; AVX512DQ-BW-NEXT:    vpshufb %ymm21, %ymm1, %ymm21
10754; AVX512DQ-BW-NEXT:    vporq %ymm17, %ymm21, %ymm17
10755; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm24 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31]
10756; AVX512DQ-BW-NEXT:    vpshufb %ymm24, %ymm1, %ymm21
10757; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm25 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128]
10758; AVX512DQ-BW-NEXT:    vpshufb %ymm25, %ymm31, %ymm27
10759; AVX512DQ-BW-NEXT:    vporq %ymm21, %ymm27, %ymm21
10760; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm21 = ymm21[2,3,2,3]
10761; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm21, %zmm17, %zmm21
10762; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rax), %ymm17
10763; AVX512DQ-BW-NEXT:    vpshufb %ymm26, %ymm17, %ymm27
10764; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm26 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28]
10765; AVX512DQ-BW-NEXT:    vpermw %ymm17, %ymm26, %ymm11
10766; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm11, %zmm27, %zmm11
10767; AVX512DQ-BW-NEXT:    movabsq $145249953336295682, %r10 # imm = 0x204081020408102
10768; AVX512DQ-BW-NEXT:    kmovq %r10, %k3
10769; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm11, %zmm21 {%k3}
10770; AVX512DQ-BW-NEXT:    movabsq $-4357498600088870461, %r10 # imm = 0xC3870E1C3870E1C3
10771; AVX512DQ-BW-NEXT:    kmovq %r10, %k3
10772; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm21, %zmm8 {%k3}
10773; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} ymm11 = ymm16[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14]
10774; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm11 = ymm11[2,2,3,3,6,6,7,7]
10775; AVX512DQ-BW-NEXT:    movl $338170920, %r10d # imm = 0x14281428
10776; AVX512DQ-BW-NEXT:    kmovd %r10d, %k4
10777; AVX512DQ-BW-NEXT:    vpbroadcastd {{.*#+}} ymm27 = [13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14]
10778; AVX512DQ-BW-NEXT:    vpshufb %ymm27, %ymm28, %ymm11 {%k4}
10779; AVX512DQ-BW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128]
10780; AVX512DQ-BW-NEXT:    # ymm2 = mem[0,1,0,1]
10781; AVX512DQ-BW-NEXT:    vpshufb %ymm2, %ymm28, %ymm21
10782; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} ymm28 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25]
10783; AVX512DQ-BW-NEXT:    vpshufb %ymm28, %ymm16, %ymm16
10784; AVX512DQ-BW-NEXT:    vporq %ymm21, %ymm16, %ymm16
10785; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm11, %zmm16, %zmm16
10786; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} ymm11 = ymm29[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
10787; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm21 = ymm11[0,2,3,3,4,6,7,7]
10788; AVX512DQ-BW-NEXT:    vpbroadcastq {{.*#+}} ymm11 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12]
10789; AVX512DQ-BW-NEXT:    vpshufb %ymm11, %ymm30, %ymm0
10790; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm21, %ymm0 {%k1}
10791; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm21 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm29[24,25],zero,ymm29[23],zero,ymm29[21,22,23,26],zero,ymm29[24],zero,ymm29[28,29,26,27]
10792; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm29 = ymm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm30[25],zero,ymm30[23],zero,zero,zero,zero,ymm30[26],zero,ymm30[24],zero,zero,zero,zero
10793; AVX512DQ-BW-NEXT:    vporq %ymm21, %ymm29, %ymm21
10794; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm21, %zmm0
10795; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm16 = zmm16[2,3,2,3,6,7,6,7]
10796; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7]
10797; AVX512DQ-BW-NEXT:    movabsq $1742999440035548184, %r10 # imm = 0x183060C183060C18
10798; AVX512DQ-BW-NEXT:    kmovq %r10, %k3
10799; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm0, %zmm16 {%k3}
10800; AVX512DQ-BW-NEXT:    vmovdqa64 (%r9), %zmm0
10801; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm0[4,5,6,7]
10802; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,zmm1[23],zero,zmm1[23,24,25,26],zero,zmm1[24],zero,zmm1[30,31,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,61],zero,zmm1[59],zero,zero,zero,zero,zmm1[62],zero,zmm1[60],zero,zero,zero,zero,zmm1[63],zero
10803; AVX512DQ-BW-NEXT:    vmovdqa64 (%r8), %zmm29
10804; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm21 = zmm31[0,1,2,3],zmm29[4,5,6,7]
10805; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} zmm21 = zmm21[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm21[25],zero,zmm21[23],zero,zero,zero,zero,zmm21[26],zero,zmm21[24],zero,zero,zmm21[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm21[59],zero,zero,zero,zero,zmm21[62],zero,zmm21[60],zero,zero,zero,zero,zmm21[63],zero,zmm21[61]
10806; AVX512DQ-BW-NEXT:    vporq %zmm1, %zmm21, %zmm1
10807; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7]
10808; AVX512DQ-BW-NEXT:    movabsq $6971997760142192736, %r10 # imm = 0x60C183060C183060
10809; AVX512DQ-BW-NEXT:    kmovq %r10, %k3
10810; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm1, %zmm16 {%k3}
10811; AVX512DQ-BW-NEXT:    vmovdqa64 (%rax), %zmm21
10812; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,62,61,62,63,63,62,62,63,62,61,62,63,63,62,62,63]
10813; AVX512DQ-BW-NEXT:    vpermi2w %zmm21, %zmm17, %zmm1
10814; AVX512DQ-BW-NEXT:    movabsq $-9150747060186627967, %rax # imm = 0x8102040810204081
10815; AVX512DQ-BW-NEXT:    kmovq %rax, %k5
10816; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm1, %zmm16 {%k5}
10817; AVX512DQ-BW-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm18[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
10818; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5]
10819; AVX512DQ-BW-NEXT:    vpshufb %ymm23, %ymm19, %ymm1 {%k1}
10820; AVX512DQ-BW-NEXT:    vpshufb %ymm2, %ymm19, %ymm2
10821; AVX512DQ-BW-NEXT:    vpshufb %ymm28, %ymm18, %ymm23
10822; AVX512DQ-BW-NEXT:    vporq %ymm2, %ymm23, %ymm2
10823; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
10824; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdx), %zmm2
10825; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm14, %zmm2, %zmm2
10826; AVX512DQ-BW-NEXT:    vpshufb %zmm20, %zmm2, %zmm2
10827; AVX512DQ-BW-NEXT:    vmovdqa64 (%rcx), %zmm20
10828; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm15, %zmm20, %zmm20
10829; AVX512DQ-BW-NEXT:    vpshufb %zmm22, %zmm20, %zmm20
10830; AVX512DQ-BW-NEXT:    vporq %zmm2, %zmm20, %zmm2
10831; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7]
10832; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm20 = zmm2[2,3,2,3,6,7,6,7]
10833; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm1, %zmm20 {%k3}
10834; AVX512DQ-BW-NEXT:    vmovdqa64 32(%r9), %xmm22
10835; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm10, %zmm29, %zmm1
10836; AVX512DQ-BW-NEXT:    vpshufb %zmm24, %zmm1, %zmm1
10837; AVX512DQ-BW-NEXT:    vmovdqa64 32(%r8), %xmm23
10838; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm0
10839; AVX512DQ-BW-NEXT:    vpshufb %zmm25, %zmm0, %zmm2
10840; AVX512DQ-BW-NEXT:    vmovdqa 32(%rdx), %xmm0
10841; AVX512DQ-BW-NEXT:    vporq %zmm1, %zmm2, %zmm1
10842; AVX512DQ-BW-NEXT:    vmovdqa 32(%rcx), %xmm2
10843; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7]
10844; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm13, %zmm21, %zmm13
10845; AVX512DQ-BW-NEXT:    vpermw %zmm13, %zmm26, %zmm24
10846; AVX512DQ-BW-NEXT:    movabsq $1161999626690365456, %rax # imm = 0x1020408102040810
10847; AVX512DQ-BW-NEXT:    kmovq %rax, %k5
10848; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm24, %zmm1 {%k5}
10849; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rdi), %xmm24
10850; AVX512DQ-BW-NEXT:    movabsq $2033499346708139548, %rax # imm = 0x1C3870E1C3870E1C
10851; AVX512DQ-BW-NEXT:    kmovq %rax, %k5
10852; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm1, %zmm20 {%k5}
10853; AVX512DQ-BW-NEXT:    vmovdqa 32(%rsi), %xmm1
10854; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} ymm18 = ymm18[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14]
10855; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm25 = ymm18[2,2,3,3,6,6,7,7]
10856; AVX512DQ-BW-NEXT:    vpshufb %ymm27, %ymm19, %ymm25 {%k4}
10857; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm19 = xmm24[0],xmm1[0],xmm24[1],xmm1[1],xmm24[2],xmm1[2],xmm24[3],xmm1[3],xmm24[4],xmm1[4],xmm24[5],xmm1[5],xmm24[6],xmm1[6],xmm24[7],xmm1[7]
10858; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} xmm18 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
10859; AVX512DQ-BW-NEXT:    vpshufb %xmm18, %xmm19, %xmm19
10860; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm25 = zmm25[2,3,2,3],zmm19[0,1,0,1]
10861; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm26 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
10862; AVX512DQ-BW-NEXT:    vpshufb %ymm11, %ymm15, %ymm11
10863; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} xmm19 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
10864; AVX512DQ-BW-NEXT:    vpshufb %xmm19, %xmm26, %xmm15
10865; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
10866; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm14 = ymm14[0,2,3,3,4,6,7,7]
10867; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm14, %ymm11 {%k1}
10868; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm14 = zmm11[2,3,2,3],zmm15[0,1,0,1]
10869; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm25, %zmm14 {%k2}
10870; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7]
10871; AVX512DQ-BW-NEXT:    vmovdqa {{.*#+}} xmm15 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
10872; AVX512DQ-BW-NEXT:    vpshufb %xmm15, %xmm11, %xmm11
10873; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm9[27],zero,zero,zero,zero,ymm9[30],zero,ymm9[28],zero,zero,zero,zero,ymm9[31],zero
10874; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm10[27],zero,zero,zero,zero,ymm10[30],zero,ymm10[28],zero,zero,zero,zero,ymm10[31],zero,ymm10[29]
10875; AVX512DQ-BW-NEXT:    vpor %ymm9, %ymm10, %ymm9
10876; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm9 = zmm9[2,3,2,3],zmm11[0,1,0,1]
10877; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm10 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16]
10878; AVX512DQ-BW-NEXT:    vpermw %zmm21, %zmm10, %zmm10
10879; AVX512DQ-BW-NEXT:    movabsq $580999813345182728, %rax # imm = 0x810204081020408
10880; AVX512DQ-BW-NEXT:    kmovq %rax, %k1
10881; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm10, %zmm9 {%k1}
10882; AVX512DQ-BW-NEXT:    movabsq $1016749673354069774, %rax # imm = 0xE1C3870E1C3870E
10883; AVX512DQ-BW-NEXT:    kmovq %rax, %k1
10884; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm9, %zmm14 {%k1}
10885; AVX512DQ-BW-NEXT:    vmovdqa {{.*#+}} xmm10 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
10886; AVX512DQ-BW-NEXT:    vpshufb %xmm10, %xmm2, %xmm9
10887; AVX512DQ-BW-NEXT:    vmovdqa {{.*#+}} xmm11 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
10888; AVX512DQ-BW-NEXT:    vpshufb %xmm11, %xmm0, %xmm25
10889; AVX512DQ-BW-NEXT:    vporq %xmm9, %xmm25, %xmm9
10890; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
10891; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
10892; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm9, %zmm0
10893; AVX512DQ-BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
10894; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm9
10895; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} xmm25 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
10896; AVX512DQ-BW-NEXT:    vpshufb %xmm25, %xmm24, %xmm26
10897; AVX512DQ-BW-NEXT:    vporq %xmm9, %xmm26, %xmm9
10898; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm24[8],xmm1[9],xmm24[9],xmm1[10],xmm24[10],xmm1[11],xmm24[11],xmm1[12],xmm24[12],xmm1[13],xmm24[13],xmm1[14],xmm24[14],xmm1[15],xmm24[15]
10899; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
10900; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
10901; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm1, %zmm9, %zmm1
10902; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm9 = zmm1[0,1,0,1,4,5,4,5]
10903; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm0, %zmm9 {%k3}
10904; AVX512DQ-BW-NEXT:    vmovdqa {{.*#+}} xmm0 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
10905; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm22, %xmm1
10906; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm24 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,52,53,52,53,53,54,53,54,52,53,52,53,53,54,53,54]
10907; AVX512DQ-BW-NEXT:    vpermi2w %zmm21, %zmm17, %zmm24
10908; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} xmm17 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128]
10909; AVX512DQ-BW-NEXT:    vpshufb %xmm17, %xmm23, %xmm21
10910; AVX512DQ-BW-NEXT:    vporq %xmm1, %xmm21, %xmm1
10911; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm21 = xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15]
10912; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm21 = xmm21[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
10913; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm21, %zmm1, %zmm1
10914; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[0,1,0,1,4,5,4,5]
10915; AVX512DQ-BW-NEXT:    movabsq $290499906672591364, %rax # imm = 0x408102040810204
10916; AVX512DQ-BW-NEXT:    kmovq %rax, %k1
10917; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm24, %zmm1 {%k1}
10918; AVX512DQ-BW-NEXT:    movabsq $-8714997200177740921, %rax # imm = 0x870E1C3870E1C387
10919; AVX512DQ-BW-NEXT:    kmovq %rax, %k1
10920; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm1, %zmm9 {%k1}
10921; AVX512DQ-BW-NEXT:    vpshufb %xmm10, %xmm5, %xmm1
10922; AVX512DQ-BW-NEXT:    vpshufb %xmm11, %xmm4, %xmm10
10923; AVX512DQ-BW-NEXT:    vpor %xmm1, %xmm10, %xmm1
10924; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
10925; AVX512DQ-BW-NEXT:    vpshufb %xmm19, %xmm4, %xmm4
10926; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm1, %zmm4, %zmm1
10927; AVX512DQ-BW-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
10928; AVX512DQ-BW-NEXT:    vpshufb %xmm25, %xmm6, %xmm4
10929; AVX512DQ-BW-NEXT:    vpor %xmm2, %xmm4, %xmm2
10930; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
10931; AVX512DQ-BW-NEXT:    vpshufb %xmm18, %xmm4, %xmm4
10932; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm2, %zmm4, %zmm2
10933; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[0,1,0,1,4,5,4,5]
10934; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm2 = zmm2[0,1,0,1,4,5,4,5]
10935; AVX512DQ-BW-NEXT:    movabsq $871499720017774092, %rax # imm = 0xC183060C183060C
10936; AVX512DQ-BW-NEXT:    kmovq %rax, %k1
10937; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm1, %zmm2 {%k1}
10938; AVX512DQ-BW-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
10939; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
10940; AVX512DQ-BW-NEXT:    vpshufb %xmm17, %xmm12, %xmm1
10941; AVX512DQ-BW-NEXT:    vpor %xmm0, %xmm1, %xmm0
10942; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3],xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7]
10943; AVX512DQ-BW-NEXT:    vpshufb %xmm15, %xmm1, %xmm1
10944; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
10945; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20]
10946; AVX512DQ-BW-NEXT:    vpermw %zmm13, %zmm1, %zmm1
10947; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
10948; AVX512DQ-BW-NEXT:    movabsq $4647998506761461824, %rax # imm = 0x4081020408102040
10949; AVX512DQ-BW-NEXT:    kmovq %rax, %k1
10950; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
10951; AVX512DQ-BW-NEXT:    movabsq $8133997386832558192, %rax # imm = 0x70E1C3870E1C3870
10952; AVX512DQ-BW-NEXT:    kmovq %rax, %k1
10953; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm0, %zmm2 {%k1}
10954; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
10955; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, (%rax)
10956; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm8, 320(%rax)
10957; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, 256(%rax)
10958; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm14, 192(%rax)
10959; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm20, 128(%rax)
10960; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, 64(%rax)
10961; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm16, 384(%rax)
10962; AVX512DQ-BW-NEXT:    vzeroupper
10963; AVX512DQ-BW-NEXT:    retq
10964;
10965; AVX512DQ-BW-FCP-LABEL: store_i8_stride7_vf64:
10966; AVX512DQ-BW-FCP:       # %bb.0:
10967; AVX512DQ-BW-FCP-NEXT:    subq $104, %rsp
10968; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
10969; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rax), %ymm2
10970; AVX512DQ-BW-FCP-NEXT:    vmovdqu %ymm2, (%rsp) # 32-byte Spill
10971; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm20 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31]
10972; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm20, %ymm2, %ymm0
10973; AVX512DQ-BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6]
10974; AVX512DQ-BW-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
10975; AVX512DQ-BW-FCP-NEXT:    vpermw %ymm2, %ymm1, %ymm1
10976; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm2
10977; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%r9), %ymm1
10978; AVX512DQ-BW-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10979; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128]
10980; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm0, %ymm1, %ymm3
10981; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%r8), %ymm4
10982; AVX512DQ-BW-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10983; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128]
10984; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm1, %ymm4, %ymm4
10985; AVX512DQ-BW-FCP-NEXT:    vpor %ymm3, %ymm4, %ymm3
10986; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%r9), %xmm16
10987; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%r8), %xmm15
10988; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm15[8],xmm16[8],xmm15[9],xmm16[9],xmm15[10],xmm16[10],xmm15[11],xmm16[11],xmm15[12],xmm16[12],xmm15[13],xmm16[13],xmm15[14],xmm16[14],xmm15[15],xmm16[15]
10989; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
10990; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1]
10991; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
10992; AVX512DQ-BW-FCP-NEXT:    movabsq $2323999253380730912, %r10 # imm = 0x2040810204081020
10993; AVX512DQ-BW-FCP-NEXT:    kmovq %r10, %k1
10994; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm2, %zmm3 {%k1}
10995; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdx), %ymm4
10996; AVX512DQ-BW-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10997; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
10998; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm2, %ymm4, %ymm4
10999; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rcx), %ymm6
11000; AVX512DQ-BW-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11001; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128]
11002; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm5, %ymm6, %ymm6
11003; AVX512DQ-BW-FCP-NEXT:    vpor %ymm4, %ymm6, %ymm4
11004; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdx), %xmm17
11005; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rcx), %xmm19
11006; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm19[8],xmm17[8],xmm19[9],xmm17[9],xmm19[10],xmm17[10],xmm19[11],xmm17[11],xmm19[12],xmm17[12],xmm19[13],xmm17[13],xmm19[14],xmm17[14],xmm19[15],xmm17[15]
11007; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm29 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
11008; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm29, %xmm6, %xmm6
11009; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1]
11010; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm6, %zmm4
11011; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %ymm6
11012; AVX512DQ-BW-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11013; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128]
11014; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm9, %ymm6, %ymm6
11015; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rsi), %ymm7
11016; AVX512DQ-BW-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11017; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} ymm23 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18]
11018; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm23, %ymm7, %ymm21
11019; AVX512DQ-BW-FCP-NEXT:    vporq %ymm6, %ymm21, %ymm6
11020; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %xmm21
11021; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rsi), %xmm22
11022; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm24 = xmm22[8],xmm21[8],xmm22[9],xmm21[9],xmm22[10],xmm21[10],xmm22[11],xmm21[11],xmm22[12],xmm21[12],xmm22[13],xmm21[13],xmm22[14],xmm21[14],xmm22[15],xmm21[15]
11023; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
11024; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm7, %xmm24, %xmm24
11025; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm24 = ymm24[0,1,0,1]
11026; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm24, %zmm14
11027; AVX512DQ-BW-FCP-NEXT:    movabsq $435749860008887046, %r10 # imm = 0x60C183060C18306
11028; AVX512DQ-BW-FCP-NEXT:    kmovq %r10, %k1
11029; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm4, %zmm14 {%k1}
11030; AVX512DQ-BW-FCP-NEXT:    movabsq $4066998693416279096, %r10 # imm = 0x3870E1C3870E1C38
11031; AVX512DQ-BW-FCP-NEXT:    kmovq %r10, %k1
11032; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm3, %zmm14 {%k1}
11033; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdx), %ymm3
11034; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm2, %ymm3, %ymm4
11035; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rcx), %ymm2
11036; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm5, %ymm2, %ymm5
11037; AVX512DQ-BW-FCP-NEXT:    vpor %ymm4, %ymm5, %ymm4
11038; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27]
11039; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm5, %ymm3, %ymm5
11040; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128]
11041; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm6, %ymm2, %ymm24
11042; AVX512DQ-BW-FCP-NEXT:    vporq %ymm5, %ymm24, %ymm5
11043; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3]
11044; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm4, %zmm4
11045; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm5
11046; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm9, %ymm5, %ymm9
11047; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rsi), %ymm25
11048; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm23, %ymm25, %ymm23
11049; AVX512DQ-BW-FCP-NEXT:    vporq %ymm9, %ymm23, %ymm9
11050; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128]
11051; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm6, %ymm5, %ymm23
11052; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25]
11053; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm6, %ymm25, %ymm24
11054; AVX512DQ-BW-FCP-NEXT:    vporq %ymm23, %ymm24, %ymm23
11055; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm23 = ymm23[2,3,2,3]
11056; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm23, %zmm9, %zmm9
11057; AVX512DQ-BW-FCP-NEXT:    movabsq $3485998880071096368, %r10 # imm = 0x3060C183060C1830
11058; AVX512DQ-BW-FCP-NEXT:    kmovq %r10, %k1
11059; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm4, %zmm9 {%k1}
11060; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%r9), %ymm4
11061; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
11062; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%r8), %ymm28
11063; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm1, %ymm28, %ymm1
11064; AVX512DQ-BW-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
11065; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31]
11066; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm1, %ymm28, %ymm1
11067; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128]
11068; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm6, %ymm4, %ymm23
11069; AVX512DQ-BW-FCP-NEXT:    vporq %ymm1, %ymm23, %ymm1
11070; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
11071; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm1
11072; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rax), %ymm6
11073; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm23 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10]
11074; AVX512DQ-BW-FCP-NEXT:    vpermw %ymm6, %ymm23, %ymm23
11075; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm20, %ymm6, %ymm20
11076; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm23, %zmm20, %zmm20
11077; AVX512DQ-BW-FCP-NEXT:    movabsq $145249953336295682, %r10 # imm = 0x204081020408102
11078; AVX512DQ-BW-FCP-NEXT:    kmovq %r10, %k2
11079; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm20, %zmm1 {%k2}
11080; AVX512DQ-BW-FCP-NEXT:    movabsq $-4357498600088870461, %r10 # imm = 0xC3870E1C3870E1C3
11081; AVX512DQ-BW-FCP-NEXT:    kmovq %r10, %k2
11082; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm1, %zmm9 {%k2}
11083; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rcx), %zmm23
11084; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,2,3],zmm23[4,5,6,7]
11085; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,zmm1[23],zero,zmm1[21,22,23,26],zero,zmm1[24],zero,zmm1[28,29,26,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,59],zero,zero,zero,zero,zmm1[62],zero,zmm1[60],zero,zero,zero,zero,zmm1[63],zero,zmm1[61],zero
11086; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdx), %zmm24
11087; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm24[4,5,6,7]
11088; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm2[25],zero,zmm2[23],zero,zero,zero,zero,zmm2[26],zero,zmm2[24],zero,zero,zero,zero,zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[62],zero,zmm2[60],zero,zero,zero,zero,zmm2[63],zero,zmm2[61],zero,zero
11089; AVX512DQ-BW-FCP-NEXT:    vporq %zmm1, %zmm2, %zmm1
11090; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm26
11091; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm25[0,1,2,3],zmm26[4,5,6,7]
11092; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,zmm2[23],zero,zero,zero,zero,zmm2[26],zero,zmm2[24],zero,zero,zero,zero,zmm2[27],zero,zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,60,61,62],zero,zmm2[60],zero,zmm2[62,63,62,63],zero,zmm2[61],zero,zmm2[63,60,61]
11093; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rsi), %zmm27
11094; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm5[0,1,2,3],zmm27[4,5,6,7]
11095; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm3[23],zero,zero,zero,zero,zmm3[26],zero,zmm3[24],zero,zero,zero,zero,zmm3[27],zero,zmm3[25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zmm3[62],zero,zmm3[60],zero,zero,zero,zero,zmm3[63],zero,zmm3[61],zero,zero,zero
11096; AVX512DQ-BW-FCP-NEXT:    vporq %zmm2, %zmm3, %zmm2
11097; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7]
11098; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm20 = zmm2[2,3,2,3,6,7,6,7]
11099; AVX512DQ-BW-FCP-NEXT:    movabsq $1742999440035548184, %r10 # imm = 0x183060C183060C18
11100; AVX512DQ-BW-FCP-NEXT:    kmovq %r10, %k2
11101; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm1, %zmm20 {%k2}
11102; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%r9), %zmm25
11103; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm28[0,1,2,3],zmm25[4,5,6,7]
11104; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,zmm1[23],zero,zmm1[23,24,25,26],zero,zmm1[24],zero,zmm1[30,31,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,61],zero,zmm1[59],zero,zero,zero,zero,zmm1[62],zero,zmm1[60],zero,zero,zero,zero,zmm1[63],zero
11105; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%r8), %zmm28
11106; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm4[0,1,2,3],zmm28[4,5,6,7]
11107; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[25],zero,zmm2[23],zero,zero,zero,zero,zmm2[26],zero,zmm2[24],zero,zero,zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm2[59],zero,zero,zero,zero,zmm2[62],zero,zmm2[60],zero,zero,zero,zero,zmm2[63],zero,zmm2[61]
11108; AVX512DQ-BW-FCP-NEXT:    vporq %zmm1, %zmm2, %zmm1
11109; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7]
11110; AVX512DQ-BW-FCP-NEXT:    movabsq $6971997760142192736, %r10 # imm = 0x60C183060C183060
11111; AVX512DQ-BW-FCP-NEXT:    kmovq %r10, %k2
11112; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm1, %zmm20 {%k2}
11113; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rax), %zmm31
11114; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,62,61,62,63,63,62,62,63,62,61,62,63,63,62,62,63]
11115; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm31, %zmm6, %zmm1
11116; AVX512DQ-BW-FCP-NEXT:    movabsq $-9150747060186627967, %rax # imm = 0x8102040810204081
11117; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k3
11118; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm1, %zmm20 {%k3}
11119; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdx), %xmm5
11120; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rcx), %xmm4
11121; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
11122; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm13, %xmm4, %xmm1
11123; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
11124; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm12, %xmm5, %xmm2
11125; AVX512DQ-BW-FCP-NEXT:    vpor %xmm1, %xmm2, %xmm1
11126; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
11127; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm29, %xmm2, %xmm2
11128; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm2, %zmm1, %zmm3
11129; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdi), %xmm2
11130; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rsi), %xmm1
11131; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
11132; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm11, %xmm1, %xmm29
11133; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
11134; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm8, %xmm2, %xmm30
11135; AVX512DQ-BW-FCP-NEXT:    vporq %xmm29, %xmm30, %xmm29
11136; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm30 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
11137; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm7, %xmm30, %xmm30
11138; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm30, %zmm29, %zmm29
11139; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm3 = zmm3[0,1,0,1,4,5,4,5]
11140; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm29 = zmm29[0,1,0,1,4,5,4,5]
11141; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm3, %zmm29 {%k2}
11142; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%r9), %xmm30
11143; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%r8), %xmm3
11144; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
11145; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm10, %xmm30, %xmm0
11146; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128]
11147; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm7, %xmm3, %xmm18
11148; AVX512DQ-BW-FCP-NEXT:    vporq %xmm0, %xmm18, %xmm0
11149; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm18 = xmm3[8],xmm30[8],xmm3[9],xmm30[9],xmm3[10],xmm30[10],xmm3[11],xmm30[11],xmm3[12],xmm30[12],xmm3[13],xmm30[13],xmm3[14],xmm30[14],xmm3[15],xmm30[15]
11150; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
11151; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm18, %zmm0, %zmm0
11152; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm18 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,52,53,52,53,53,54,53,54,52,53,52,53,53,54,53,54]
11153; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm31, %zmm6, %zmm18
11154; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
11155; AVX512DQ-BW-FCP-NEXT:    movabsq $290499906672591364, %rax # imm = 0x408102040810204
11156; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k3
11157; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm18, %zmm0 {%k3}
11158; AVX512DQ-BW-FCP-NEXT:    movabsq $-8714997200177740921, %rax # imm = 0x870E1C3870E1C387
11159; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k3
11160; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm29 {%k3}
11161; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm13, %xmm19, %xmm0
11162; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm12, %xmm17, %xmm6
11163; AVX512DQ-BW-FCP-NEXT:    vpor %xmm0, %xmm6, %xmm0
11164; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm17[0],xmm19[0],xmm17[1],xmm19[1],xmm17[2],xmm19[2],xmm17[3],xmm19[3],xmm17[4],xmm19[4],xmm17[5],xmm19[5],xmm17[6],xmm19[6],xmm17[7],xmm19[7]
11165; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
11166; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm12, %xmm6, %xmm6
11167; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm6, %zmm0
11168; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm11, %xmm22, %xmm6
11169; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm8, %xmm21, %xmm8
11170; AVX512DQ-BW-FCP-NEXT:    vpor %xmm6, %xmm8, %xmm6
11171; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm21[0],xmm22[0],xmm21[1],xmm22[1],xmm21[2],xmm22[2],xmm21[3],xmm22[3],xmm21[4],xmm22[4],xmm21[5],xmm22[5],xmm21[6],xmm22[6],xmm21[7],xmm22[7]
11172; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
11173; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm11, %xmm8, %xmm8
11174; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm6, %zmm8, %zmm6
11175; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm8 = zmm0[0,1,0,1,4,5,4,5]
11176; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm6[0,1,0,1,4,5,4,5]
11177; AVX512DQ-BW-FCP-NEXT:    movabsq $871499720017774092, %rax # imm = 0xC183060C183060C
11178; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k3
11179; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm8, %zmm0 {%k3}
11180; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm10, %xmm16, %xmm6
11181; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm7, %xmm15, %xmm7
11182; AVX512DQ-BW-FCP-NEXT:    vpor %xmm6, %xmm7, %xmm6
11183; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm15[0],xmm16[0],xmm15[1],xmm16[1],xmm15[2],xmm16[2],xmm15[3],xmm16[3],xmm15[4],xmm16[4],xmm15[5],xmm16[5],xmm15[6],xmm16[6],xmm15[7],xmm16[7]
11184; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm10 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
11185; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm10, %xmm7, %xmm7
11186; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm6, %zmm7, %zmm6
11187; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm6 = zmm6[0,1,0,1,4,5,4,5]
11188; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, (%rsp), %zmm31, %zmm8 # 32-byte Folded Reload
11189; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm7 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20]
11190; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm8, %zmm7, %zmm7
11191; AVX512DQ-BW-FCP-NEXT:    movabsq $4647998506761461824, %rax # imm = 0x4081020408102040
11192; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k3
11193; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm7, %zmm6 {%k3}
11194; AVX512DQ-BW-FCP-NEXT:    movabsq $8133997386832558192, %rax # imm = 0x70E1C3870E1C3870
11195; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k3
11196; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm6, %zmm0 {%k3}
11197; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
11198; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm11, %xmm1, %xmm1
11199; AVX512DQ-BW-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
11200; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm13[28],zero,ymm13[30,31,30,31],zero,ymm13[29],zero,ymm13[31,28,29]
11201; AVX512DQ-BW-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
11202; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm15[30],zero,ymm15[28],zero,zero,zero,zero,ymm15[31],zero,ymm15[29],zero,zero,zero
11203; AVX512DQ-BW-FCP-NEXT:    vpor %ymm2, %ymm6, %ymm2
11204; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm2[2,3,2,3],zmm1[0,1,0,1]
11205; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
11206; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm12, %xmm2, %xmm2
11207; AVX512DQ-BW-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
11208; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29],zero
11209; AVX512DQ-BW-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
11210; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm7[30],zero,ymm7[28],zero,zero,zero,zero,ymm7[31],zero,ymm7[29],zero,zero
11211; AVX512DQ-BW-FCP-NEXT:    vpor %ymm4, %ymm5, %ymm4
11212; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm4[2,3,2,3],zmm2[0,1,0,1]
11213; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm1, %zmm2 {%k1}
11214; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm30[0],xmm3[1],xmm30[1],xmm3[2],xmm30[2],xmm3[3],xmm30[3],xmm3[4],xmm30[4],xmm3[5],xmm30[5],xmm3[6],xmm30[6],xmm3[7],xmm30[7]
11215; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm10, %xmm1, %xmm1
11216; AVX512DQ-BW-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
11217; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm5[27],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero
11218; AVX512DQ-BW-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11219; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[27],zero,zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29]
11220; AVX512DQ-BW-FCP-NEXT:    vpor %ymm3, %ymm4, %ymm3
11221; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm3[2,3,2,3],zmm1[0,1,0,1]
11222; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm3 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16]
11223; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm31, %zmm3, %zmm3
11224; AVX512DQ-BW-FCP-NEXT:    movabsq $580999813345182728, %rax # imm = 0x810204081020408
11225; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k1
11226; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm3, %zmm1 {%k1}
11227; AVX512DQ-BW-FCP-NEXT:    movabsq $1016749673354069774, %rax # imm = 0xE1C3870E1C3870E
11228; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k1
11229; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm1, %zmm2 {%k1}
11230; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm26, %zmm1
11231; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm1[18,19,20,21],zero,zmm1[19],zero,zmm1[21,20,21,22],zero,zmm1[20],zero,zmm1[22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm1[55],zero,zero,zero,zero,zmm1[58],zero,zmm1[56],zero,zero,zero,zero,zmm1[59],zero
11232; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm27, %zmm3
11233; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm3[21],zero,zmm3[19],zero,zero,zero,zero,zmm3[22],zero,zmm3[20],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm3[55],zero,zero,zero,zero,zmm3[58],zero,zmm3[56],zero,zero,zero,zero,zmm3[59],zero,zmm3[57]
11234; AVX512DQ-BW-FCP-NEXT:    vporq %zmm1, %zmm3, %zmm1
11235; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm24, %zmm3
11236; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm3[18],zero,zmm3[18,19,20,21],zero,zmm3[19],zero,zmm3[25,26,27,22],zero,zmm3[20],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm3[56,57],zero,zmm3[55],zero,zmm3[53,54,55,58],zero,zmm3[56],zero,zmm3[60,61,58,59]
11237; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm23, %zmm4
11238; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm4[18],zero,zero,zero,zero,zmm4[21],zero,zmm4[19],zero,zero,zero,zero,zmm4[22],zero,zmm4[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm4[57],zero,zmm4[55],zero,zero,zero,zero,zmm4[58],zero,zmm4[56],zero,zero,zero,zero
11239; AVX512DQ-BW-FCP-NEXT:    vporq %zmm3, %zmm4, %zmm3
11240; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7]
11241; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm3 = zmm3[2,3,2,3,6,7,6,7]
11242; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm1, %zmm3 {%k2}
11243; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm28, %zmm1
11244; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm1[20],zero,zmm1[18],zero,zmm1[20,21,20,21],zero,zmm1[19],zero,zmm1[19,20,21,22],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm1[56,57,56,57],zero,zmm1[55],zero,zmm1[55,56,57,58],zero,zmm1[56],zero,zmm1[62,63]
11245; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm25, %zmm4
11246; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm4[20],zero,zmm4[18],zero,zero,zero,zero,zmm4[21],zero,zmm4[19],zero,zero,zero,zero,zmm4[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm4[57],zero,zmm4[55],zero,zero,zero,zero,zmm4[58],zero,zmm4[56],zero,zero
11247; AVX512DQ-BW-FCP-NEXT:    vporq %zmm1, %zmm4, %zmm1
11248; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm4 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28]
11249; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm8, %zmm4, %zmm4
11250; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7]
11251; AVX512DQ-BW-FCP-NEXT:    movabsq $1161999626690365456, %rax # imm = 0x1020408102040810
11252; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k1
11253; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm4, %zmm1 {%k1}
11254; AVX512DQ-BW-FCP-NEXT:    movabsq $2033499346708139548, %rax # imm = 0x1C3870E1C3870E1C
11255; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k1
11256; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm1, %zmm3 {%k1}
11257; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
11258; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, 128(%rax)
11259; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, 320(%rax)
11260; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, 192(%rax)
11261; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
11262; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm29, 256(%rax)
11263; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, 64(%rax)
11264; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm20, 384(%rax)
11265; AVX512DQ-BW-FCP-NEXT:    addq $104, %rsp
11266; AVX512DQ-BW-FCP-NEXT:    vzeroupper
11267; AVX512DQ-BW-FCP-NEXT:    retq
11268  %in.vec0 = load <64 x i8>, ptr %in.vecptr0, align 64
11269  %in.vec1 = load <64 x i8>, ptr %in.vecptr1, align 64
11270  %in.vec2 = load <64 x i8>, ptr %in.vecptr2, align 64
11271  %in.vec3 = load <64 x i8>, ptr %in.vecptr3, align 64
11272  %in.vec4 = load <64 x i8>, ptr %in.vecptr4, align 64
11273  %in.vec5 = load <64 x i8>, ptr %in.vecptr5, align 64
11274  %in.vec6 = load <64 x i8>, ptr %in.vecptr6, align 64
11275  %1 = shufflevector <64 x i8> %in.vec0, <64 x i8> %in.vec1, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
11276  %2 = shufflevector <64 x i8> %in.vec2, <64 x i8> %in.vec3, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
11277  %3 = shufflevector <64 x i8> %in.vec4, <64 x i8> %in.vec5, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
11278  %4 = shufflevector <128 x i8> %1, <128 x i8> %2, <256 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255>
11279  %5 = shufflevector <64 x i8> %in.vec6, <64 x i8> poison, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
11280  %6 = shufflevector <128 x i8> %3, <128 x i8> %5, <192 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191>
11281  %7 = shufflevector <192 x i8> %6, <192 x i8> poison, <256 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
11282  %8 = shufflevector <256 x i8> %4, <256 x i8> %7, <448 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255, i32 256, i32 257, i32 258, i32 259, i32 260, i32 261, i32 262, i32 263, i32 264, i32 265, i32 266, i32 267, i32 268, i32 269, i32 270, i32 271, i32 272, i32 273, i32 274, i32 275, i32 276, i32 277, i32 278, i32 279, i32 280, i32 281, i32 282, i32 283, i32 284, i32 285, i32 286, i32 287, i32 288, i32 289, i32 290, i32 291, i32 292, i32 293, i32 294, i32 295, i32 296, i32 297, i32 298, i32 299, i32 300, i32 301, i32 302, i32 303, i32 304, i32 305, i32 306, i32 307, i32 308, i32 309, i32 310, i32 311, i32 312, i32 313, i32 314, i32 315, i32 316, i32 317, i32 318, i32 319, i32 320, i32 321, i32 322, i32 323, i32 324, i32 325, i32 326, i32 327, i32 328, i32 329, i32 330, i32 331, i32 332, i32 333, i32 334, i32 335, i32 336, i32 337, i32 338, i32 339, i32 340, i32 341, i32 342, i32 343, i32 344, i32 345, i32 346, i32 347, i32 348, i32 349, i32 350, i32 351, i32 352, i32 353, i32 354, i32 355, i32 356, i32 357, i32 358, i32 359, i32 360, i32 361, i32 362, i32 363, i32 364, i32 365, i32 366, i32 367, i32 368, i32 369, i32 370, i32 371, i32 372, i32 373, i32 374, i32 375, i32 376, i32 377, i32 378, i32 379, i32 380, i32 381, i32 382, i32 383, i32 384, i32 385, i32 386, i32 387, i32 388, i32 389, i32 390, i32 391, i32 392, i32 393, i32 394, i32 395, i32 396, i32 397, i32 398, i32 399, i32 400, i32 401, i32 402, i32 403, i32 404, i32 405, i32 406, i32 407, i32 408, i32 409, i32 410, i32 411, i32 412, i32 413, i32 414, i32 415, i32 416, i32 417, i32 418, i32 419, i32 420, i32 421, i32 422, i32 423, i32 424, i32 425, i32 426, i32 427, i32 428, i32 429, i32 430, i32 431, i32 432, i32 433, i32 434, i32 435, i32 436, i32 437, i32 438, i32 439, i32 440, i32 441, i32 442, i32 443, i32 444, i32 445, i32 446, i32 447>
11283  %interleaved.vec = shufflevector <448 x i8> %8, <448 x i8> poison, <448 x i32> <i32 0, i32 64, i32 128, i32 192, i32 256, i32 320, i32 384, i32 1, i32 65, i32 129, i32 193, i32 257, i32 321, i32 385, i32 2, i32 66, i32 130, i32 194, i32 258, i32 322, i32 386, i32 3, i32 67, i32 131, i32 195, i32 259, i32 323, i32 387, i32 4, i32 68, i32 132, i32 196, i32 260, i32 324, i32 388, i32 5, i32 69, i32 133, i32 197, i32 261, i32 325, i32 389, i32 6, i32 70, i32 134, i32 198, i32 262, i32 326, i32 390, i32 7, i32 71, i32 135, i32 199, i32 263, i32 327, i32 391, i32 8, i32 72, i32 136, i32 200, i32 264, i32 328, i32 392, i32 9, i32 73, i32 137, i32 201, i32 265, i32 329, i32 393, i32 10, i32 74, i32 138, i32 202, i32 266, i32 330, i32 394, i32 11, i32 75, i32 139, i32 203, i32 267, i32 331, i32 395, i32 12, i32 76, i32 140, i32 204, i32 268, i32 332, i32 396, i32 13, i32 77, i32 141, i32 205, i32 269, i32 333, i32 397, i32 14, i32 78, i32 142, i32 206, i32 270, i32 334, i32 398, i32 15, i32 79, i32 143, i32 207, i32 271, i32 335, i32 399, i32 16, i32 80, i32 144, i32 208, i32 272, i32 336, i32 400, i32 17, i32 81, i32 145, i32 209, i32 273, i32 337, i32 401, i32 18, i32 82, i32 146, i32 210, i32 274, i32 338, i32 402, i32 19, i32 83, i32 147, i32 211, i32 275, i32 339, i32 403, i32 20, i32 84, i32 148, i32 212, i32 276, i32 340, i32 404, i32 21, i32 85, i32 149, i32 213, i32 277, i32 341, i32 405, i32 22, i32 86, i32 150, i32 214, i32 278, i32 342, i32 406, i32 23, i32 87, i32 151, i32 215, i32 279, i32 343, i32 407, i32 24, i32 88, i32 152, i32 216, i32 280, i32 344, i32 408, i32 25, i32 89, i32 153, i32 217, i32 281, i32 345, i32 409, i32 26, i32 90, i32 154, i32 218, i32 282, i32 346, i32 410, i32 27, i32 91, i32 155, i32 219, i32 283, i32 347, i32 411, i32 28, i32 92, i32 156, i32 220, i32 284, i32 348, i32 412, i32 29, i32 93, i32 157, i32 221, i32 285, i32 349, i32 413, i32 30, i32 94, i32 158, i32 222, i32 286, i32 350, i32 414, i32 31, i32 95, i32 159, i32 223, i32 287, i32 351, i32 415, i32 32, i32 96, i32 160, i32 224, i32 288, i32 352, i32 416, i32 33, i32 97, i32 161, i32 225, i32 289, i32 353, i32 417, i32 34, i32 98, i32 162, i32 226, i32 290, i32 354, i32 418, i32 35, i32 99, i32 163, i32 227, i32 291, i32 355, i32 419, i32 36, i32 100, i32 164, i32 228, i32 292, i32 356, i32 420, i32 37, i32 101, i32 165, i32 229, i32 293, i32 357, i32 421, i32 38, i32 102, i32 166, i32 230, i32 294, i32 358, i32 422, i32 39, i32 103, i32 167, i32 231, i32 295, i32 359, i32 423, i32 40, i32 104, i32 168, i32 232, i32 296, i32 360, i32 424, i32 41, i32 105, i32 169, i32 233, i32 297, i32 361, i32 425, i32 42, i32 106, i32 170, i32 234, i32 298, i32 362, i32 426, i32 43, i32 107, i32 171, i32 235, i32 299, i32 363, i32 427, i32 44, i32 108, i32 172, i32 236, i32 300, i32 364, i32 428, i32 45, i32 109, i32 173, i32 237, i32 301, i32 365, i32 429, i32 46, i32 110, i32 174, i32 238, i32 302, i32 366, i32 430, i32 47, i32 111, i32 175, i32 239, i32 303, i32 367, i32 431, i32 48, i32 112, i32 176, i32 240, i32 304, i32 368, i32 432, i32 49, i32 113, i32 177, i32 241, i32 305, i32 369, i32 433, i32 50, i32 114, i32 178, i32 242, i32 306, i32 370, i32 434, i32 51, i32 115, i32 179, i32 243, i32 307, i32 371, i32 435, i32 52, i32 116, i32 180, i32 244, i32 308, i32 372, i32 436, i32 53, i32 117, i32 181, i32 245, i32 309, i32 373, i32 437, i32 54, i32 118, i32 182, i32 246, i32 310, i32 374, i32 438, i32 55, i32 119, i32 183, i32 247, i32 311, i32 375, i32 439, i32 56, i32 120, i32 184, i32 248, i32 312, i32 376, i32 440, i32 57, i32 121, i32 185, i32 249, i32 313, i32 377, i32 441, i32 58, i32 122, i32 186, i32 250, i32 314, i32 378, i32 442, i32 59, i32 123, i32 187, i32 251, i32 315, i32 379, i32 443, i32 60, i32 124, i32 188, i32 252, i32 316, i32 380, i32 444, i32 61, i32 125, i32 189, i32 253, i32 317, i32 381, i32 445, i32 62, i32 126, i32 190, i32 254, i32 318, i32 382, i32 446, i32 63, i32 127, i32 191, i32 255, i32 319, i32 383, i32 447>
11284  store <448 x i8> %interleaved.vec, ptr %out.vec, align 64
11285  ret void
11286}
11287