; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx  | FileCheck %s --check-prefixes=AVX
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP

; These patterns are produced by LoopVectorizer for interleaved stores.

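; For illustration only (an assumption, not part of the generated checks): a
; scalar loop of roughly the following shape is the kind of source the
; LoopVectorizer turns into the interleaved shufflevector + wide-store IR
; tested below. The function and variable names are hypothetical.
;
;   void store_stride5(char *out, const char *a, const char *b, const char *c,
;                      const char *d, const char *e, int n) {
;     for (int i = 0; i < n; i++) {
;       out[5 * i + 0] = a[i]; // lane 0 of each interleaved group of 5
;       out[5 * i + 1] = b[i]; // lane 1
;       out[5 * i + 2] = c[i]; // lane 2
;       out[5 * i + 3] = d[i]; // lane 3
;       out[5 * i + 4] = e[i]; // lane 4
;     }
;   }
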
18define void @store_i8_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind {
19; SSE-LABEL: store_i8_stride5_vf2:
20; SSE:       # %bb.0:
21; SSE-NEXT:    movdqa (%rdi), %xmm0
22; SSE-NEXT:    movdqa (%rdx), %xmm1
23; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
24; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
25; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
26; SSE-NEXT:    pxor %xmm1, %xmm1
27; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
28; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,2,1,3,4,5,6,7]
29; SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
30; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4]
31; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
32; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
33; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,7,5]
34; SSE-NEXT:    packuswb %xmm0, %xmm1
35; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,255,255]
36; SSE-NEXT:    pand %xmm0, %xmm1
37; SSE-NEXT:    pshufd {{.*#+}} xmm2 = mem[0,0,0,0]
38; SSE-NEXT:    pandn %xmm2, %xmm0
39; SSE-NEXT:    por %xmm1, %xmm0
40; SSE-NEXT:    movq %xmm0, (%r9)
41; SSE-NEXT:    pextrw $4, %xmm0, %eax
42; SSE-NEXT:    movw %ax, 8(%r9)
43; SSE-NEXT:    retq
44;
45; AVX-LABEL: store_i8_stride5_vf2:
46; AVX:       # %bb.0:
47; AVX-NEXT:    vmovdqa (%rdi), %xmm0
48; AVX-NEXT:    vmovdqa (%rdx), %xmm1
49; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
50; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
51; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
52; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
53; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,2,6,10,14,3,u,u,u,u,u,u]
54; AVX-NEXT:    vpextrw $4, %xmm0, 8(%r9)
55; AVX-NEXT:    vmovq %xmm0, (%r9)
56; AVX-NEXT:    retq
57;
58; AVX2-LABEL: store_i8_stride5_vf2:
59; AVX2:       # %bb.0:
60; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
61; AVX2-NEXT:    vmovdqa (%rdx), %xmm1
62; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
63; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
64; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
65; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
66; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,2,6,10,14,3,u,u,u,u,u,u]
67; AVX2-NEXT:    vpextrw $4, %xmm0, 8(%r9)
68; AVX2-NEXT:    vmovq %xmm0, (%r9)
69; AVX2-NEXT:    retq
70;
71; AVX2-FP-LABEL: store_i8_stride5_vf2:
72; AVX2-FP:       # %bb.0:
73; AVX2-FP-NEXT:    vmovdqa (%rdi), %xmm0
74; AVX2-FP-NEXT:    vmovdqa (%rdx), %xmm1
75; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
76; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
77; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
78; AVX2-FP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
79; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,2,6,10,14,3,u,u,u,u,u,u]
80; AVX2-FP-NEXT:    vpextrw $4, %xmm0, 8(%r9)
81; AVX2-FP-NEXT:    vmovq %xmm0, (%r9)
82; AVX2-FP-NEXT:    retq
83;
84; AVX2-FCP-LABEL: store_i8_stride5_vf2:
85; AVX2-FCP:       # %bb.0:
86; AVX2-FCP-NEXT:    vmovdqa (%rdi), %xmm0
87; AVX2-FCP-NEXT:    vmovdqa (%rdx), %xmm1
88; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
89; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
90; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
91; AVX2-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
92; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,2,6,10,14,3,u,u,u,u,u,u]
93; AVX2-FCP-NEXT:    vpextrw $4, %xmm0, 8(%r9)
94; AVX2-FCP-NEXT:    vmovq %xmm0, (%r9)
95; AVX2-FCP-NEXT:    retq
96;
97; AVX512-LABEL: store_i8_stride5_vf2:
98; AVX512:       # %bb.0:
99; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
100; AVX512-NEXT:    vmovdqa (%rdx), %xmm1
101; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
102; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
103; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
104; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
105; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,2,6,10,14,3,u,u,u,u,u,u]
106; AVX512-NEXT:    vpextrw $4, %xmm0, 8(%r9)
107; AVX512-NEXT:    vmovq %xmm0, (%r9)
108; AVX512-NEXT:    retq
109;
110; AVX512-FCP-LABEL: store_i8_stride5_vf2:
111; AVX512-FCP:       # %bb.0:
112; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm0
113; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm1
114; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
115; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
116; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
117; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
118; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,2,6,10,14,3,u,u,u,u,u,u]
119; AVX512-FCP-NEXT:    vpextrw $4, %xmm0, 8(%r9)
120; AVX512-FCP-NEXT:    vmovq %xmm0, (%r9)
121; AVX512-FCP-NEXT:    retq
122;
123; AVX512DQ-LABEL: store_i8_stride5_vf2:
124; AVX512DQ:       # %bb.0:
125; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
126; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm1
127; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
128; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
129; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
130; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
131; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,2,6,10,14,3,u,u,u,u,u,u]
132; AVX512DQ-NEXT:    vpextrw $4, %xmm0, 8(%r9)
133; AVX512DQ-NEXT:    vmovq %xmm0, (%r9)
134; AVX512DQ-NEXT:    retq
135;
136; AVX512DQ-FCP-LABEL: store_i8_stride5_vf2:
137; AVX512DQ-FCP:       # %bb.0:
138; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm0
139; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm1
140; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
141; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
142; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
143; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
144; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,2,6,10,14,3,u,u,u,u,u,u]
145; AVX512DQ-FCP-NEXT:    vpextrw $4, %xmm0, 8(%r9)
146; AVX512DQ-FCP-NEXT:    vmovq %xmm0, (%r9)
147; AVX512DQ-FCP-NEXT:    retq
148;
149; AVX512BW-LABEL: store_i8_stride5_vf2:
150; AVX512BW:       # %bb.0:
151; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
152; AVX512BW-NEXT:    vmovdqa (%rdx), %xmm1
153; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
154; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
155; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
156; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
157; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,2,6,10,14,3,u,u,u,u,u,u]
158; AVX512BW-NEXT:    vpextrw $4, %xmm0, 8(%r9)
159; AVX512BW-NEXT:    vmovq %xmm0, (%r9)
160; AVX512BW-NEXT:    retq
161;
162; AVX512BW-FCP-LABEL: store_i8_stride5_vf2:
163; AVX512BW-FCP:       # %bb.0:
164; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
165; AVX512BW-FCP-NEXT:    vmovdqa (%rdx), %xmm1
166; AVX512BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
167; AVX512BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
168; AVX512BW-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
169; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
170; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,2,6,10,14,3,u,u,u,u,u,u]
171; AVX512BW-FCP-NEXT:    vpextrw $4, %xmm0, 8(%r9)
172; AVX512BW-FCP-NEXT:    vmovq %xmm0, (%r9)
173; AVX512BW-FCP-NEXT:    retq
174;
175; AVX512DQ-BW-LABEL: store_i8_stride5_vf2:
176; AVX512DQ-BW:       # %bb.0:
177; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %xmm0
178; AVX512DQ-BW-NEXT:    vmovdqa (%rdx), %xmm1
179; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
180; AVX512DQ-BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
181; AVX512DQ-BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
182; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
183; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,2,6,10,14,3,u,u,u,u,u,u]
184; AVX512DQ-BW-NEXT:    vpextrw $4, %xmm0, 8(%r9)
185; AVX512DQ-BW-NEXT:    vmovq %xmm0, (%r9)
186; AVX512DQ-BW-NEXT:    retq
187;
188; AVX512DQ-BW-FCP-LABEL: store_i8_stride5_vf2:
189; AVX512DQ-BW-FCP:       # %bb.0:
190; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
191; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdx), %xmm1
192; AVX512DQ-BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
193; AVX512DQ-BW-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
194; AVX512DQ-BW-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
195; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
196; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,2,6,10,14,3,u,u,u,u,u,u]
197; AVX512DQ-BW-FCP-NEXT:    vpextrw $4, %xmm0, 8(%r9)
198; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm0, (%r9)
199; AVX512DQ-BW-FCP-NEXT:    retq
200  %in.vec0 = load <2 x i8>, ptr %in.vecptr0, align 64
201  %in.vec1 = load <2 x i8>, ptr %in.vecptr1, align 64
202  %in.vec2 = load <2 x i8>, ptr %in.vecptr2, align 64
203  %in.vec3 = load <2 x i8>, ptr %in.vecptr3, align 64
204  %in.vec4 = load <2 x i8>, ptr %in.vecptr4, align 64
205  %1 = shufflevector <2 x i8> %in.vec0, <2 x i8> %in.vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
206  %2 = shufflevector <2 x i8> %in.vec2, <2 x i8> %in.vec3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
207  %3 = shufflevector <4 x i8> %1, <4 x i8> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
208  %4 = shufflevector <2 x i8> %in.vec4, <2 x i8> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
209  %5 = shufflevector <8 x i8> %3, <8 x i8> %4, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
210  %interleaved.vec = shufflevector <10 x i8> %5, <10 x i8> poison, <10 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 1, i32 3, i32 5, i32 7, i32 9>
211  store <10 x i8> %interleaved.vec, ptr %out.vec, align 64
212  ret void
213}
214
215define void @store_i8_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind {
216; SSE-LABEL: store_i8_stride5_vf4:
217; SSE:       # %bb.0:
218; SSE-NEXT:    movdqa (%rdi), %xmm1
219; SSE-NEXT:    movdqa (%rdx), %xmm2
220; SSE-NEXT:    movdqa (%r8), %xmm0
221; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
222; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
223; SSE-NEXT:    pxor %xmm3, %xmm3
224; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
225; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[3,1,2,1]
226; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,1,2,0,4,5,6,7]
227; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
228; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
229; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,1,1,3]
230; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7]
231; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
232; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
233; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
234; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,2,0,0]
235; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,2,4,5,6,7]
236; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [65535,65535,0,0,65535,65535,65535,0]
237; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[0,2,2,0]
238; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7]
239; SSE-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,5,7]
240; SSE-NEXT:    pand %xmm5, %xmm6
241; SSE-NEXT:    pandn %xmm4, %xmm5
242; SSE-NEXT:    por %xmm6, %xmm5
243; SSE-NEXT:    packuswb %xmm3, %xmm5
244; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
245; SSE-NEXT:    pand %xmm3, %xmm5
246; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[0,0,0,0]
247; SSE-NEXT:    pandn %xmm4, %xmm3
248; SSE-NEXT:    por %xmm5, %xmm3
249; SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
250; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
251; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7]
252; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
253; SSE-NEXT:    por %xmm1, %xmm2
254; SSE-NEXT:    packuswb %xmm2, %xmm2
255; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
256; SSE-NEXT:    pand %xmm1, %xmm2
257; SSE-NEXT:    pandn %xmm0, %xmm1
258; SSE-NEXT:    por %xmm2, %xmm1
259; SSE-NEXT:    movd %xmm1, 16(%r9)
260; SSE-NEXT:    movdqa %xmm3, (%r9)
261; SSE-NEXT:    retq
262;
263; AVX-LABEL: store_i8_stride5_vf4:
264; AVX:       # %bb.0:
265; AVX-NEXT:    vmovdqa (%rdi), %xmm0
266; AVX-NEXT:    vmovdqa (%rdx), %xmm1
267; AVX-NEXT:    vmovdqa (%r8), %xmm2
268; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
269; AVX-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
270; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
271; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,4,8,12],zero,xmm0[1,5,9,13],zero,xmm0[2,6,10,14],zero,xmm0[3]
272; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm2[0],zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,xmm2[2],zero
273; AVX-NEXT:    vpor %xmm3, %xmm1, %xmm1
274; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6]
275; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
276; AVX-NEXT:    vmovd %xmm0, 16(%r9)
277; AVX-NEXT:    vmovdqa %xmm1, (%r9)
278; AVX-NEXT:    retq
279;
280; AVX2-LABEL: store_i8_stride5_vf4:
281; AVX2:       # %bb.0:
282; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
283; AVX2-NEXT:    vmovdqa (%rdx), %xmm1
284; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
285; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
286; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
287; AVX2-NEXT:    vinserti128 $1, (%r8), %ymm0, %ymm0
288; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,ymm0[1,5,9,13],zero,ymm0[2,6,10,14],zero,ymm0[3],zero,zero,zero,ymm0[19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
289; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
290; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,ymm0[1],zero,zero,zero,zero,ymm0[2],zero,ymm0[23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
291; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
292; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
293; AVX2-NEXT:    vmovd %xmm1, 16(%r9)
294; AVX2-NEXT:    vmovdqa %xmm0, (%r9)
295; AVX2-NEXT:    vzeroupper
296; AVX2-NEXT:    retq
297;
298; AVX2-FP-LABEL: store_i8_stride5_vf4:
299; AVX2-FP:       # %bb.0:
300; AVX2-FP-NEXT:    vmovdqa (%rdi), %xmm0
301; AVX2-FP-NEXT:    vmovdqa (%rdx), %xmm1
302; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
303; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
304; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
305; AVX2-FP-NEXT:    vinserti128 $1, (%r8), %ymm0, %ymm0
306; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,ymm0[1,5,9,13],zero,ymm0[2,6,10,14],zero,ymm0[3],zero,zero,zero,ymm0[19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
307; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
308; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,ymm0[1],zero,zero,zero,zero,ymm0[2],zero,ymm0[23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
309; AVX2-FP-NEXT:    vpor %ymm0, %ymm1, %ymm0
310; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm1
311; AVX2-FP-NEXT:    vmovd %xmm1, 16(%r9)
312; AVX2-FP-NEXT:    vmovdqa %xmm0, (%r9)
313; AVX2-FP-NEXT:    vzeroupper
314; AVX2-FP-NEXT:    retq
315;
316; AVX2-FCP-LABEL: store_i8_stride5_vf4:
317; AVX2-FCP:       # %bb.0:
318; AVX2-FCP-NEXT:    vmovdqa (%rdi), %xmm0
319; AVX2-FCP-NEXT:    vmovdqa (%rdx), %xmm1
320; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
321; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
322; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
323; AVX2-FCP-NEXT:    vinserti128 $1, (%r8), %ymm0, %ymm0
324; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,ymm0[1,5,9,13],zero,ymm0[2,6,10,14],zero,ymm0[3],zero,zero,zero,ymm0[19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
325; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
326; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,ymm0[1],zero,zero,zero,zero,ymm0[2],zero,ymm0[23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
327; AVX2-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
328; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
329; AVX2-FCP-NEXT:    vmovd %xmm1, 16(%r9)
330; AVX2-FCP-NEXT:    vmovdqa %xmm0, (%r9)
331; AVX2-FCP-NEXT:    vzeroupper
332; AVX2-FCP-NEXT:    retq
333;
334; AVX512-LABEL: store_i8_stride5_vf4:
335; AVX512:       # %bb.0:
336; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
337; AVX512-NEXT:    vmovdqa (%rdx), %xmm1
338; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
339; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
340; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
341; AVX512-NEXT:    vinserti128 $1, (%r8), %ymm0, %ymm0
342; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,ymm0[1,5,9,13],zero,ymm0[2,6,10,14],zero,ymm0[3],zero,zero,zero,ymm0[19,u,u,u,u,u,u,u,u,u,u,u,u]
343; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
344; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,ymm0[1],zero,zero,zero,zero,ymm0[2],zero,ymm0[23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
345; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
346; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
347; AVX512-NEXT:    vmovd %xmm1, 16(%r9)
348; AVX512-NEXT:    vmovdqa %xmm0, (%r9)
349; AVX512-NEXT:    vzeroupper
350; AVX512-NEXT:    retq
351;
352; AVX512-FCP-LABEL: store_i8_stride5_vf4:
353; AVX512-FCP:       # %bb.0:
354; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm0
355; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm1
356; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
357; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
358; AVX512-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
359; AVX512-FCP-NEXT:    vinserti128 $1, (%r8), %ymm0, %ymm0
360; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,ymm0[1,5,9,13],zero,ymm0[2,6,10,14],zero,ymm0[3],zero,zero,zero,ymm0[19,u,u,u,u,u,u,u,u,u,u,u,u]
361; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
362; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,ymm0[1],zero,zero,zero,zero,ymm0[2],zero,ymm0[23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
363; AVX512-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
364; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
365; AVX512-FCP-NEXT:    vmovd %xmm1, 16(%r9)
366; AVX512-FCP-NEXT:    vmovdqa %xmm0, (%r9)
367; AVX512-FCP-NEXT:    vzeroupper
368; AVX512-FCP-NEXT:    retq
369;
370; AVX512DQ-LABEL: store_i8_stride5_vf4:
371; AVX512DQ:       # %bb.0:
372; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
373; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm1
374; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
375; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
376; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
377; AVX512DQ-NEXT:    vinserti128 $1, (%r8), %ymm0, %ymm0
378; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,ymm0[1,5,9,13],zero,ymm0[2,6,10,14],zero,ymm0[3],zero,zero,zero,ymm0[19,u,u,u,u,u,u,u,u,u,u,u,u]
379; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
380; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,ymm0[1],zero,zero,zero,zero,ymm0[2],zero,ymm0[23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
381; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
382; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
383; AVX512DQ-NEXT:    vmovd %xmm1, 16(%r9)
384; AVX512DQ-NEXT:    vmovdqa %xmm0, (%r9)
385; AVX512DQ-NEXT:    vzeroupper
386; AVX512DQ-NEXT:    retq
387;
388; AVX512DQ-FCP-LABEL: store_i8_stride5_vf4:
389; AVX512DQ-FCP:       # %bb.0:
390; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm0
391; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm1
392; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
393; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
394; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
395; AVX512DQ-FCP-NEXT:    vinserti128 $1, (%r8), %ymm0, %ymm0
396; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,ymm0[1,5,9,13],zero,ymm0[2,6,10,14],zero,ymm0[3],zero,zero,zero,ymm0[19,u,u,u,u,u,u,u,u,u,u,u,u]
397; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
398; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,ymm0[1],zero,zero,zero,zero,ymm0[2],zero,ymm0[23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
399; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
400; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
401; AVX512DQ-FCP-NEXT:    vmovd %xmm1, 16(%r9)
402; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, (%r9)
403; AVX512DQ-FCP-NEXT:    vzeroupper
404; AVX512DQ-FCP-NEXT:    retq
405;
406; AVX512BW-LABEL: store_i8_stride5_vf4:
407; AVX512BW:       # %bb.0:
408; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
409; AVX512BW-NEXT:    vmovdqa (%rdx), %xmm1
410; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
411; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
412; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
413; AVX512BW-NEXT:    vinserti128 $1, (%r8), %ymm0, %ymm0
414; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,ymm0[1,5,9,13],zero,ymm0[2,6,10,14],zero,ymm0[3],zero,zero,zero,ymm0[19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
415; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
416; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,ymm0[1],zero,zero,zero,zero,ymm0[2],zero,ymm0[23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
417; AVX512BW-NEXT:    vpor %ymm1, %ymm0, %ymm0
418; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
419; AVX512BW-NEXT:    vmovd %xmm1, 16(%r9)
420; AVX512BW-NEXT:    vmovdqa %xmm0, (%r9)
421; AVX512BW-NEXT:    vzeroupper
422; AVX512BW-NEXT:    retq
423;
424; AVX512BW-FCP-LABEL: store_i8_stride5_vf4:
425; AVX512BW-FCP:       # %bb.0:
426; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
427; AVX512BW-FCP-NEXT:    vmovdqa (%rdx), %xmm1
428; AVX512BW-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
429; AVX512BW-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
430; AVX512BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
431; AVX512BW-FCP-NEXT:    vinserti128 $1, (%r8), %ymm0, %ymm0
432; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,ymm0[1,5,9,13],zero,ymm0[2,6,10,14],zero,ymm0[3],zero,zero,zero,ymm0[19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
433; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
434; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,ymm0[1],zero,zero,zero,zero,ymm0[2],zero,ymm0[23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
435; AVX512BW-FCP-NEXT:    vpor %ymm1, %ymm0, %ymm0
436; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
437; AVX512BW-FCP-NEXT:    vmovd %xmm1, 16(%r9)
438; AVX512BW-FCP-NEXT:    vmovdqa %xmm0, (%r9)
439; AVX512BW-FCP-NEXT:    vzeroupper
440; AVX512BW-FCP-NEXT:    retq
441;
442; AVX512DQ-BW-LABEL: store_i8_stride5_vf4:
443; AVX512DQ-BW:       # %bb.0:
444; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %xmm0
445; AVX512DQ-BW-NEXT:    vmovdqa (%rdx), %xmm1
446; AVX512DQ-BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
447; AVX512DQ-BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
448; AVX512DQ-BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
449; AVX512DQ-BW-NEXT:    vinserti128 $1, (%r8), %ymm0, %ymm0
450; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,ymm0[1,5,9,13],zero,ymm0[2,6,10,14],zero,ymm0[3],zero,zero,zero,ymm0[19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
451; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
452; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,ymm0[1],zero,zero,zero,zero,ymm0[2],zero,ymm0[23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
453; AVX512DQ-BW-NEXT:    vpor %ymm1, %ymm0, %ymm0
454; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
455; AVX512DQ-BW-NEXT:    vmovd %xmm1, 16(%r9)
456; AVX512DQ-BW-NEXT:    vmovdqa %xmm0, (%r9)
457; AVX512DQ-BW-NEXT:    vzeroupper
458; AVX512DQ-BW-NEXT:    retq
459;
460; AVX512DQ-BW-FCP-LABEL: store_i8_stride5_vf4:
461; AVX512DQ-BW-FCP:       # %bb.0:
462; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
463; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdx), %xmm1
464; AVX512DQ-BW-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
465; AVX512DQ-BW-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
466; AVX512DQ-BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
467; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, (%r8), %ymm0, %ymm0
468; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,ymm0[1,5,9,13],zero,ymm0[2,6,10,14],zero,ymm0[3],zero,zero,zero,ymm0[19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
469; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
470; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,ymm0[1],zero,zero,zero,zero,ymm0[2],zero,ymm0[23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
471; AVX512DQ-BW-FCP-NEXT:    vpor %ymm1, %ymm0, %ymm0
472; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
473; AVX512DQ-BW-FCP-NEXT:    vmovd %xmm1, 16(%r9)
474; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm0, (%r9)
475; AVX512DQ-BW-FCP-NEXT:    vzeroupper
476; AVX512DQ-BW-FCP-NEXT:    retq
477  %in.vec0 = load <4 x i8>, ptr %in.vecptr0, align 64
478  %in.vec1 = load <4 x i8>, ptr %in.vecptr1, align 64
479  %in.vec2 = load <4 x i8>, ptr %in.vecptr2, align 64
480  %in.vec3 = load <4 x i8>, ptr %in.vecptr3, align 64
481  %in.vec4 = load <4 x i8>, ptr %in.vecptr4, align 64
482  %1 = shufflevector <4 x i8> %in.vec0, <4 x i8> %in.vec1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
483  %2 = shufflevector <4 x i8> %in.vec2, <4 x i8> %in.vec3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
484  %3 = shufflevector <8 x i8> %1, <8 x i8> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
485  %4 = shufflevector <4 x i8> %in.vec4, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
486  %5 = shufflevector <16 x i8> %3, <16 x i8> %4, <20 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
487  %interleaved.vec = shufflevector <20 x i8> %5, <20 x i8> poison, <20 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 1, i32 5, i32 9, i32 13, i32 17, i32 2, i32 6, i32 10, i32 14, i32 18, i32 3, i32 7, i32 11, i32 15, i32 19>
488  store <20 x i8> %interleaved.vec, ptr %out.vec, align 64
489  ret void
490}
491
492define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind {
493; SSE-LABEL: store_i8_stride5_vf8:
494; SSE:       # %bb.0:
495; SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
496; SSE-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
497; SSE-NEXT:    movq {{.*#+}} xmm4 = mem[0],zero
498; SSE-NEXT:    movq {{.*#+}} xmm3 = mem[0],zero
499; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
500; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
501; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [65535,65535,0,65535,0,65535,65535,0]
502; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
503; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[2,1,2,3]
504; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[0,3,2,0,4,5,6,7]
505; SSE-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5]
506; SSE-NEXT:    pand %xmm8, %xmm6
507; SSE-NEXT:    pandn %xmm5, %xmm8
508; SSE-NEXT:    por %xmm6, %xmm8
509; SSE-NEXT:    movdqa {{.*#+}} xmm9 = [255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255]
510; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm4[1,2,2,3,4,5,6,7]
511; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0]
512; SSE-NEXT:    movdqa {{.*#+}} xmm6 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
513; SSE-NEXT:    movdqa %xmm6, %xmm10
514; SSE-NEXT:    pandn %xmm5, %xmm10
515; SSE-NEXT:    movdqa %xmm2, %xmm7
516; SSE-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3],xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7]
517; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[2,1,2,3]
518; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[3,1,0,3,4,5,6,7]
519; SSE-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,6]
520; SSE-NEXT:    pand %xmm6, %xmm5
521; SSE-NEXT:    por %xmm10, %xmm5
522; SSE-NEXT:    pand %xmm9, %xmm5
523; SSE-NEXT:    pandn %xmm8, %xmm9
524; SSE-NEXT:    por %xmm5, %xmm9
525; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
526; SSE-NEXT:    pand %xmm8, %xmm9
527; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm0[0,0,1,1]
528; SSE-NEXT:    movdqa %xmm8, %xmm5
529; SSE-NEXT:    pandn %xmm10, %xmm5
530; SSE-NEXT:    por %xmm9, %xmm5
531; SSE-NEXT:    pshuflw {{.*#+}} xmm9 = xmm4[1,0,2,3,4,5,6,7]
532; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0]
533; SSE-NEXT:    pand %xmm8, %xmm9
534; SSE-NEXT:    pshuflw {{.*#+}} xmm10 = xmm3[1,0,2,3,4,5,6,7]
535; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[0,1,0,1]
536; SSE-NEXT:    pandn %xmm10, %xmm8
537; SSE-NEXT:    por %xmm9, %xmm8
538; SSE-NEXT:    movdqa {{.*#+}} xmm9 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255]
539; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm1[0,0,0,0]
540; SSE-NEXT:    pand %xmm6, %xmm10
541; SSE-NEXT:    pshuflw {{.*#+}} xmm7 = xmm7[0,1,2,2,4,5,6,7]
542; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[0,0,1,3]
543; SSE-NEXT:    pandn %xmm7, %xmm6
544; SSE-NEXT:    por %xmm10, %xmm6
545; SSE-NEXT:    pand %xmm9, %xmm6
546; SSE-NEXT:    pandn %xmm8, %xmm9
547; SSE-NEXT:    por %xmm6, %xmm9
548; SSE-NEXT:    movdqa {{.*#+}} xmm6 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
549; SSE-NEXT:    pand %xmm6, %xmm9
550; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[0,0,0,0]
551; SSE-NEXT:    pandn %xmm7, %xmm6
552; SSE-NEXT:    por %xmm9, %xmm6
553; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,3,3,3,4,5,6,7]
554; SSE-NEXT:    movdqa {{.*#+}} xmm7 = [255,0,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
555; SSE-NEXT:    pand %xmm7, %xmm4
556; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
557; SSE-NEXT:    pandn %xmm3, %xmm7
558; SSE-NEXT:    por %xmm4, %xmm7
559; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255]
560; SSE-NEXT:    pand %xmm3, %xmm7
561; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
562; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,7,7,7,7]
563; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
564; SSE-NEXT:    pandn %xmm1, %xmm3
565; SSE-NEXT:    por %xmm7, %xmm3
566; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,0,255,255,255,255,255,255,255,255]
567; SSE-NEXT:    pand %xmm1, %xmm3
568; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
569; SSE-NEXT:    pandn %xmm0, %xmm1
570; SSE-NEXT:    por %xmm3, %xmm1
571; SSE-NEXT:    movq %xmm1, 32(%r9)
572; SSE-NEXT:    movdqa %xmm6, (%r9)
573; SSE-NEXT:    movdqa %xmm5, 16(%r9)
574; SSE-NEXT:    retq
575;
576; AVX-LABEL: store_i8_stride5_vf8:
577; AVX:       # %bb.0:
578; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
579; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
580; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
581; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
582; AVX-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
583; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
584; AVX-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
585; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u]
586; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u]
587; AVX-NEXT:    vpor %xmm3, %xmm4, %xmm3
588; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
589; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,2,13,6,8,10,12,15,u,u,u,u,u,u,u,u]
590; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,xmm1[0,8,u],zero,zero,xmm1[1,9,u],zero,zero,xmm1[2,10,u],zero
591; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm0[0,8],zero,zero,xmm0[u,1,9],zero,zero,xmm0[u,2,10],zero,zero,xmm0[u,3]
592; AVX-NEXT:    vpor %xmm4, %xmm5, %xmm4
593; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3],zero,xmm4[5,6,7,8],zero,xmm4[10,11,12,13],zero,xmm4[15]
594; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,xmm2[0],zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,xmm2[2],zero
595; AVX-NEXT:    vpor %xmm5, %xmm4, %xmm4
596; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = zero,xmm1[3,11,u],zero,zero,xmm1[4,12,u],zero,zero,xmm1[5,13,u],zero,zero
597; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[11],zero,zero,xmm0[u,4,12],zero,zero,xmm0[u,5,13],zero,zero,xmm0[u,6,14]
598; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
599; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2],zero,xmm0[4,5,6,7],zero,xmm0[9,10,11,12],zero,xmm0[14,15]
600; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm2[3],zero,zero,zero,zero,xmm2[4],zero,zero,zero,zero,xmm2[5],zero,zero
601; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
602; AVX-NEXT:    vmovdqa %xmm0, 16(%r9)
603; AVX-NEXT:    vmovdqa %xmm4, (%r9)
604; AVX-NEXT:    vmovq %xmm3, 32(%r9)
605; AVX-NEXT:    retq
606;
607; AVX2-LABEL: store_i8_stride5_vf8:
608; AVX2:       # %bb.0:
609; AVX2-NEXT:    movq (%r8), %rax
610; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
611; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
612; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
613; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
614; AVX2-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
615; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
616; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm2
617; AVX2-NEXT:    vmovq %rax, %xmm3
618; AVX2-NEXT:    vpshufb {{.*#+}} ymm4 = ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[3],zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero
619; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
620; AVX2-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,ymm2[27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero,ymm2[22,30]
621; AVX2-NEXT:    vpor %ymm2, %ymm4, %ymm2
622; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,0,1,1]
623; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1]
624; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
625; AVX2-NEXT:    vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
626; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u]
627; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u]
628; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
629; AVX2-NEXT:    shrq $48, %rax
630; AVX2-NEXT:    vmovd %eax, %xmm1
631; AVX2-NEXT:    vpbroadcastw %xmm1, %xmm1
632; AVX2-NEXT:    vmovq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,0,0,0,0,0,0,0,0]
633; AVX2-NEXT:    vpblendvb %xmm3, %xmm0, %xmm1, %xmm0
634; AVX2-NEXT:    vmovq %xmm0, 32(%r9)
635; AVX2-NEXT:    vmovdqa %ymm2, (%r9)
636; AVX2-NEXT:    vzeroupper
637; AVX2-NEXT:    retq
638;
639; AVX2-FP-LABEL: store_i8_stride5_vf8:
640; AVX2-FP:       # %bb.0:
641; AVX2-FP-NEXT:    movq (%r8), %rax
642; AVX2-FP-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
643; AVX2-FP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
644; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
645; AVX2-FP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
646; AVX2-FP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
647; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
648; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm2
649; AVX2-FP-NEXT:    vmovq %rax, %xmm3
650; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[3],zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero
651; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
652; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,ymm2[27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero,ymm2[22,30]
653; AVX2-FP-NEXT:    vpor %ymm2, %ymm4, %ymm2
654; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,0,1,1]
655; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1]
656; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
657; AVX2-FP-NEXT:    vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
658; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u]
659; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u]
660; AVX2-FP-NEXT:    vpor %xmm0, %xmm1, %xmm0
661; AVX2-FP-NEXT:    shrq $48, %rax
662; AVX2-FP-NEXT:    vmovd %eax, %xmm1
663; AVX2-FP-NEXT:    vpbroadcastw %xmm1, %xmm1
664; AVX2-FP-NEXT:    vmovq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,0,0,0,0,0,0,0,0]
665; AVX2-FP-NEXT:    vpblendvb %xmm3, %xmm0, %xmm1, %xmm0
666; AVX2-FP-NEXT:    vmovq %xmm0, 32(%r9)
667; AVX2-FP-NEXT:    vmovdqa %ymm2, (%r9)
668; AVX2-FP-NEXT:    vzeroupper
669; AVX2-FP-NEXT:    retq
670;
671; AVX2-FCP-LABEL: store_i8_stride5_vf8:
672; AVX2-FCP:       # %bb.0:
673; AVX2-FCP-NEXT:    movq (%r8), %rax
674; AVX2-FCP-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
675; AVX2-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
676; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
677; AVX2-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
678; AVX2-FCP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
679; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
680; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm2
681; AVX2-FCP-NEXT:    vmovq %rax, %xmm3
682; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[3],zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero
683; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
684; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,ymm2[27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero,ymm2[22,30]
685; AVX2-FCP-NEXT:    vpor %ymm2, %ymm4, %ymm2
686; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,1,1]
687; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm4, %ymm3
688; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
689; AVX2-FCP-NEXT:    vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
690; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u]
691; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u]
692; AVX2-FCP-NEXT:    vpor %xmm0, %xmm1, %xmm0
693; AVX2-FCP-NEXT:    shrq $48, %rax
694; AVX2-FCP-NEXT:    vmovd %eax, %xmm1
695; AVX2-FCP-NEXT:    vpbroadcastw %xmm1, %xmm1
696; AVX2-FCP-NEXT:    vmovq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,0,0,0,0,0,0,0,0]
697; AVX2-FCP-NEXT:    vpblendvb %xmm3, %xmm0, %xmm1, %xmm0
698; AVX2-FCP-NEXT:    vmovq %xmm0, 32(%r9)
699; AVX2-FCP-NEXT:    vmovdqa %ymm2, (%r9)
700; AVX2-FCP-NEXT:    vzeroupper
701; AVX2-FCP-NEXT:    retq
702;
703; AVX512-LABEL: store_i8_stride5_vf8:
704; AVX512:       # %bb.0:
705; AVX512-NEXT:    movq (%r8), %rax
706; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
707; AVX512-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
708; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
709; AVX512-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
710; AVX512-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
711; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
712; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm2
713; AVX512-NEXT:    vmovq %rax, %xmm3
714; AVX512-NEXT:    vpshufb {{.*#+}} ymm4 = ymm2[0,8],zero,zero,ymm2[u,1,9],zero,zero,ymm2[u,2,10],zero,zero,ymm2[u,3],zero,ymm2[19,27,u],zero,zero,ymm2[20,28,u],zero,zero,ymm2[21,29,u],zero,zero
715; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
716; AVX512-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8,u],zero,zero,ymm2[1,9,u],zero,zero,ymm2[2,10,u],zero,ymm2[27],zero,zero,ymm2[u,20,28],zero,zero,ymm2[u,21,29],zero,zero,ymm2[u,22,30]
717; AVX512-NEXT:    vpor %ymm2, %ymm4, %ymm2
718; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,0,1,1]
719; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1]
720; AVX512-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm2))
721; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u]
722; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u]
723; AVX512-NEXT:    vpor %xmm0, %xmm1, %xmm0
724; AVX512-NEXT:    shrq $48, %rax
725; AVX512-NEXT:    vmovd %eax, %xmm1
726; AVX512-NEXT:    vpbroadcastw %xmm1, %xmm1
727; AVX512-NEXT:    vpternlogq {{.*#+}} xmm1 = xmm1 ^ (mem & (xmm1 ^ xmm0))
728; AVX512-NEXT:    vinserti32x4 $2, %xmm1, %zmm3, %zmm0
729; AVX512-NEXT:    vmovq %xmm1, 32(%r9)
730; AVX512-NEXT:    vmovdqa %ymm0, (%r9)
731; AVX512-NEXT:    vzeroupper
732; AVX512-NEXT:    retq
733;
734; AVX512-FCP-LABEL: store_i8_stride5_vf8:
735; AVX512-FCP:       # %bb.0:
736; AVX512-FCP-NEXT:    movq (%r8), %rax
737; AVX512-FCP-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
738; AVX512-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
739; AVX512-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
740; AVX512-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
741; AVX512-FCP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
742; AVX512-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
743; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm2
744; AVX512-FCP-NEXT:    vmovq %rax, %xmm3
745; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm2[0,8],zero,zero,ymm2[u,1,9],zero,zero,ymm2[u,2,10],zero,zero,ymm2[u,3],zero,ymm2[19,27,u],zero,zero,ymm2[20,28,u],zero,zero,ymm2[21,29,u],zero,zero
746; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
747; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8,u],zero,zero,ymm2[1,9,u],zero,zero,ymm2[2,10,u],zero,ymm2[27],zero,zero,ymm2[u,20,28],zero,zero,ymm2[u,21,29],zero,zero,ymm2[u,22,30]
748; AVX512-FCP-NEXT:    vpor %ymm2, %ymm4, %ymm2
749; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,1,1]
750; AVX512-FCP-NEXT:    vpermd %ymm3, %ymm4, %ymm3
751; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm2))
752; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u]
753; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u]
754; AVX512-FCP-NEXT:    vpor %xmm0, %xmm1, %xmm0
755; AVX512-FCP-NEXT:    shrq $48, %rax
756; AVX512-FCP-NEXT:    vmovd %eax, %xmm1
757; AVX512-FCP-NEXT:    vpbroadcastw %xmm1, %xmm1
758; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} xmm1 = xmm1 ^ (mem & (xmm1 ^ xmm0))
759; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm3, %zmm0
760; AVX512-FCP-NEXT:    vmovq %xmm1, 32(%r9)
761; AVX512-FCP-NEXT:    vmovdqa %ymm0, (%r9)
762; AVX512-FCP-NEXT:    vzeroupper
763; AVX512-FCP-NEXT:    retq
764;
765; AVX512DQ-LABEL: store_i8_stride5_vf8:
766; AVX512DQ:       # %bb.0:
767; AVX512DQ-NEXT:    movq (%r8), %rax
768; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
769; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
770; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
771; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
772; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
773; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
774; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm2
775; AVX512DQ-NEXT:    vmovq %rax, %xmm3
776; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm4 = ymm2[0,8],zero,zero,ymm2[u,1,9],zero,zero,ymm2[u,2,10],zero,zero,ymm2[u,3],zero,ymm2[19,27,u],zero,zero,ymm2[20,28,u],zero,zero,ymm2[21,29,u],zero,zero
777; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
778; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8,u],zero,zero,ymm2[1,9,u],zero,zero,ymm2[2,10,u],zero,ymm2[27],zero,zero,ymm2[u,20,28],zero,zero,ymm2[u,21,29],zero,zero,ymm2[u,22,30]
779; AVX512DQ-NEXT:    vpor %ymm2, %ymm4, %ymm2
780; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,0,1,1]
781; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1]
782; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm2))
783; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u]
784; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u]
785; AVX512DQ-NEXT:    vpor %xmm0, %xmm1, %xmm0
786; AVX512DQ-NEXT:    shrq $48, %rax
787; AVX512DQ-NEXT:    vmovd %eax, %xmm1
788; AVX512DQ-NEXT:    vpbroadcastw %xmm1, %xmm1
789; AVX512DQ-NEXT:    vpternlogq {{.*#+}} xmm1 = xmm1 ^ (mem & (xmm1 ^ xmm0))
790; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm1, %zmm3, %zmm0
791; AVX512DQ-NEXT:    vmovq %xmm1, 32(%r9)
792; AVX512DQ-NEXT:    vmovdqa %ymm0, (%r9)
793; AVX512DQ-NEXT:    vzeroupper
794; AVX512DQ-NEXT:    retq
795;
796; AVX512DQ-FCP-LABEL: store_i8_stride5_vf8:
797; AVX512DQ-FCP:       # %bb.0:
798; AVX512DQ-FCP-NEXT:    movq (%r8), %rax
799; AVX512DQ-FCP-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
800; AVX512DQ-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
801; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
802; AVX512DQ-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
803; AVX512DQ-FCP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
804; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
805; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm2
806; AVX512DQ-FCP-NEXT:    vmovq %rax, %xmm3
807; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm2[0,8],zero,zero,ymm2[u,1,9],zero,zero,ymm2[u,2,10],zero,zero,ymm2[u,3],zero,ymm2[19,27,u],zero,zero,ymm2[20,28,u],zero,zero,ymm2[21,29,u],zero,zero
808; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
809; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8,u],zero,zero,ymm2[1,9,u],zero,zero,ymm2[2,10,u],zero,ymm2[27],zero,zero,ymm2[u,20,28],zero,zero,ymm2[u,21,29],zero,zero,ymm2[u,22,30]
810; AVX512DQ-FCP-NEXT:    vpor %ymm2, %ymm4, %ymm2
811; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,1,1]
812; AVX512DQ-FCP-NEXT:    vpermd %ymm3, %ymm4, %ymm3
813; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm2))
814; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u]
815; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u]
816; AVX512DQ-FCP-NEXT:    vpor %xmm0, %xmm1, %xmm0
817; AVX512DQ-FCP-NEXT:    shrq $48, %rax
818; AVX512DQ-FCP-NEXT:    vmovd %eax, %xmm1
819; AVX512DQ-FCP-NEXT:    vpbroadcastw %xmm1, %xmm1
820; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} xmm1 = xmm1 ^ (mem & (xmm1 ^ xmm0))
821; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm1, %zmm3, %zmm0
822; AVX512DQ-FCP-NEXT:    vmovq %xmm1, 32(%r9)
823; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, (%r9)
824; AVX512DQ-FCP-NEXT:    vzeroupper
825; AVX512DQ-FCP-NEXT:    retq
826;
827; AVX512BW-LABEL: store_i8_stride5_vf8:
828; AVX512BW:       # %bb.0:
829; AVX512BW-NEXT:    movq (%r8), %rax
830; AVX512BW-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
831; AVX512BW-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
832; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
833; AVX512BW-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
834; AVX512BW-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
835; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
836; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm2
837; AVX512BW-NEXT:    vmovq %rax, %xmm3
838; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm4 = ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[3],zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero
839; AVX512BW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
840; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,ymm2[27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero,ymm2[22,30]
841; AVX512BW-NEXT:    vpor %ymm4, %ymm2, %ymm2
842; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,0,1,1]
843; AVX512BW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1]
844; AVX512BW-NEXT:    movl $554189328, %ecx # imm = 0x21084210
845; AVX512BW-NEXT:    kmovd %ecx, %k1
846; AVX512BW-NEXT:    vmovdqu8 %ymm3, %ymm2 {%k1}
847; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u]
848; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u]
849; AVX512BW-NEXT:    vpor %xmm0, %xmm1, %xmm0
850; AVX512BW-NEXT:    shrq $48, %rax
851; AVX512BW-NEXT:    vpbroadcastw %eax, %xmm1
852; AVX512BW-NEXT:    movw $132, %ax
853; AVX512BW-NEXT:    kmovd %eax, %k1
854; AVX512BW-NEXT:    vmovdqu8 %xmm1, %xmm0 {%k1}
855; AVX512BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm2, %zmm1
856; AVX512BW-NEXT:    vmovq %xmm0, 32(%r9)
857; AVX512BW-NEXT:    vmovdqa %ymm1, (%r9)
858; AVX512BW-NEXT:    vzeroupper
859; AVX512BW-NEXT:    retq
860;
861; AVX512BW-FCP-LABEL: store_i8_stride5_vf8:
862; AVX512BW-FCP:       # %bb.0:
863; AVX512BW-FCP-NEXT:    movq (%r8), %rax
864; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
865; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
866; AVX512BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
867; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
868; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
869; AVX512BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
870; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm2
871; AVX512BW-FCP-NEXT:    vmovq %rax, %xmm3
872; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[3],zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero
873; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
874; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,ymm2[27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero,ymm2[22,30]
875; AVX512BW-FCP-NEXT:    vpor %ymm4, %ymm2, %ymm2
876; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,1,1]
877; AVX512BW-FCP-NEXT:    vpermd %ymm3, %ymm4, %ymm3
878; AVX512BW-FCP-NEXT:    movl $554189328, %ecx # imm = 0x21084210
879; AVX512BW-FCP-NEXT:    kmovd %ecx, %k1
880; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm3, %ymm2 {%k1}
881; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u]
882; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u]
883; AVX512BW-FCP-NEXT:    vpor %xmm0, %xmm1, %xmm0
884; AVX512BW-FCP-NEXT:    shrq $48, %rax
885; AVX512BW-FCP-NEXT:    vpbroadcastw %eax, %xmm1
886; AVX512BW-FCP-NEXT:    movw $132, %ax
887; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
888; AVX512BW-FCP-NEXT:    vmovdqu8 %xmm1, %xmm0 {%k1}
889; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm2, %zmm1
890; AVX512BW-FCP-NEXT:    vmovq %xmm0, 32(%r9)
891; AVX512BW-FCP-NEXT:    vmovdqa %ymm1, (%r9)
892; AVX512BW-FCP-NEXT:    vzeroupper
893; AVX512BW-FCP-NEXT:    retq
894;
895; AVX512DQ-BW-LABEL: store_i8_stride5_vf8:
896; AVX512DQ-BW:       # %bb.0:
897; AVX512DQ-BW-NEXT:    movq (%r8), %rax
898; AVX512DQ-BW-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
899; AVX512DQ-BW-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
900; AVX512DQ-BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
901; AVX512DQ-BW-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
902; AVX512DQ-BW-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
903; AVX512DQ-BW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
904; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm2
905; AVX512DQ-BW-NEXT:    vmovq %rax, %xmm3
906; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm4 = ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[3],zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero
907; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
908; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,ymm2[27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero,ymm2[22,30]
909; AVX512DQ-BW-NEXT:    vpor %ymm4, %ymm2, %ymm2
910; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,0,1,1]
911; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1]
912; AVX512DQ-BW-NEXT:    movl $554189328, %ecx # imm = 0x21084210
913; AVX512DQ-BW-NEXT:    kmovd %ecx, %k1
914; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm3, %ymm2 {%k1}
915; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u]
916; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u]
917; AVX512DQ-BW-NEXT:    vpor %xmm0, %xmm1, %xmm0
918; AVX512DQ-BW-NEXT:    shrq $48, %rax
919; AVX512DQ-BW-NEXT:    vpbroadcastw %eax, %xmm1
920; AVX512DQ-BW-NEXT:    movw $132, %ax
921; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
922; AVX512DQ-BW-NEXT:    vmovdqu8 %xmm1, %xmm0 {%k1}
923; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm0, %zmm2, %zmm1
924; AVX512DQ-BW-NEXT:    vmovq %xmm0, 32(%r9)
925; AVX512DQ-BW-NEXT:    vmovdqa %ymm1, (%r9)
926; AVX512DQ-BW-NEXT:    vzeroupper
927; AVX512DQ-BW-NEXT:    retq
928;
929; AVX512DQ-BW-FCP-LABEL: store_i8_stride5_vf8:
930; AVX512DQ-BW-FCP:       # %bb.0:
931; AVX512DQ-BW-FCP-NEXT:    movq (%r8), %rax
932; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
933; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
934; AVX512DQ-BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
935; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
936; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
937; AVX512DQ-BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
938; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm2
939; AVX512DQ-BW-FCP-NEXT:    vmovq %rax, %xmm3
940; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[3],zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero
941; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
942; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,ymm2[27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero,ymm2[22,30]
943; AVX512DQ-BW-FCP-NEXT:    vpor %ymm4, %ymm2, %ymm2
944; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,1,1]
945; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm3, %ymm4, %ymm3
946; AVX512DQ-BW-FCP-NEXT:    movl $554189328, %ecx # imm = 0x21084210
947; AVX512DQ-BW-FCP-NEXT:    kmovd %ecx, %k1
948; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm3, %ymm2 {%k1}
949; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u]
950; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u]
951; AVX512DQ-BW-FCP-NEXT:    vpor %xmm0, %xmm1, %xmm0
952; AVX512DQ-BW-FCP-NEXT:    shrq $48, %rax
953; AVX512DQ-BW-FCP-NEXT:    vpbroadcastw %eax, %xmm1
954; AVX512DQ-BW-FCP-NEXT:    movw $132, %ax
955; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
956; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %xmm1, %xmm0 {%k1}
957; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm2, %zmm1
958; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm0, 32(%r9)
959; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm1, (%r9)
960; AVX512DQ-BW-FCP-NEXT:    vzeroupper
961; AVX512DQ-BW-FCP-NEXT:    retq
962  %in.vec0 = load <8 x i8>, ptr %in.vecptr0, align 64
963  %in.vec1 = load <8 x i8>, ptr %in.vecptr1, align 64
964  %in.vec2 = load <8 x i8>, ptr %in.vecptr2, align 64
965  %in.vec3 = load <8 x i8>, ptr %in.vecptr3, align 64
966  %in.vec4 = load <8 x i8>, ptr %in.vecptr4, align 64
967  %1 = shufflevector <8 x i8> %in.vec0, <8 x i8> %in.vec1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
968  %2 = shufflevector <8 x i8> %in.vec2, <8 x i8> %in.vec3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
969  %3 = shufflevector <16 x i8> %1, <16 x i8> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
970  %4 = shufflevector <8 x i8> %in.vec4, <8 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
971  %5 = shufflevector <32 x i8> %3, <32 x i8> %4, <40 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39>
972  %interleaved.vec = shufflevector <40 x i8> %5, <40 x i8> poison, <40 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 1, i32 9, i32 17, i32 25, i32 33, i32 2, i32 10, i32 18, i32 26, i32 34, i32 3, i32 11, i32 19, i32 27, i32 35, i32 4, i32 12, i32 20, i32 28, i32 36, i32 5, i32 13, i32 21, i32 29, i32 37, i32 6, i32 14, i32 22, i32 30, i32 38, i32 7, i32 15, i32 23, i32 31, i32 39>
973  store <40 x i8> %interleaved.vec, ptr %out.vec, align 64
974  ret void
975}
976
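; vf16: the IR below loads five <16 x i8> inputs, concatenates them into an <80 x i8> vector, and interleaves with stride 5 (output byte 5*i+j is element i of source vector j) before a single 80-byte store.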
977define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind {
978; SSE-LABEL: store_i8_stride5_vf16:
979; SSE:       # %bb.0:
980; SSE-NEXT:    movdqa (%rdi), %xmm12
981; SSE-NEXT:    movdqa (%rsi), %xmm8
982; SSE-NEXT:    movdqa (%rdx), %xmm9
983; SSE-NEXT:    movdqa (%rcx), %xmm4
984; SSE-NEXT:    movdqa (%r8), %xmm0
985; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm9[3,3,3,3,4,5,6,7]
986; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
987; SSE-NEXT:    movdqa {{.*#+}} xmm6 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
988; SSE-NEXT:    pand %xmm6, %xmm1
989; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[2,1,2,3]
990; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
991; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[3,3,0,3]
992; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4]
993; SSE-NEXT:    movdqa %xmm6, %xmm5
994; SSE-NEXT:    pandn %xmm3, %xmm5
995; SSE-NEXT:    por %xmm1, %xmm5
996; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255]
997; SSE-NEXT:    pand %xmm2, %xmm5
998; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm12[1,1,2,2]
999; SSE-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1000; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
1001; SSE-NEXT:    pand %xmm1, %xmm7
1002; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm8[0,1,2,1]
1003; SSE-NEXT:    pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,7,6,7]
1004; SSE-NEXT:    punpckhbw {{.*#+}} xmm10 = xmm10[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1005; SSE-NEXT:    pshuflw {{.*#+}} xmm10 = xmm10[3,1,0,3,4,5,6,7]
1006; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[0,0,1,0]
1007; SSE-NEXT:    movdqa %xmm1, %xmm11
1008; SSE-NEXT:    pandn %xmm10, %xmm11
1009; SSE-NEXT:    por %xmm7, %xmm11
1010; SSE-NEXT:    movdqa %xmm2, %xmm10
1011; SSE-NEXT:    pandn %xmm11, %xmm10
1012; SSE-NEXT:    por %xmm5, %xmm10
1013; SSE-NEXT:    movdqa {{.*#+}} xmm7 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255]
1014; SSE-NEXT:    pand %xmm7, %xmm10
1015; SSE-NEXT:    movdqa %xmm0, %xmm5
1016; SSE-NEXT:    pshufd {{.*#+}} xmm11 = xmm0[1,1,2,2]
1017; SSE-NEXT:    movdqa %xmm7, %xmm0
1018; SSE-NEXT:    pandn %xmm11, %xmm0
1019; SSE-NEXT:    por %xmm10, %xmm0
1020; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1021; SSE-NEXT:    pshufd {{.*#+}} xmm11 = xmm12[2,2,3,3]
1022; SSE-NEXT:    movdqa {{.*#+}} xmm12 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
1023; SSE-NEXT:    pand %xmm12, %xmm11
1024; SSE-NEXT:    movdqa %xmm8, %xmm0
1025; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
1026; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1027; SSE-NEXT:    pshufd {{.*#+}} xmm13 = xmm0[0,1,2,1]
1028; SSE-NEXT:    pshuflw {{.*#+}} xmm13 = xmm13[2,2,2,2,4,5,6,7]
1029; SSE-NEXT:    pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,5,4,7]
1030; SSE-NEXT:    movdqa %xmm12, %xmm14
1031; SSE-NEXT:    pandn %xmm13, %xmm14
1032; SSE-NEXT:    por %xmm11, %xmm14
1033; SSE-NEXT:    movdqa {{.*#+}} xmm13 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255]
1034; SSE-NEXT:    movdqa %xmm13, %xmm11
1035; SSE-NEXT:    pandn %xmm14, %xmm11
1036; SSE-NEXT:    pshufhw {{.*#+}} xmm14 = xmm9[0,1,2,3,5,6,6,7]
1037; SSE-NEXT:    movdqa %xmm9, %xmm10
1038; SSE-NEXT:    pshufd {{.*#+}} xmm14 = xmm14[2,2,2,2]
1039; SSE-NEXT:    movdqa %xmm1, %xmm15
1040; SSE-NEXT:    pandn %xmm14, %xmm15
1041; SSE-NEXT:    movdqa %xmm4, %xmm14
1042; SSE-NEXT:    punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm4[8],xmm14[9],xmm4[9],xmm14[10],xmm4[10],xmm14[11],xmm4[11],xmm14[12],xmm4[12],xmm14[13],xmm4[13],xmm14[14],xmm4[14],xmm14[15],xmm4[15]
1043; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm14[0,1,2,1]
1044; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7]
1045; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,4]
1046; SSE-NEXT:    pand %xmm1, %xmm0
1047; SSE-NEXT:    por %xmm15, %xmm0
1048; SSE-NEXT:    pand %xmm13, %xmm0
1049; SSE-NEXT:    por %xmm11, %xmm0
1050; SSE-NEXT:    pshufd {{.*#+}} xmm15 = xmm5[2,2,2,2]
1051; SSE-NEXT:    movdqa %xmm6, %xmm11
1052; SSE-NEXT:    pandn %xmm15, %xmm11
1053; SSE-NEXT:    pand %xmm6, %xmm0
1054; SSE-NEXT:    por %xmm0, %xmm11
1055; SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1056; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,1,2,3]
1057; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,0,4,5,6,7]
1058; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
1059; SSE-NEXT:    movdqa %xmm7, %xmm15
1060; SSE-NEXT:    pandn %xmm0, %xmm15
1061; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm9[1,2,2,3,4,5,6,7]
1062; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1063; SSE-NEXT:    pand %xmm7, %xmm0
1064; SSE-NEXT:    por %xmm0, %xmm15
1065; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
1066; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1]
1067; SSE-NEXT:    movdqa %xmm1, %xmm3
1068; SSE-NEXT:    pandn %xmm0, %xmm3
1069; SSE-NEXT:    punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1070; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm8[2,1,2,3]
1071; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,0,3,4,5,6,7]
1072; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,6]
1073; SSE-NEXT:    pand %xmm1, %xmm0
1074; SSE-NEXT:    por %xmm3, %xmm0
1075; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255]
1076; SSE-NEXT:    pand %xmm3, %xmm0
1077; SSE-NEXT:    pandn %xmm15, %xmm3
1078; SSE-NEXT:    por %xmm0, %xmm3
1079; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,0,1,1]
1080; SSE-NEXT:    movdqa %xmm12, %xmm15
1081; SSE-NEXT:    pandn %xmm0, %xmm15
1082; SSE-NEXT:    pand %xmm12, %xmm3
1083; SSE-NEXT:    por %xmm3, %xmm15
1084; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm10[1,0,2,3,4,5,6,7]
1085; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1086; SSE-NEXT:    pand %xmm12, %xmm0
1087; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm4[1,0,2,3,4,5,6,7]
1088; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
1089; SSE-NEXT:    pandn %xmm3, %xmm12
1090; SSE-NEXT:    por %xmm0, %xmm12
1091; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[0,0,0,0]
1092; SSE-NEXT:    pand %xmm6, %xmm0
1093; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm8[0,1,2,2,4,5,6,7]
1094; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,1,3]
1095; SSE-NEXT:    pandn %xmm3, %xmm6
1096; SSE-NEXT:    por %xmm0, %xmm6
1097; SSE-NEXT:    pand %xmm13, %xmm6
1098; SSE-NEXT:    pandn %xmm12, %xmm13
1099; SSE-NEXT:    por %xmm6, %xmm13
1100; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,0,0,0]
1101; SSE-NEXT:    movdqa %xmm1, %xmm6
1102; SSE-NEXT:    pandn %xmm0, %xmm6
1103; SSE-NEXT:    pand %xmm1, %xmm13
1104; SSE-NEXT:    por %xmm13, %xmm6
1105; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,7,6,6,7]
1106; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
1107; SSE-NEXT:    pand %xmm1, %xmm0
1108; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm14[0,1,2,3,5,7,6,7]
1109; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,3,2]
1110; SSE-NEXT:    pandn %xmm3, %xmm1
1111; SSE-NEXT:    por %xmm0, %xmm1
1112; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3]
1113; SSE-NEXT:    pand %xmm7, %xmm0
1114; SSE-NEXT:    pshufhw $167, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
1115; SSE-NEXT:    # xmm3 = mem[0,1,2,3,7,5,6,6]
1116; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,2,2]
1117; SSE-NEXT:    pandn %xmm3, %xmm7
1118; SSE-NEXT:    por %xmm0, %xmm7
1119; SSE-NEXT:    pand %xmm2, %xmm7
1120; SSE-NEXT:    pandn %xmm1, %xmm2
1121; SSE-NEXT:    por %xmm7, %xmm2
1122; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0]
1123; SSE-NEXT:    pand %xmm0, %xmm2
1124; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[3,3,3,3]
1125; SSE-NEXT:    pandn %xmm1, %xmm0
1126; SSE-NEXT:    por %xmm2, %xmm0
1127; SSE-NEXT:    movdqa %xmm0, 64(%r9)
1128; SSE-NEXT:    movdqa %xmm6, (%r9)
1129; SSE-NEXT:    movdqa %xmm15, 16(%r9)
1130; SSE-NEXT:    movdqa %xmm11, 48(%r9)
1131; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1132; SSE-NEXT:    movaps %xmm0, 32(%r9)
1133; SSE-NEXT:    retq
1134;
1135; AVX-LABEL: store_i8_stride5_vf16:
1136; AVX:       # %bb.0:
1137; AVX-NEXT:    vmovdqa (%rdi), %xmm2
1138; AVX-NEXT:    vmovdqa (%rsi), %xmm3
1139; AVX-NEXT:    vmovdqa (%rdx), %xmm1
1140; AVX-NEXT:    vmovdqa (%rcx), %xmm4
1141; AVX-NEXT:    vmovdqa (%r8), %xmm0
1142; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = zero,xmm4[6,u,u,u],zero,xmm4[7,u,u,u],zero,xmm4[8,u,u,u],zero
1143; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[6],zero,xmm1[u,u,u,7],zero,xmm1[u,u,u,8],zero,xmm1[u,u,u,9]
1144; AVX-NEXT:    vpor %xmm5, %xmm6, %xmm5
1145; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm3[u,u,u],zero,xmm3[7,u,u,u],zero,xmm3[8,u,u,u],zero,xmm3[9,u]
1146; AVX-NEXT:    vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,7],zero,xmm2[u,u,u,8],zero,xmm2[u,u,u,9],zero,xmm2[u]
1147; AVX-NEXT:    vpor %xmm6, %xmm7, %xmm6
1148; AVX-NEXT:    vmovdqa {{.*#+}} xmm7 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255]
1149; AVX-NEXT:    vpblendvb %xmm7, %xmm5, %xmm6, %xmm5
1150; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[0,1],zero,xmm5[3,4,5,6],zero,xmm5[8,9,10,11],zero,xmm5[13,14,15]
1151; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,xmm0[6],zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,xmm0[8],zero,zero,zero
1152; AVX-NEXT:    vpor %xmm6, %xmm5, %xmm5
1153; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
1154; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[10,11],zero,zero,zero,xmm6[12,13],zero,zero,zero,xmm6[14,15],zero
1155; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
1156; AVX-NEXT:    vpshufb {{.*#+}} xmm8 = zero,xmm7[10,11],zero,zero,zero,xmm7[12,13],zero,zero,zero,xmm7[14,15],zero,zero,zero
1157; AVX-NEXT:    vpor %xmm6, %xmm8, %xmm6
1158; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
1159; AVX-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,xmm8[0,1],zero,zero,zero,xmm8[2,3],zero,zero,zero,xmm8[4,5],zero,zero
1160; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
1161; AVX-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[0,1],zero,zero,zero,xmm10[2,3],zero,zero,zero,xmm10[4,5],zero,zero,zero,xmm10[6]
1162; AVX-NEXT:    vpor %xmm9, %xmm10, %xmm9
1163; AVX-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,xmm0[2],zero
1164; AVX-NEXT:    vpor %xmm10, %xmm9, %xmm9
1165; AVX-NEXT:    vpshufb {{.*#+}} xmm8 = zero,xmm8[6,7],zero,zero,zero,xmm8[8,9],zero,zero,zero,xmm8[10,11],zero,zero,zero
1166; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
1167; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[6],zero,zero,zero,xmm2[9,8],zero,zero,zero,xmm2[11,10],zero,zero,zero,xmm2[13,12]
1168; AVX-NEXT:    vpor %xmm2, %xmm8, %xmm2
1169; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm0[3],zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,xmm0[5],zero,zero
1170; AVX-NEXT:    vpor %xmm3, %xmm2, %xmm2
1171; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm7[4,5],zero,zero,zero,xmm7[6,7],zero,zero,zero,xmm7[8,9],zero,zero
1172; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
1173; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,xmm1[5,4],zero,zero,zero,xmm1[7,6],zero,zero,zero,xmm1[9,8]
1174; AVX-NEXT:    vpor %xmm3, %xmm1, %xmm1
1175; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = zero,xmm0[9],zero,zero,zero,zero,xmm0[10],zero,zero,zero,zero,xmm0[11],zero,zero,zero,zero
1176; AVX-NEXT:    vpor %xmm3, %xmm1, %xmm1
1177; AVX-NEXT:    vmovdqa %xmm1, 48(%r9)
1178; AVX-NEXT:    vmovdqa %xmm2, 16(%r9)
1179; AVX-NEXT:    vmovdqa %xmm9, (%r9)
1180; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15]
1181; AVX-NEXT:    vpor %xmm0, %xmm6, %xmm0
1182; AVX-NEXT:    vmovdqa %xmm0, 64(%r9)
1183; AVX-NEXT:    vmovdqa %xmm5, 32(%r9)
1184; AVX-NEXT:    retq
1185;
1186; AVX2-LABEL: store_i8_stride5_vf16:
1187; AVX2:       # %bb.0:
1188; AVX2-NEXT:    vmovdqa (%rdi), %xmm1
1189; AVX2-NEXT:    vmovdqa (%rdx), %xmm2
1190; AVX2-NEXT:    vmovdqa (%r8), %xmm0
1191; AVX2-NEXT:    vinserti128 $1, (%rsi), %ymm1, %ymm1
1192; AVX2-NEXT:    vinserti128 $1, (%rcx), %ymm2, %ymm2
1193; AVX2-NEXT:    vpshufb {{.*#+}} ymm3 = ymm2[6],zero,zero,zero,zero,ymm2[7],zero,zero,zero,zero,ymm2[8],zero,zero,zero,zero,ymm2[9,25],zero,zero,zero,zero,ymm2[26],zero,zero,zero,zero,ymm2[27],zero,zero,zero,zero,ymm2[28]
1194; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
1195; AVX2-NEXT:    vpshufb {{.*#+}} ymm4 = zero,ymm4[6],zero,zero,zero,zero,ymm4[7],zero,zero,zero,zero,ymm4[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[26],zero,zero,zero,zero,ymm4[27],zero,zero,zero,zero,ymm4[28],zero
1196; AVX2-NEXT:    vpor %ymm4, %ymm3, %ymm3
1197; AVX2-NEXT:    vpshufb {{.*#+}} ymm4 = zero,zero,zero,ymm1[7],zero,zero,zero,zero,ymm1[8],zero,zero,zero,zero,ymm1[9],zero,zero,zero,zero,zero,ymm1[26],zero,zero,zero,zero,ymm1[27],zero,zero,zero,zero,ymm1[28],zero,zero
1198; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1]
1199; AVX2-NEXT:    vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,ymm5[9],zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero,zero,zero
1200; AVX2-NEXT:    vpor %ymm4, %ymm5, %ymm4
1201; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255]
1202; AVX2-NEXT:    vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
1203; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,1,2,2]
1204; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1]
1205; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
1206; AVX2-NEXT:    vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
1207; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm2[1,3,2,3]
1208; AVX2-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,13],zero,zero,zero,xmm4[6,14],zero,zero,zero,xmm4[7,15],zero
1209; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm1[1,3,2,3]
1210; AVX2-NEXT:    vpshufb {{.*#+}} xmm5 = zero,xmm5[5,13],zero,zero,zero,xmm5[6,14],zero,zero,zero,xmm5[7,15],zero,zero,zero
1211; AVX2-NEXT:    vpor %xmm4, %xmm5, %xmm4
1212; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2]
1213; AVX2-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero
1214; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,0]
1215; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,8],zero,zero,zero,ymm1[1,9],zero,zero,zero,ymm1[2,10],zero,zero,zero,ymm1[3,19],zero,zero,zero,ymm1[28,20],zero,zero,zero,ymm1[29,21],zero,zero,zero,ymm1[30,22]
1216; AVX2-NEXT:    vpor %ymm2, %ymm1, %ymm1
1217; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
1218; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1]
1219; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
1220; AVX2-NEXT:    vpblendvb %ymm5, %ymm1, %ymm2, %ymm1
1221; AVX2-NEXT:    vmovdqa %ymm1, (%r9)
1222; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15]
1223; AVX2-NEXT:    vpor %xmm0, %xmm4, %xmm0
1224; AVX2-NEXT:    vmovdqa %xmm0, 64(%r9)
1225; AVX2-NEXT:    vmovdqa %ymm3, 32(%r9)
1226; AVX2-NEXT:    vzeroupper
1227; AVX2-NEXT:    retq
1228;
1229; AVX2-FP-LABEL: store_i8_stride5_vf16:
1230; AVX2-FP:       # %bb.0:
1231; AVX2-FP-NEXT:    vmovdqa (%rdi), %xmm1
1232; AVX2-FP-NEXT:    vmovdqa (%rdx), %xmm2
1233; AVX2-FP-NEXT:    vmovdqa (%r8), %xmm0
1234; AVX2-FP-NEXT:    vinserti128 $1, (%rsi), %ymm1, %ymm1
1235; AVX2-FP-NEXT:    vinserti128 $1, (%rcx), %ymm2, %ymm2
1236; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm2[6],zero,zero,zero,zero,ymm2[7],zero,zero,zero,zero,ymm2[8],zero,zero,zero,zero,ymm2[9,25],zero,zero,zero,zero,ymm2[26],zero,zero,zero,zero,ymm2[27],zero,zero,zero,zero,ymm2[28]
1237; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
1238; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm4 = zero,ymm4[6],zero,zero,zero,zero,ymm4[7],zero,zero,zero,zero,ymm4[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[26],zero,zero,zero,zero,ymm4[27],zero,zero,zero,zero,ymm4[28],zero
1239; AVX2-FP-NEXT:    vpor %ymm4, %ymm3, %ymm3
1240; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm4 = zero,zero,zero,ymm1[7],zero,zero,zero,zero,ymm1[8],zero,zero,zero,zero,ymm1[9],zero,zero,zero,zero,zero,ymm1[26],zero,zero,zero,zero,ymm1[27],zero,zero,zero,zero,ymm1[28],zero,zero
1241; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1]
1242; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,ymm5[9],zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero,zero,zero
1243; AVX2-FP-NEXT:    vpor %ymm4, %ymm5, %ymm4
1244; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255]
1245; AVX2-FP-NEXT:    vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
1246; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,1,2,2]
1247; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1]
1248; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
1249; AVX2-FP-NEXT:    vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
1250; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm2[1,3,2,3]
1251; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,13],zero,zero,zero,xmm4[6,14],zero,zero,zero,xmm4[7,15],zero
1252; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm5 = ymm1[1,3,2,3]
1253; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,xmm5[5,13],zero,zero,zero,xmm5[6,14],zero,zero,zero,xmm5[7,15],zero,zero,zero
1254; AVX2-FP-NEXT:    vpor %xmm4, %xmm5, %xmm4
1255; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2]
1256; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero
1257; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,0]
1258; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,8],zero,zero,zero,ymm1[1,9],zero,zero,zero,ymm1[2,10],zero,zero,zero,ymm1[3,19],zero,zero,zero,ymm1[28,20],zero,zero,zero,ymm1[29,21],zero,zero,zero,ymm1[30,22]
1259; AVX2-FP-NEXT:    vpor %ymm2, %ymm1, %ymm1
1260; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
1261; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1]
1262; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
1263; AVX2-FP-NEXT:    vpblendvb %ymm5, %ymm1, %ymm2, %ymm1
1264; AVX2-FP-NEXT:    vmovdqa %ymm1, (%r9)
1265; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15]
1266; AVX2-FP-NEXT:    vpor %xmm0, %xmm4, %xmm0
1267; AVX2-FP-NEXT:    vmovdqa %xmm0, 64(%r9)
1268; AVX2-FP-NEXT:    vmovdqa %ymm3, 32(%r9)
1269; AVX2-FP-NEXT:    vzeroupper
1270; AVX2-FP-NEXT:    retq
1271;
1272; AVX2-FCP-LABEL: store_i8_stride5_vf16:
1273; AVX2-FCP:       # %bb.0:
1274; AVX2-FCP-NEXT:    vmovdqa (%rdi), %xmm1
1275; AVX2-FCP-NEXT:    vmovdqa (%rdx), %xmm2
1276; AVX2-FCP-NEXT:    vmovdqa (%r8), %xmm0
1277; AVX2-FCP-NEXT:    vinserti128 $1, (%rsi), %ymm1, %ymm1
1278; AVX2-FCP-NEXT:    vinserti128 $1, (%rcx), %ymm2, %ymm2
1279; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[1,3,2,3]
1280; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,13],zero,zero,zero,xmm3[6,14],zero,zero,zero,xmm3[7,15],zero
1281; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm1[1,3,2,3]
1282; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,xmm4[5,13],zero,zero,zero,xmm4[6,14],zero,zero,zero,xmm4[7,15],zero,zero,zero
1283; AVX2-FCP-NEXT:    vpor %xmm3, %xmm4, %xmm3
1284; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm2[0,2,0,2]
1285; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[0,8],zero,zero,zero,ymm4[1,9],zero,zero,zero,ymm4[2,10],zero,zero,zero,ymm4[19,27],zero,zero,zero,ymm4[20,28],zero,zero,zero,ymm4[21,29],zero,zero,zero
1286; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm1[0,2,2,0]
1287; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[0,8],zero,zero,zero,ymm5[1,9],zero,zero,zero,ymm5[2,10],zero,zero,zero,ymm5[3,19],zero,zero,zero,ymm5[28,20],zero,zero,zero,ymm5[29,21],zero,zero,zero,ymm5[30,22]
1288; AVX2-FCP-NEXT:    vpor %ymm4, %ymm5, %ymm4
1289; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,0,0,1,1]
1290; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm5, %ymm5
1291; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
1292; AVX2-FCP-NEXT:    vpblendvb %ymm6, %ymm4, %ymm5, %ymm4
1293; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [1,5,2,6,2,6,3,7]
1294; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm5, %ymm1
1295; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm1[3,7],zero,zero,zero,ymm1[8,12],zero,zero,zero,ymm1[9,13],zero,zero,zero,ymm1[18,22],zero,zero,zero,ymm1[19,23],zero,zero,zero,ymm1[24,28],zero,zero
1296; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [1,5,2,6,6,2,3,7]
1297; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm5, %ymm2
1298; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[2,6],zero,zero,zero,ymm2[3,7],zero,zero,zero,ymm2[8,12],zero,zero,zero,ymm2[9,17],zero,zero,zero,ymm2[22,18],zero,zero,zero,ymm2[23,19],zero,zero,zero,ymm2[24,28]
1299; AVX2-FCP-NEXT:    vpor %ymm1, %ymm2, %ymm1
1300; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [1,1,2,2,2,2,2,2]
1301; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm2, %ymm2
1302; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
1303; AVX2-FCP-NEXT:    vpblendvb %ymm5, %ymm1, %ymm2, %ymm1
1304; AVX2-FCP-NEXT:    vmovdqa %ymm1, 32(%r9)
1305; AVX2-FCP-NEXT:    vmovdqa %ymm4, (%r9)
1306; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15]
1307; AVX2-FCP-NEXT:    vpor %xmm0, %xmm3, %xmm0
1308; AVX2-FCP-NEXT:    vmovdqa %xmm0, 64(%r9)
1309; AVX2-FCP-NEXT:    vzeroupper
1310; AVX2-FCP-NEXT:    retq
1311;
1312; AVX512-LABEL: store_i8_stride5_vf16:
1313; AVX512:       # %bb.0:
1314; AVX512-NEXT:    vmovdqa (%rdi), %xmm1
1315; AVX512-NEXT:    vmovdqa (%rdx), %xmm2
1316; AVX512-NEXT:    vmovdqa (%r8), %xmm0
1317; AVX512-NEXT:    vinserti128 $1, (%rcx), %ymm2, %ymm2
1318; AVX512-NEXT:    vinserti128 $1, (%rsi), %ymm1, %ymm1
1319; AVX512-NEXT:    vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,7],zero,ymm1[u,u,u,8],zero,ymm1[u,u,u,9],zero,ymm1[u,u,u],zero,ymm1[26,u,u,u],zero,ymm1[27,u,u,u],zero,ymm1[28,u,u]
1320; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1]
1321; AVX512-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u],zero,ymm4[7,u,u,u],zero,ymm4[8,u,u,u],zero,ymm4[9,u,u,u,26],zero,ymm4[u,u,u,27],zero,ymm4[u,u,u,28],zero,ymm4[u,u]
1322; AVX512-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255]
1323; AVX512-NEXT:    vpternlogq {{.*#+}} ymm4 = ~ymm5 & (ymm4 | ymm3)
1324; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[0,2,0,2]
1325; AVX512-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u],zero,zero,ymm3[1,9,u],zero,zero,ymm3[2,10,u],zero,zero,ymm3[19,27,u],zero,zero,ymm3[20,28,u],zero,zero,ymm3[21,29,u],zero,zero
1326; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
1327; AVX512-NEXT:    vpshufb {{.*#+}} ymm4 = ymm2[6],zero,ymm2[u,u,u,7],zero,ymm2[u,u,u,8],zero,ymm2[u,u,u,9,25,u,u,u],zero,ymm2[26,u,u,u],zero,ymm2[27,u,u,u],zero,ymm2[28]
1328; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1]
1329; AVX512-NEXT:    vpshufb {{.*#+}} ymm6 = zero,ymm6[6,u,u,u],zero,ymm6[7,u,u,u],zero,ymm6[8,u,u,u],zero,zero,ymm6[u,u,u,26],zero,ymm6[u,u,u,27],zero,ymm6[u,u,u,28],zero
1330; AVX512-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm5 & (ymm6 | ymm4)
1331; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm1[0,2,2,0]
1332; AVX512-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,ymm4[u,1,9],zero,zero,ymm4[u,2,10],zero,zero,ymm4[u,3,19],zero,zero,ymm4[u,28,20],zero,zero,ymm4[u,29,21],zero,zero,ymm4[u,30,22]
1333; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm4, %zmm4
1334; AVX512-NEXT:    vporq %zmm3, %zmm4, %zmm3
1335; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0]
1336; AVX512-NEXT:    vpermd %zmm0, %zmm4, %zmm4
1337; AVX512-NEXT:    vpternlogd {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3))
1338; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3]
1339; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u],zero,zero,xmm2[5,13,u],zero,zero,xmm2[6,14,u],zero,zero,xmm2[7,15,u]
1340; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3]
1341; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,5,13],zero,zero,xmm1[u,6,14],zero,zero,xmm1[u,7,15],zero,zero,xmm1[u]
1342; AVX512-NEXT:    vpor %xmm2, %xmm1, %xmm1
1343; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4],zero,xmm1[6,7,8,9],zero,xmm1[11,12,13,14],zero
1344; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15]
1345; AVX512-NEXT:    vpor %xmm0, %xmm1, %xmm0
1346; AVX512-NEXT:    vmovdqa %xmm0, 64(%r9)
1347; AVX512-NEXT:    vmovdqa64 %zmm4, (%r9)
1348; AVX512-NEXT:    vzeroupper
1349; AVX512-NEXT:    retq
1350;
1351; AVX512-FCP-LABEL: store_i8_stride5_vf16:
1352; AVX512-FCP:       # %bb.0:
1353; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm1
1354; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm2
1355; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm0
1356; AVX512-FCP-NEXT:    vinserti128 $1, (%rsi), %ymm1, %ymm1
1357; AVX512-FCP-NEXT:    vinserti128 $1, (%rcx), %ymm2, %ymm2
1358; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[0,2,0,2]
1359; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u],zero,zero,ymm3[1,9,u],zero,zero,ymm3[2,10,u],zero,zero,ymm3[19,27,u],zero,zero,ymm3[20,28,u],zero,zero,ymm3[21,29,u],zero,zero
1360; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [1,5,2,6,2,6,3,7]
1361; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm4, %ymm4
1362; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[u,3,7],zero,zero,ymm4[u,8,12],zero,zero,ymm4[u,9,13],zero,zero,ymm4[u,18,22],zero,zero,ymm4[u,19,23],zero,zero,ymm4[u,24,28],zero,zero
1363; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
1364; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm1[0,2,2,0]
1365; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,ymm4[u,1,9],zero,zero,ymm4[u,2,10],zero,zero,ymm4[u,3,19],zero,zero,ymm4[u,28,20],zero,zero,ymm4[u,29,21],zero,zero,ymm4[u,30,22]
1366; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [1,5,2,6,6,2,3,7]
1367; AVX512-FCP-NEXT:    vpermd %ymm2, %ymm5, %ymm5
1368; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[2,6,u],zero,zero,ymm5[3,7,u],zero,zero,ymm5[8,12,u],zero,zero,ymm5[9,17,u],zero,zero,ymm5[22,18,u],zero,zero,ymm5[23,19,u],zero,zero,ymm5[24,28]
1369; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm4, %zmm4
1370; AVX512-FCP-NEXT:    vporq %zmm3, %zmm4, %zmm3
1371; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm4
1372; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0]
1373; AVX512-FCP-NEXT:    vpermd %zmm4, %zmm5, %zmm4
1374; AVX512-FCP-NEXT:    vpternlogd {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3))
1375; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3]
1376; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u],zero,zero,xmm2[5,13,u],zero,zero,xmm2[6,14,u],zero,zero,xmm2[7,15,u]
1377; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3]
1378; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,5,13],zero,zero,xmm1[u,6,14],zero,zero,xmm1[u,7,15],zero,zero,xmm1[u]
1379; AVX512-FCP-NEXT:    vpor %xmm2, %xmm1, %xmm1
1380; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4],zero,xmm1[6,7,8,9],zero,xmm1[11,12,13,14],zero
1381; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15]
1382; AVX512-FCP-NEXT:    vpor %xmm0, %xmm1, %xmm0
1383; AVX512-FCP-NEXT:    vmovdqa %xmm0, 64(%r9)
1384; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, (%r9)
1385; AVX512-FCP-NEXT:    vzeroupper
1386; AVX512-FCP-NEXT:    retq
1387;
1388; AVX512DQ-LABEL: store_i8_stride5_vf16:
1389; AVX512DQ:       # %bb.0:
1390; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm1
1391; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm2
1392; AVX512DQ-NEXT:    vmovdqa (%r8), %xmm0
1393; AVX512DQ-NEXT:    vinserti128 $1, (%rcx), %ymm2, %ymm2
1394; AVX512DQ-NEXT:    vinserti128 $1, (%rsi), %ymm1, %ymm1
1395; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,7],zero,ymm1[u,u,u,8],zero,ymm1[u,u,u,9],zero,ymm1[u,u,u],zero,ymm1[26,u,u,u],zero,ymm1[27,u,u,u],zero,ymm1[28,u,u]
1396; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1]
1397; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u],zero,ymm4[7,u,u,u],zero,ymm4[8,u,u,u],zero,ymm4[9,u,u,u,26],zero,ymm4[u,u,u,27],zero,ymm4[u,u,u,28],zero,ymm4[u,u]
1398; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255]
1399; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm4 = ~ymm5 & (ymm4 | ymm3)
1400; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[0,2,0,2]
1401; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u],zero,zero,ymm3[1,9,u],zero,zero,ymm3[2,10,u],zero,zero,ymm3[19,27,u],zero,zero,ymm3[20,28,u],zero,zero,ymm3[21,29,u],zero,zero
1402; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
1403; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm4 = ymm2[6],zero,ymm2[u,u,u,7],zero,ymm2[u,u,u,8],zero,ymm2[u,u,u,9,25,u,u,u],zero,ymm2[26,u,u,u],zero,ymm2[27,u,u,u],zero,ymm2[28]
1404; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1]
1405; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm6 = zero,ymm6[6,u,u,u],zero,ymm6[7,u,u,u],zero,ymm6[8,u,u,u],zero,zero,ymm6[u,u,u,26],zero,ymm6[u,u,u,27],zero,ymm6[u,u,u,28],zero
1406; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm5 & (ymm6 | ymm4)
1407; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm1[0,2,2,0]
1408; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,ymm4[u,1,9],zero,zero,ymm4[u,2,10],zero,zero,ymm4[u,3,19],zero,zero,ymm4[u,28,20],zero,zero,ymm4[u,29,21],zero,zero,ymm4[u,30,22]
1409; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm6, %zmm4, %zmm4
1410; AVX512DQ-NEXT:    vporq %zmm3, %zmm4, %zmm3
1411; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0]
1412; AVX512DQ-NEXT:    vpermd %zmm0, %zmm4, %zmm4
1413; AVX512DQ-NEXT:    vpternlogd {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3))
1414; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3]
1415; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u],zero,zero,xmm2[5,13,u],zero,zero,xmm2[6,14,u],zero,zero,xmm2[7,15,u]
1416; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3]
1417; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,5,13],zero,zero,xmm1[u,6,14],zero,zero,xmm1[u,7,15],zero,zero,xmm1[u]
1418; AVX512DQ-NEXT:    vpor %xmm2, %xmm1, %xmm1
1419; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4],zero,xmm1[6,7,8,9],zero,xmm1[11,12,13,14],zero
1420; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15]
1421; AVX512DQ-NEXT:    vpor %xmm0, %xmm1, %xmm0
1422; AVX512DQ-NEXT:    vmovdqa %xmm0, 64(%r9)
1423; AVX512DQ-NEXT:    vmovdqa64 %zmm4, (%r9)
1424; AVX512DQ-NEXT:    vzeroupper
1425; AVX512DQ-NEXT:    retq
1426;
1427; AVX512DQ-FCP-LABEL: store_i8_stride5_vf16:
1428; AVX512DQ-FCP:       # %bb.0:
1429; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm1
1430; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm2
1431; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm0
1432; AVX512DQ-FCP-NEXT:    vinserti128 $1, (%rsi), %ymm1, %ymm1
1433; AVX512DQ-FCP-NEXT:    vinserti128 $1, (%rcx), %ymm2, %ymm2
1434; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[0,2,0,2]
1435; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u],zero,zero,ymm3[1,9,u],zero,zero,ymm3[2,10,u],zero,zero,ymm3[19,27,u],zero,zero,ymm3[20,28,u],zero,zero,ymm3[21,29,u],zero,zero
1436; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [1,5,2,6,2,6,3,7]
1437; AVX512DQ-FCP-NEXT:    vpermd %ymm1, %ymm4, %ymm4
1438; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[u,3,7],zero,zero,ymm4[u,8,12],zero,zero,ymm4[u,9,13],zero,zero,ymm4[u,18,22],zero,zero,ymm4[u,19,23],zero,zero,ymm4[u,24,28],zero,zero
1439; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
1440; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm1[0,2,2,0]
1441; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,ymm4[u,1,9],zero,zero,ymm4[u,2,10],zero,zero,ymm4[u,3,19],zero,zero,ymm4[u,28,20],zero,zero,ymm4[u,29,21],zero,zero,ymm4[u,30,22]
1442; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [1,5,2,6,6,2,3,7]
1443; AVX512DQ-FCP-NEXT:    vpermd %ymm2, %ymm5, %ymm5
1444; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[2,6,u],zero,zero,ymm5[3,7,u],zero,zero,ymm5[8,12,u],zero,zero,ymm5[9,17,u],zero,zero,ymm5[22,18,u],zero,zero,ymm5[23,19,u],zero,zero,ymm5[24,28]
1445; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm4, %zmm4
1446; AVX512DQ-FCP-NEXT:    vporq %zmm3, %zmm4, %zmm3
1447; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm4
1448; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0]
1449; AVX512DQ-FCP-NEXT:    vpermd %zmm4, %zmm5, %zmm4
1450; AVX512DQ-FCP-NEXT:    vpternlogd {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3))
1451; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3]
1452; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u],zero,zero,xmm2[5,13,u],zero,zero,xmm2[6,14,u],zero,zero,xmm2[7,15,u]
1453; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3]
1454; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,5,13],zero,zero,xmm1[u,6,14],zero,zero,xmm1[u,7,15],zero,zero,xmm1[u]
1455; AVX512DQ-FCP-NEXT:    vpor %xmm2, %xmm1, %xmm1
1456; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4],zero,xmm1[6,7,8,9],zero,xmm1[11,12,13,14],zero
1457; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15]
1458; AVX512DQ-FCP-NEXT:    vpor %xmm0, %xmm1, %xmm0
1459; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, 64(%r9)
1460; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, (%r9)
1461; AVX512DQ-FCP-NEXT:    vzeroupper
1462; AVX512DQ-FCP-NEXT:    retq
1463;
1464; AVX512BW-LABEL: store_i8_stride5_vf16:
1465; AVX512BW:       # %bb.0:
1466; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
1467; AVX512BW-NEXT:    vmovdqa (%rdx), %xmm1
1468; AVX512BW-NEXT:    vmovdqa (%r8), %xmm2
1469; AVX512BW-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
1470; AVX512BW-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
1471; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm0[7],zero,zero,zero,zero,ymm0[8],zero,zero,zero,zero,ymm0[9],zero,zero,zero,zero,zero,ymm0[26],zero,zero,zero,zero,ymm0[27],zero,zero,zero,zero,ymm0[28],zero,zero
1472; AVX512BW-NEXT:    vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
1473; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[7],zero,zero,zero,zero,ymm4[8],zero,zero,zero,zero,ymm4[9],zero,zero,zero,ymm4[26],zero,zero,zero,zero,ymm4[27],zero,zero,zero,zero,ymm4[28],zero,zero,zero
1474; AVX512BW-NEXT:    vpor %ymm4, %ymm3, %ymm3
1475; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm4 = ymm1[6],zero,zero,zero,zero,ymm1[7],zero,zero,zero,zero,ymm1[8],zero,zero,zero,zero,ymm1[9,25],zero,zero,zero,zero,ymm1[26],zero,zero,zero,zero,ymm1[27],zero,zero,zero,zero,ymm1[28]
1476; AVX512BW-NEXT:    vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1]
1477; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm5 = zero,ymm5[6],zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero
1478; AVX512BW-NEXT:    vpor %ymm4, %ymm5, %ymm4
1479; AVX512BW-NEXT:    movl $831283992, %eax # imm = 0x318C6318
1480; AVX512BW-NEXT:    kmovd %eax, %k1
1481; AVX512BW-NEXT:    vmovdqu8 %ymm3, %ymm4 {%k1}
1482; AVX512BW-NEXT:    vpermq {{.*#+}} ymm3 = ymm0[0,2,2,0]
1483; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[0,8],zero,zero,zero,ymm3[1,9],zero,zero,zero,ymm3[2,10],zero,zero,zero,ymm3[3,19],zero,zero,zero,ymm3[28,20],zero,zero,zero,ymm3[29,21],zero,zero,zero,ymm3[30,22]
1484; AVX512BW-NEXT:    vpermq {{.*#+}} ymm5 = ymm1[0,2,0,2]
1485; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm5 = zero,zero,ymm5[0,8],zero,zero,zero,ymm5[1,9],zero,zero,zero,ymm5[2,10],zero,zero,zero,ymm5[19,27],zero,zero,zero,ymm5[20,28],zero,zero,zero,ymm5[21,29],zero,zero,zero
1486; AVX512BW-NEXT:    vpor %ymm3, %ymm5, %ymm3
1487; AVX512BW-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
1488; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2]
1489; AVX512BW-NEXT:    vpermd %zmm2, %zmm4, %zmm4
1490; AVX512BW-NEXT:    movabsq $595056260442243600, %rax # imm = 0x842108421084210
1491; AVX512BW-NEXT:    kmovq %rax, %k1
1492; AVX512BW-NEXT:    vmovdqu8 %zmm4, %zmm3 {%k1}
1493; AVX512BW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3]
1494; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,13],zero,zero,zero,xmm1[6,14],zero,zero,zero,xmm1[7,15],zero
1495; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,3,2,3]
1496; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = zero,xmm0[5,13],zero,zero,zero,xmm0[6,14],zero,zero,zero,xmm0[7,15],zero,zero,zero
1497; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[12],zero,zero,zero,zero,xmm2[13],zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,xmm2[15]
1498; AVX512BW-NEXT:    vpternlogq {{.*#+}} xmm2 = xmm2 | xmm0 | xmm1
1499; AVX512BW-NEXT:    vmovdqa %xmm2, 64(%r9)
1500; AVX512BW-NEXT:    vmovdqa64 %zmm3, (%r9)
1501; AVX512BW-NEXT:    vzeroupper
1502; AVX512BW-NEXT:    retq
1503;
1504; AVX512BW-FCP-LABEL: store_i8_stride5_vf16:
1505; AVX512BW-FCP:       # %bb.0:
1506; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
1507; AVX512BW-FCP-NEXT:    vmovdqa (%rdx), %xmm1
1508; AVX512BW-FCP-NEXT:    vmovdqa (%r8), %xmm2
1509; AVX512BW-FCP-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
1510; AVX512BW-FCP-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
1511; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,5,2,6,6,2,3,7]
1512; AVX512BW-FCP-NEXT:    vpermd %ymm1, %ymm3, %ymm3
1513; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm0[0,2,2,0]
1514; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
1515; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm3 = zmm3[0,8],zero,zero,zero,zmm3[1,9],zero,zero,zero,zmm3[2,10],zero,zero,zero,zmm3[3,19],zero,zero,zero,zmm3[28,20],zero,zero,zero,zmm3[29,21],zero,zero,zero,zmm3[30,22,34,38],zero,zero,zero,zmm3[35,39],zero,zero,zero,zmm3[40,44],zero,zero,zero,zmm3[41,49],zero,zero,zero,zmm3[54,50],zero,zero,zero,zmm3[55,51],zero,zero,zero,zmm3[56,60]
1516; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [1,5,2,6,2,6,3,7]
1517; AVX512BW-FCP-NEXT:    vpermd %ymm0, %ymm4, %ymm4
1518; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm1[0,2,0,2]
1519; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm4
1520; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm4 = zero,zero,zmm4[0,8],zero,zero,zero,zmm4[1,9],zero,zero,zero,zmm4[2,10],zero,zero,zero,zmm4[19,27],zero,zero,zero,zmm4[20,28],zero,zero,zero,zmm4[21,29],zero,zero,zero,zero,zero,zero,zmm4[35,39],zero,zero,zero,zmm4[40,44],zero,zero,zero,zmm4[41,45],zero,zero,zero,zmm4[50,54],zero,zero,zero,zmm4[51,55],zero,zero,zero,zmm4[56,60],zero,zero
1521; AVX512BW-FCP-NEXT:    vporq %zmm3, %zmm4, %zmm3
1522; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm2, %zmm4
1523; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10]
1524; AVX512BW-FCP-NEXT:    vpermd %zmm4, %zmm5, %zmm4
1525; AVX512BW-FCP-NEXT:    movabsq $595056260442243600, %rax # imm = 0x842108421084210
1526; AVX512BW-FCP-NEXT:    kmovq %rax, %k1
1527; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm4, %zmm3 {%k1}
1528; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[12],zero,zero,zero,zero,xmm2[13],zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,xmm2[15]
1529; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3]
1530; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,13],zero,zero,zero,xmm1[6,14],zero,zero,zero,xmm1[7,15],zero
1531; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,3,2,3]
1532; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = zero,xmm0[5,13],zero,zero,zero,xmm0[6,14],zero,zero,zero,xmm0[7,15],zero,zero,zero
1533; AVX512BW-FCP-NEXT:    vpternlogq {{.*#+}} xmm0 = xmm0 | xmm2 | xmm1
1534; AVX512BW-FCP-NEXT:    vmovdqa %xmm0, 64(%r9)
1535; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, (%r9)
1536; AVX512BW-FCP-NEXT:    vzeroupper
1537; AVX512BW-FCP-NEXT:    retq
1538;
1539; AVX512DQ-BW-LABEL: store_i8_stride5_vf16:
1540; AVX512DQ-BW:       # %bb.0:
1541; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %xmm0
1542; AVX512DQ-BW-NEXT:    vmovdqa (%rdx), %xmm1
1543; AVX512DQ-BW-NEXT:    vmovdqa (%r8), %xmm2
1544; AVX512DQ-BW-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
1545; AVX512DQ-BW-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
1546; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm0[7],zero,zero,zero,zero,ymm0[8],zero,zero,zero,zero,ymm0[9],zero,zero,zero,zero,zero,ymm0[26],zero,zero,zero,zero,ymm0[27],zero,zero,zero,zero,ymm0[28],zero,zero
1547; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
1548; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[7],zero,zero,zero,zero,ymm4[8],zero,zero,zero,zero,ymm4[9],zero,zero,zero,ymm4[26],zero,zero,zero,zero,ymm4[27],zero,zero,zero,zero,ymm4[28],zero,zero,zero
1549; AVX512DQ-BW-NEXT:    vpor %ymm4, %ymm3, %ymm3
1550; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm4 = ymm1[6],zero,zero,zero,zero,ymm1[7],zero,zero,zero,zero,ymm1[8],zero,zero,zero,zero,ymm1[9,25],zero,zero,zero,zero,ymm1[26],zero,zero,zero,zero,ymm1[27],zero,zero,zero,zero,ymm1[28]
1551; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1]
1552; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm5 = zero,ymm5[6],zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero
1553; AVX512DQ-BW-NEXT:    vpor %ymm4, %ymm5, %ymm4
1554; AVX512DQ-BW-NEXT:    movl $831283992, %eax # imm = 0x318C6318
1555; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
1556; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm3, %ymm4 {%k1}
1557; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm3 = ymm0[0,2,2,0]
1558; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[0,8],zero,zero,zero,ymm3[1,9],zero,zero,zero,ymm3[2,10],zero,zero,zero,ymm3[3,19],zero,zero,zero,ymm3[28,20],zero,zero,zero,ymm3[29,21],zero,zero,zero,ymm3[30,22]
1559; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm5 = ymm1[0,2,0,2]
1560; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm5 = zero,zero,ymm5[0,8],zero,zero,zero,ymm5[1,9],zero,zero,zero,ymm5[2,10],zero,zero,zero,ymm5[19,27],zero,zero,zero,ymm5[20,28],zero,zero,zero,ymm5[21,29],zero,zero,zero
1561; AVX512DQ-BW-NEXT:    vpor %ymm3, %ymm5, %ymm3
1562; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
1563; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2]
1564; AVX512DQ-BW-NEXT:    vpermd %zmm2, %zmm4, %zmm4
1565; AVX512DQ-BW-NEXT:    movabsq $595056260442243600, %rax # imm = 0x842108421084210
1566; AVX512DQ-BW-NEXT:    kmovq %rax, %k1
1567; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm4, %zmm3 {%k1}
1568; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3]
1569; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,13],zero,zero,zero,xmm1[6,14],zero,zero,zero,xmm1[7,15],zero
1570; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,3,2,3]
1571; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm0 = zero,xmm0[5,13],zero,zero,zero,xmm0[6,14],zero,zero,zero,xmm0[7,15],zero,zero,zero
1572; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[12],zero,zero,zero,zero,xmm2[13],zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,xmm2[15]
1573; AVX512DQ-BW-NEXT:    vpternlogq {{.*#+}} xmm2 = xmm2 | xmm0 | xmm1
1574; AVX512DQ-BW-NEXT:    vmovdqa %xmm2, 64(%r9)
1575; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, (%r9)
1576; AVX512DQ-BW-NEXT:    vzeroupper
1577; AVX512DQ-BW-NEXT:    retq
1578;
1579; AVX512DQ-BW-FCP-LABEL: store_i8_stride5_vf16:
1580; AVX512DQ-BW-FCP:       # %bb.0:
1581; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
1582; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdx), %xmm1
1583; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%r8), %xmm2
1584; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
1585; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
1586; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,5,2,6,6,2,3,7]
1587; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm1, %ymm3, %ymm3
1588; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm0[0,2,2,0]
1589; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
1590; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm3 = zmm3[0,8],zero,zero,zero,zmm3[1,9],zero,zero,zero,zmm3[2,10],zero,zero,zero,zmm3[3,19],zero,zero,zero,zmm3[28,20],zero,zero,zero,zmm3[29,21],zero,zero,zero,zmm3[30,22,34,38],zero,zero,zero,zmm3[35,39],zero,zero,zero,zmm3[40,44],zero,zero,zero,zmm3[41,49],zero,zero,zero,zmm3[54,50],zero,zero,zero,zmm3[55,51],zero,zero,zero,zmm3[56,60]
1591; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [1,5,2,6,2,6,3,7]
1592; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm0, %ymm4, %ymm4
1593; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm1[0,2,0,2]
1594; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm4
1595; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm4 = zero,zero,zmm4[0,8],zero,zero,zero,zmm4[1,9],zero,zero,zero,zmm4[2,10],zero,zero,zero,zmm4[19,27],zero,zero,zero,zmm4[20,28],zero,zero,zero,zmm4[21,29],zero,zero,zero,zero,zero,zero,zmm4[35,39],zero,zero,zero,zmm4[40,44],zero,zero,zero,zmm4[41,45],zero,zero,zero,zmm4[50,54],zero,zero,zero,zmm4[51,55],zero,zero,zero,zmm4[56,60],zero,zero
1596; AVX512DQ-BW-FCP-NEXT:    vporq %zmm3, %zmm4, %zmm3
1597; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm2, %zmm4
1598; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10]
1599; AVX512DQ-BW-FCP-NEXT:    vpermd %zmm4, %zmm5, %zmm4
1600; AVX512DQ-BW-FCP-NEXT:    movabsq $595056260442243600, %rax # imm = 0x842108421084210
1601; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k1
1602; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm4, %zmm3 {%k1}
1603; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[12],zero,zero,zero,zero,xmm2[13],zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,xmm2[15]
1604; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3]
1605; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,13],zero,zero,zero,xmm1[6,14],zero,zero,zero,xmm1[7,15],zero
1606; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,3,2,3]
1607; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = zero,xmm0[5,13],zero,zero,zero,xmm0[6,14],zero,zero,zero,xmm0[7,15],zero,zero,zero
1608; AVX512DQ-BW-FCP-NEXT:    vpternlogq {{.*#+}} xmm0 = xmm0 | xmm2 | xmm1
1609; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm0, 64(%r9)
1610; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, (%r9)
1611; AVX512DQ-BW-FCP-NEXT:    vzeroupper
1612; AVX512DQ-BW-FCP-NEXT:    retq
1613  %in.vec0 = load <16 x i8>, ptr %in.vecptr0, align 64
1614  %in.vec1 = load <16 x i8>, ptr %in.vecptr1, align 64
1615  %in.vec2 = load <16 x i8>, ptr %in.vecptr2, align 64
1616  %in.vec3 = load <16 x i8>, ptr %in.vecptr3, align 64
1617  %in.vec4 = load <16 x i8>, ptr %in.vecptr4, align 64
1618  %1 = shufflevector <16 x i8> %in.vec0, <16 x i8> %in.vec1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1619  %2 = shufflevector <16 x i8> %in.vec2, <16 x i8> %in.vec3, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1620  %3 = shufflevector <32 x i8> %1, <32 x i8> %2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
1621  %4 = shufflevector <16 x i8> %in.vec4, <16 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1622  %5 = shufflevector <64 x i8> %3, <64 x i8> %4, <80 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79>
1623  %interleaved.vec = shufflevector <80 x i8> %5, <80 x i8> poison, <80 x i32> <i32 0, i32 16, i32 32, i32 48, i32 64, i32 1, i32 17, i32 33, i32 49, i32 65, i32 2, i32 18, i32 34, i32 50, i32 66, i32 3, i32 19, i32 35, i32 51, i32 67, i32 4, i32 20, i32 36, i32 52, i32 68, i32 5, i32 21, i32 37, i32 53, i32 69, i32 6, i32 22, i32 38, i32 54, i32 70, i32 7, i32 23, i32 39, i32 55, i32 71, i32 8, i32 24, i32 40, i32 56, i32 72, i32 9, i32 25, i32 41, i32 57, i32 73, i32 10, i32 26, i32 42, i32 58, i32 74, i32 11, i32 27, i32 43, i32 59, i32 75, i32 12, i32 28, i32 44, i32 60, i32 76, i32 13, i32 29, i32 45, i32 61, i32 77, i32 14, i32 30, i32 46, i32 62, i32 78, i32 15, i32 31, i32 47, i32 63, i32 79>
1624  store <80 x i8> %interleaved.vec, ptr %out.vec, align 64
1625  ret void
1626}
1627
1628define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind {
1629; SSE-LABEL: store_i8_stride5_vf32:
1630; SSE:       # %bb.0:
1631; SSE-NEXT:    subq $152, %rsp
1632; SSE-NEXT:    movdqa 16(%rdi), %xmm15
1633; SSE-NEXT:    movdqa (%rsi), %xmm9
1634; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1635; SSE-NEXT:    movdqa 16(%rsi), %xmm7
1636; SSE-NEXT:    movdqa (%rdx), %xmm2
1637; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1638; SSE-NEXT:    movdqa 16(%rdx), %xmm0
1639; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1640; SSE-NEXT:    movdqa (%rcx), %xmm11
1641; SSE-NEXT:    movdqa 16(%rcx), %xmm12
1642; SSE-NEXT:    movdqa 16(%r8), %xmm14
1643; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
1644; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
1645; SSE-NEXT:    movdqa {{.*#+}} xmm13 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
1646; SSE-NEXT:    pand %xmm13, %xmm0
1647; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm12[2,1,2,3]
1648; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1649; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,0,3]
1650; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
1651; SSE-NEXT:    movdqa %xmm13, %xmm4
1652; SSE-NEXT:    pandn %xmm1, %xmm4
1653; SSE-NEXT:    por %xmm0, %xmm4
1654; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255]
1655; SSE-NEXT:    pand %xmm8, %xmm4
1656; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm15[1,1,2,2]
1657; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
1658; SSE-NEXT:    pand %xmm3, %xmm0
1659; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm7[0,1,2,1]
1660; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
1661; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1662; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,1,0,3,4,5,6,7]
1663; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,1,0]
1664; SSE-NEXT:    movdqa %xmm3, %xmm5
1665; SSE-NEXT:    pandn %xmm1, %xmm5
1666; SSE-NEXT:    por %xmm0, %xmm5
1667; SSE-NEXT:    movdqa %xmm8, %xmm0
1668; SSE-NEXT:    pandn %xmm5, %xmm0
1669; SSE-NEXT:    por %xmm4, %xmm0
1670; SSE-NEXT:    movdqa {{.*#+}} xmm10 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255]
1671; SSE-NEXT:    pand %xmm10, %xmm0
1672; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm14[1,1,2,2]
1673; SSE-NEXT:    movdqa %xmm10, %xmm4
1674; SSE-NEXT:    pandn %xmm1, %xmm4
1675; SSE-NEXT:    por %xmm0, %xmm4
1676; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1677; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm2[3,3,3,3,4,5,6,7]
1678; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
1679; SSE-NEXT:    pand %xmm13, %xmm0
1680; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm11[2,1,2,3]
1681; SSE-NEXT:    movdqa %xmm11, %xmm2
1682; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1683; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,0,3]
1684; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
1685; SSE-NEXT:    movdqa %xmm13, %xmm4
1686; SSE-NEXT:    pandn %xmm1, %xmm4
1687; SSE-NEXT:    por %xmm0, %xmm4
1688; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[0,1,2,1]
1689; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
1690; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1691; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,0,3,4,5,6,7]
1692; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
1693; SSE-NEXT:    movdqa %xmm3, %xmm1
1694; SSE-NEXT:    pandn %xmm0, %xmm1
1695; SSE-NEXT:    movdqa (%rdi), %xmm0
1696; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1697; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
1698; SSE-NEXT:    pand %xmm3, %xmm0
1699; SSE-NEXT:    por %xmm0, %xmm1
1700; SSE-NEXT:    movdqa %xmm8, %xmm0
1701; SSE-NEXT:    pandn %xmm1, %xmm0
1702; SSE-NEXT:    pand %xmm8, %xmm4
1703; SSE-NEXT:    por %xmm4, %xmm0
1704; SSE-NEXT:    movdqa (%r8), %xmm1
1705; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1706; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,2,2]
1707; SSE-NEXT:    movdqa %xmm10, %xmm4
1708; SSE-NEXT:    pandn %xmm1, %xmm4
1709; SSE-NEXT:    pand %xmm10, %xmm0
1710; SSE-NEXT:    por %xmm0, %xmm4
1711; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1712; SSE-NEXT:    movdqa %xmm7, %xmm11
1713; SSE-NEXT:    movdqa %xmm7, %xmm0
1714; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15]
1715; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1716; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
1717; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
1718; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,4,7]
1719; SSE-NEXT:    movdqa {{.*#+}} xmm9 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
1720; SSE-NEXT:    movdqa %xmm9, %xmm4
1721; SSE-NEXT:    pandn %xmm0, %xmm4
1722; SSE-NEXT:    movdqa %xmm15, %xmm6
1723; SSE-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1724; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3]
1725; SSE-NEXT:    pand %xmm9, %xmm0
1726; SSE-NEXT:    por %xmm0, %xmm4
1727; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255]
1728; SSE-NEXT:    movdqa %xmm8, %xmm0
1729; SSE-NEXT:    pandn %xmm4, %xmm0
1730; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
1731; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,5,6,6,7]
1732; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,2,2,2]
1733; SSE-NEXT:    movdqa %xmm3, %xmm5
1734; SSE-NEXT:    pandn %xmm4, %xmm5
1735; SSE-NEXT:    movdqa %xmm12, %xmm15
1736; SSE-NEXT:    punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm12[8],xmm15[9],xmm12[9],xmm15[10],xmm12[10],xmm15[11],xmm12[11],xmm15[12],xmm12[12],xmm15[13],xmm12[13],xmm15[14],xmm12[14],xmm15[15],xmm12[15]
1737; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm15[0,1,2,1]
1738; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[1,1,2,3,4,5,6,7]
1739; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,4]
1740; SSE-NEXT:    pand %xmm3, %xmm4
1741; SSE-NEXT:    por %xmm5, %xmm4
1742; SSE-NEXT:    pand %xmm8, %xmm4
1743; SSE-NEXT:    por %xmm0, %xmm4
1744; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1745; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2]
1746; SSE-NEXT:    movdqa %xmm13, %xmm1
1747; SSE-NEXT:    pandn %xmm0, %xmm1
1748; SSE-NEXT:    pand %xmm13, %xmm4
1749; SSE-NEXT:    por %xmm4, %xmm1
1750; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1751; SSE-NEXT:    punpcklbw {{.*#+}} xmm12 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1752; SSE-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1753; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm12[2,1,2,3]
1754; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,0,4,5,6,7]
1755; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
1756; SSE-NEXT:    movdqa %xmm10, %xmm4
1757; SSE-NEXT:    pandn %xmm0, %xmm4
1758; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm7[1,2,2,3,4,5,6,7]
1759; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1760; SSE-NEXT:    pand %xmm10, %xmm0
1761; SSE-NEXT:    por %xmm0, %xmm4
1762; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255]
1763; SSE-NEXT:    movdqa %xmm1, %xmm5
1764; SSE-NEXT:    pandn %xmm4, %xmm5
1765; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[1,1,1,1]
1766; SSE-NEXT:    movdqa %xmm3, %xmm7
1767; SSE-NEXT:    pandn %xmm4, %xmm7
1768; SSE-NEXT:    movdqa %xmm11, %xmm0
1769; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7]
1770; SSE-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
1771; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,1,2,3]
1772; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,1,0,3,4,5,6,7]
1773; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,6]
1774; SSE-NEXT:    pand %xmm3, %xmm4
1775; SSE-NEXT:    por %xmm7, %xmm4
1776; SSE-NEXT:    pand %xmm1, %xmm4
1777; SSE-NEXT:    por %xmm5, %xmm4
1778; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm14[0,0,1,1]
1779; SSE-NEXT:    movdqa %xmm9, %xmm0
1780; SSE-NEXT:    pandn %xmm5, %xmm0
1781; SSE-NEXT:    pand %xmm9, %xmm4
1782; SSE-NEXT:    por %xmm4, %xmm0
1783; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1784; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
1785; SSE-NEXT:    movdqa %xmm11, %xmm0
1786; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm11[8],xmm0[9],xmm11[9],xmm0[10],xmm11[10],xmm0[11],xmm11[11],xmm0[12],xmm11[12],xmm0[13],xmm11[13],xmm0[14],xmm11[14],xmm0[15],xmm11[15]
1787; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1788; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[0,1,2,1]
1789; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7]
1790; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,4,7]
1791; SSE-NEXT:    movdqa %xmm9, %xmm5
1792; SSE-NEXT:    pandn %xmm4, %xmm5
1793; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
1794; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[2,2,3,3]
1795; SSE-NEXT:    pand %xmm9, %xmm4
1796; SSE-NEXT:    por %xmm4, %xmm5
1797; SSE-NEXT:    movdqa %xmm8, %xmm4
1798; SSE-NEXT:    pandn %xmm5, %xmm4
1799; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1800; SSE-NEXT:    pshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,5,6,6,7]
1801; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2]
1802; SSE-NEXT:    movdqa %xmm3, %xmm7
1803; SSE-NEXT:    pandn %xmm5, %xmm7
1804; SSE-NEXT:    movdqa %xmm2, %xmm12
1805; SSE-NEXT:    punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm2[8],xmm12[9],xmm2[9],xmm12[10],xmm2[10],xmm12[11],xmm2[11],xmm12[12],xmm2[12],xmm12[13],xmm2[13],xmm12[14],xmm2[14],xmm12[15],xmm2[15]
1806; SSE-NEXT:    pshufd {{.*#+}} xmm14 = xmm12[0,1,2,1]
1807; SSE-NEXT:    pshuflw {{.*#+}} xmm14 = xmm14[1,1,2,3,4,5,6,7]
1808; SSE-NEXT:    pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,7,6,4]
1809; SSE-NEXT:    pand %xmm3, %xmm14
1810; SSE-NEXT:    por %xmm7, %xmm14
1811; SSE-NEXT:    pand %xmm8, %xmm14
1812; SSE-NEXT:    por %xmm4, %xmm14
1813; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
1814; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[2,2,2,2]
1815; SSE-NEXT:    movdqa %xmm13, %xmm7
1816; SSE-NEXT:    pandn %xmm4, %xmm7
1817; SSE-NEXT:    pand %xmm13, %xmm14
1818; SSE-NEXT:    por %xmm14, %xmm7
1819; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1820; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1821; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1822; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[2,1,2,3]
1823; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,0,4,5,6,7]
1824; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
1825; SSE-NEXT:    movdqa %xmm10, %xmm14
1826; SSE-NEXT:    pandn %xmm4, %xmm14
1827; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[1,2,2,3,4,5,6,7]
1828; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
1829; SSE-NEXT:    pand %xmm10, %xmm4
1830; SSE-NEXT:    por %xmm4, %xmm14
1831; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[1,1,1,1]
1832; SSE-NEXT:    movdqa %xmm3, %xmm2
1833; SSE-NEXT:    pandn %xmm4, %xmm2
1834; SSE-NEXT:    punpcklbw {{.*#+}} xmm11 = xmm11[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1835; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1836; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm11[2,1,2,3]
1837; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,1,0,3,4,5,6,7]
1838; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,6]
1839; SSE-NEXT:    pand %xmm3, %xmm4
1840; SSE-NEXT:    por %xmm2, %xmm4
1841; SSE-NEXT:    pand %xmm1, %xmm4
1842; SSE-NEXT:    pandn %xmm14, %xmm1
1843; SSE-NEXT:    por %xmm4, %xmm1
1844; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm5[0,0,1,1]
1845; SSE-NEXT:    movdqa %xmm9, %xmm7
1846; SSE-NEXT:    pandn %xmm2, %xmm7
1847; SSE-NEXT:    pand %xmm9, %xmm1
1848; SSE-NEXT:    por %xmm1, %xmm7
1849; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,5,7,6,7]
1850; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,3,2]
1851; SSE-NEXT:    movdqa %xmm3, %xmm2
1852; SSE-NEXT:    pandn %xmm0, %xmm2
1853; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
1854; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,7,6,6,7]
1855; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
1856; SSE-NEXT:    pand %xmm3, %xmm0
1857; SSE-NEXT:    por %xmm0, %xmm2
1858; SSE-NEXT:    movdqa {{.*#+}} xmm6 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255]
1859; SSE-NEXT:    movdqa %xmm6, %xmm0
1860; SSE-NEXT:    pandn %xmm2, %xmm0
1861; SSE-NEXT:    pshufhw $167, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
1862; SSE-NEXT:    # xmm2 = mem[0,1,2,3,7,5,6,6]
1863; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,2]
1864; SSE-NEXT:    movdqa %xmm10, %xmm14
1865; SSE-NEXT:    pandn %xmm2, %xmm14
1866; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
1867; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[3,3,3,3]
1868; SSE-NEXT:    pand %xmm10, %xmm2
1869; SSE-NEXT:    por %xmm2, %xmm14
1870; SSE-NEXT:    pand %xmm6, %xmm14
1871; SSE-NEXT:    por %xmm0, %xmm14
1872; SSE-NEXT:    movdqa {{.*#+}} xmm11 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0]
1873; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1874; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
1875; SSE-NEXT:    movdqa %xmm11, %xmm15
1876; SSE-NEXT:    pandn %xmm0, %xmm15
1877; SSE-NEXT:    pand %xmm11, %xmm14
1878; SSE-NEXT:    por %xmm14, %xmm15
1879; SSE-NEXT:    pshuflw $225, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1880; SSE-NEXT:    # xmm0 = mem[1,0,2,3,4,5,6,7]
1881; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1882; SSE-NEXT:    movdqa %xmm9, %xmm2
1883; SSE-NEXT:    pandn %xmm0, %xmm2
1884; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm5[1,0,2,3,4,5,6,7]
1885; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1886; SSE-NEXT:    pand %xmm9, %xmm0
1887; SSE-NEXT:    por %xmm0, %xmm2
1888; SSE-NEXT:    movdqa %xmm8, %xmm0
1889; SSE-NEXT:    pandn %xmm2, %xmm0
1890; SSE-NEXT:    pshuflw $164, (%rsp), %xmm2 # 16-byte Folded Reload
1891; SSE-NEXT:    # xmm2 = mem[0,1,2,2,4,5,6,7]
1892; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,1,3]
1893; SSE-NEXT:    movdqa %xmm13, %xmm14
1894; SSE-NEXT:    pandn %xmm2, %xmm14
1895; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[0,0,0,0]
1896; SSE-NEXT:    pand %xmm13, %xmm2
1897; SSE-NEXT:    por %xmm2, %xmm14
1898; SSE-NEXT:    pand %xmm8, %xmm14
1899; SSE-NEXT:    por %xmm0, %xmm14
1900; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0]
1901; SSE-NEXT:    movdqa %xmm3, %xmm0
1902; SSE-NEXT:    pandn %xmm2, %xmm0
1903; SSE-NEXT:    pand %xmm3, %xmm14
1904; SSE-NEXT:    por %xmm14, %xmm0
1905; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,5,7,6,7]
1906; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,3,2]
1907; SSE-NEXT:    movdqa %xmm3, %xmm5
1908; SSE-NEXT:    pandn %xmm2, %xmm5
1909; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
1910; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,7,6,6,7]
1911; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2]
1912; SSE-NEXT:    pand %xmm3, %xmm2
1913; SSE-NEXT:    por %xmm2, %xmm5
1914; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
1915; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm14[3,3,3,3]
1916; SSE-NEXT:    pand %xmm10, %xmm2
1917; SSE-NEXT:    pshufhw $167, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
1918; SSE-NEXT:    # xmm12 = mem[0,1,2,3,7,5,6,6]
1919; SSE-NEXT:    pshufd {{.*#+}} xmm12 = xmm12[2,3,2,2]
1920; SSE-NEXT:    pandn %xmm12, %xmm10
1921; SSE-NEXT:    por %xmm2, %xmm10
1922; SSE-NEXT:    movdqa %xmm6, %xmm1
1923; SSE-NEXT:    pand %xmm6, %xmm10
1924; SSE-NEXT:    pandn %xmm5, %xmm1
1925; SSE-NEXT:    por %xmm10, %xmm1
1926; SSE-NEXT:    pand %xmm11, %xmm1
1927; SSE-NEXT:    movdqa %xmm1, %xmm5
1928; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1929; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
1930; SSE-NEXT:    pandn %xmm2, %xmm11
1931; SSE-NEXT:    por %xmm5, %xmm11
1932; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm4[1,0,2,3,4,5,6,7]
1933; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1934; SSE-NEXT:    pand %xmm9, %xmm2
1935; SSE-NEXT:    pshuflw $225, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
1936; SSE-NEXT:    # xmm5 = mem[1,0,2,3,4,5,6,7]
1937; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,1,0,1]
1938; SSE-NEXT:    pandn %xmm5, %xmm9
1939; SSE-NEXT:    por %xmm2, %xmm9
1940; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm14[0,0,0,0]
1941; SSE-NEXT:    pand %xmm13, %xmm2
1942; SSE-NEXT:    pshuflw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
1943; SSE-NEXT:    # xmm5 = mem[0,1,2,2,4,5,6,7]
1944; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,0,1,3]
1945; SSE-NEXT:    pandn %xmm5, %xmm13
1946; SSE-NEXT:    por %xmm2, %xmm13
1947; SSE-NEXT:    pand %xmm8, %xmm13
1948; SSE-NEXT:    pandn %xmm9, %xmm8
1949; SSE-NEXT:    por %xmm13, %xmm8
1950; SSE-NEXT:    pand %xmm3, %xmm8
1951; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0]
1952; SSE-NEXT:    pandn %xmm2, %xmm3
1953; SSE-NEXT:    por %xmm8, %xmm3
1954; SSE-NEXT:    movdqa %xmm3, (%r9)
1955; SSE-NEXT:    movdqa %xmm11, 64(%r9)
1956; SSE-NEXT:    movdqa %xmm0, 80(%r9)
1957; SSE-NEXT:    movdqa %xmm15, 144(%r9)
1958; SSE-NEXT:    movdqa %xmm7, 16(%r9)
1959; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1960; SSE-NEXT:    movaps %xmm0, 48(%r9)
1961; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1962; SSE-NEXT:    movaps %xmm0, 96(%r9)
1963; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1964; SSE-NEXT:    movaps %xmm0, 128(%r9)
1965; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1966; SSE-NEXT:    movaps %xmm0, 32(%r9)
1967; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1968; SSE-NEXT:    movaps %xmm0, 112(%r9)
1969; SSE-NEXT:    addq $152, %rsp
1970; SSE-NEXT:    retq
1971;
1972; AVX-LABEL: store_i8_stride5_vf32:
1973; AVX:       # %bb.0:
1974; AVX-NEXT:    vmovdqa 16(%rsi), %xmm10
1975; AVX-NEXT:    vmovdqa 16(%rdi), %xmm11
1976; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15]
1977; AVX-NEXT:    vmovddup {{.*#+}} xmm5 = [0,10,11,14,15,0,12,13,0,10,11,14,15,0,12,13]
1978; AVX-NEXT:    # xmm5 = mem[0,0]
1979; AVX-NEXT:    vpshufb %xmm5, %xmm0, %xmm1
1980; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,4,5,u,u,u,6,7,u,u,u,8,9,u,u]
1981; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1982; AVX-NEXT:    vmovaps {{.*#+}} ymm7 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255]
1983; AVX-NEXT:    vandnps %ymm0, %ymm7, %ymm2
1984; AVX-NEXT:    vmovdqa 16(%rcx), %xmm0
1985; AVX-NEXT:    vmovdqa 16(%rdx), %xmm1
1986; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1987; AVX-NEXT:    vmovddup {{.*#+}} xmm4 = [12,13,0,10,11,14,15,0,12,13,0,10,11,14,15,0]
1988; AVX-NEXT:    # xmm4 = mem[0,0]
1989; AVX-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
1990; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
1991; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[2,u,u,u,5,4,u,u,u,7,6,u,u,u,9,8]
1992; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm6, %ymm3
1993; AVX-NEXT:    vandps %ymm7, %ymm3, %ymm3
1994; AVX-NEXT:    vorps %ymm2, %ymm3, %ymm2
1995; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm3
1996; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = zero,xmm3[1,2,3,4],zero,xmm3[6,7,8,9],zero,xmm3[11,12,13,14],zero
1997; AVX-NEXT:    vmovdqa 16(%r8), %xmm15
1998; AVX-NEXT:    vmovdqa {{.*#+}} xmm12 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15]
1999; AVX-NEXT:    vpshufb %xmm12, %xmm15, %xmm6
2000; AVX-NEXT:    vpor %xmm6, %xmm3, %xmm3
2001; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2002; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0],zero,xmm2[2,3,4,5],zero,xmm2[7,8,9,10],zero,xmm2[12,13,14,15]
2003; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = zero,xmm15[9],zero,zero,zero,zero,xmm15[10],zero,zero,zero,zero,xmm15[11],zero,zero,zero,zero
2004; AVX-NEXT:    vpor %xmm3, %xmm2, %xmm2
2005; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2006; AVX-NEXT:    vmovdqa (%rcx), %xmm7
2007; AVX-NEXT:    vmovdqa (%rdx), %xmm8
2008; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15]
2009; AVX-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
2010; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2011; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm6[u,u,0,1,u,u,u,2,3,u,u,u,4,5,u,u]
2012; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm14
2013; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
2014; AVX-NEXT:    vpshufb {{.*#+}} xmm13 = xmm2[0,1,u,u,u,2,3,u,u,u,4,5,u,u,u,6]
2015; AVX-NEXT:    vmovdqa (%rsi), %xmm2
2016; AVX-NEXT:    vmovdqa (%rdi), %xmm3
2017; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
2018; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2019; AVX-NEXT:    vpshufb %xmm5, %xmm4, %xmm5
2020; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm5, %ymm5
2021; AVX-NEXT:    vmovaps {{.*#+}} ymm13 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255]
2022; AVX-NEXT:    vandnps %ymm14, %ymm13, %ymm14
2023; AVX-NEXT:    vandps %ymm5, %ymm13, %ymm5
2024; AVX-NEXT:    vorps %ymm5, %ymm14, %ymm5
2025; AVX-NEXT:    vextractf128 $1, %ymm5, %xmm14
2026; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[0,1,2,3],zero,xmm14[5,6,7,8],zero,xmm14[10,11,12,13],zero,xmm14[15]
2027; AVX-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm15[0],zero,zero,zero,zero,xmm15[1],zero,zero,zero,zero,xmm15[2],zero
2028; AVX-NEXT:    vpor %xmm9, %xmm14, %xmm4
2029; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2030; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = zero,xmm5[1,2,3,4],zero,xmm5[6,7,8,9],zero,xmm5[11,12,13,14],zero
2031; AVX-NEXT:    vmovdqa (%r8), %xmm14
2032; AVX-NEXT:    vpshufb %xmm12, %xmm14, %xmm9
2033; AVX-NEXT:    vpor %xmm5, %xmm9, %xmm4
2034; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2035; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,xmm0[6,u,u,u],zero,xmm0[7,u,u,u],zero,xmm0[8,u,u,u],zero
2036; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[6],zero,xmm1[u,u,u,7],zero,xmm1[u,u,u,8],zero,xmm1[u,u,u,9]
2037; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm1
2038; AVX-NEXT:    vmovddup {{.*#+}} xmm9 = [0,6,7,10,11,0,8,9,0,6,7,10,11,0,8,9]
2039; AVX-NEXT:    # xmm9 = mem[0,0]
2040; AVX-NEXT:    vpshufb %xmm9, %xmm6, %xmm5
2041; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm5, %ymm1
2042; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm10[u,u,u],zero,xmm10[7,u,u,u],zero,xmm10[8,u,u,u],zero,xmm10[9,u]
2043; AVX-NEXT:    vmovddup {{.*#+}} xmm6 = [8,128,0,7,128,9,128,0,8,128,0,7,128,9,128,0]
2044; AVX-NEXT:    # xmm6 = mem[0,0]
2045; AVX-NEXT:    vpshufb %xmm6, %xmm11, %xmm12
2046; AVX-NEXT:    vpor %xmm5, %xmm12, %xmm5
2047; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7]
2048; AVX-NEXT:    vmovddup {{.*#+}} xmm12 = [6,11,10,0,9,8,13,12,6,11,10,0,9,8,13,12]
2049; AVX-NEXT:    # xmm12 = mem[0,0]
2050; AVX-NEXT:    vpshufb %xmm12, %xmm10, %xmm10
2051; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm10, %ymm5
2052; AVX-NEXT:    vmovaps {{.*#+}} ymm10 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0]
2053; AVX-NEXT:    vandnps %ymm1, %ymm10, %ymm1
2054; AVX-NEXT:    vandps %ymm5, %ymm10, %ymm5
2055; AVX-NEXT:    vorps %ymm1, %ymm5, %ymm5
2056; AVX-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,2,128,4,5,6,7,128,9,10,11,12,128,14,15]
2057; AVX-NEXT:    vpshufb %xmm10, %xmm5, %xmm1
2058; AVX-NEXT:    vmovdqa {{.*#+}} xmm4 = [128,128,128,3,128,128,128,128,4,128,128,128,128,5,128,128]
2059; AVX-NEXT:    vpshufb %xmm4, %xmm15, %xmm11
2060; AVX-NEXT:    vpor %xmm1, %xmm11, %xmm1
2061; AVX-NEXT:    vextractf128 $1, %ymm5, %xmm5
2062; AVX-NEXT:    vmovdqa {{.*#+}} xmm11 = [0,1,128,3,4,5,6,128,8,9,10,11,128,13,14,15]
2063; AVX-NEXT:    vpshufb %xmm11, %xmm5, %xmm0
2064; AVX-NEXT:    vmovdqa {{.*#+}} xmm5 = [128,128,6,128,128,128,128,7,128,128,128,128,8,128,128,128]
2065; AVX-NEXT:    vpshufb %xmm5, %xmm15, %xmm15
2066; AVX-NEXT:    vpor %xmm0, %xmm15, %xmm15
2067; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
2068; AVX-NEXT:    vpshufb %xmm9, %xmm0, %xmm9
2069; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,0,1,u,u,u,2,3,u,u,u,4,5,u,u]
2070; AVX-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm0
2071; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
2072; AVX-NEXT:    vpshufb %xmm12, %xmm9, %xmm9
2073; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
2074; AVX-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[0,1,u,u,u,2,3,u,u,u,4,5,u,u,u,6]
2075; AVX-NEXT:    vinsertf128 $1, %xmm9, %ymm12, %ymm9
2076; AVX-NEXT:    vmovaps {{.*#+}} ymm12 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255]
2077; AVX-NEXT:    vandnps %ymm0, %ymm12, %ymm0
2078; AVX-NEXT:    vandps %ymm12, %ymm9, %ymm9
2079; AVX-NEXT:    vorps %ymm0, %ymm9, %ymm9
2080; AVX-NEXT:    vextractf128 $1, %ymm9, %xmm0
2081; AVX-NEXT:    vpshufb %xmm10, %xmm0, %xmm0
2082; AVX-NEXT:    vpshufb %xmm4, %xmm14, %xmm4
2083; AVX-NEXT:    vpor %xmm4, %xmm0, %xmm0
2084; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm9[0,1,2,3],zero,xmm9[5,6,7,8],zero,xmm9[10,11,12,13],zero,xmm9[15]
2085; AVX-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm14[0],zero,zero,zero,zero,xmm14[1],zero,zero,zero,zero,xmm14[2],zero
2086; AVX-NEXT:    vpor %xmm4, %xmm9, %xmm4
2087; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,xmm2[7,u,u,u],zero,xmm2[8,u,u,u],zero,xmm2[9,u]
2088; AVX-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
2089; AVX-NEXT:    vpor %xmm2, %xmm3, %xmm2
2090; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
2091; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,4,5,u,u,u,6,7,u,u,u,8,9,u,u]
2092; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
2093; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = zero,xmm7[6,u,u,u],zero,xmm7[7,u,u,u],zero,xmm7[8,u,u,u],zero
2094; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm8[6],zero,xmm8[u,u,u,7],zero,xmm8[u,u,u,8],zero,xmm8[u,u,u,9]
2095; AVX-NEXT:    vpor %xmm3, %xmm6, %xmm3
2096; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15]
2097; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[2,u,u,u,5,4,u,u,u,7,6,u,u,u,9,8]
2098; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm3, %ymm3
2099; AVX-NEXT:    vandnps %ymm2, %ymm13, %ymm2
2100; AVX-NEXT:    vandps %ymm3, %ymm13, %ymm3
2101; AVX-NEXT:    vorps %ymm2, %ymm3, %ymm2
2102; AVX-NEXT:    vpshufb %xmm11, %xmm2, %xmm3
2103; AVX-NEXT:    vpshufb %xmm5, %xmm14, %xmm5
2104; AVX-NEXT:    vpor %xmm5, %xmm3, %xmm3
2105; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
2106; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0],zero,xmm2[2,3,4,5],zero,xmm2[7,8,9,10],zero,xmm2[12,13,14,15]
2107; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = zero,xmm14[9],zero,zero,zero,zero,xmm14[10],zero,zero,zero,zero,xmm14[11],zero,zero,zero,zero
2108; AVX-NEXT:    vpor %xmm5, %xmm2, %xmm2
2109; AVX-NEXT:    vmovdqa %xmm2, 48(%r9)
2110; AVX-NEXT:    vmovdqa %xmm3, 32(%r9)
2111; AVX-NEXT:    vmovdqa %xmm4, (%r9)
2112; AVX-NEXT:    vmovdqa %xmm0, 16(%r9)
2113; AVX-NEXT:    vmovdqa %xmm15, 112(%r9)
2114; AVX-NEXT:    vmovdqa %xmm1, 96(%r9)
2115; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2116; AVX-NEXT:    vmovaps %xmm0, 64(%r9)
2117; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2118; AVX-NEXT:    vmovaps %xmm0, 80(%r9)
2119; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2120; AVX-NEXT:    vmovaps %xmm0, 128(%r9)
2121; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2122; AVX-NEXT:    vmovaps %xmm0, 144(%r9)
2123; AVX-NEXT:    vzeroupper
2124; AVX-NEXT:    retq
2125;
2126; AVX2-LABEL: store_i8_stride5_vf32:
2127; AVX2:       # %bb.0:
2128; AVX2-NEXT:    vmovdqa (%rdi), %ymm3
2129; AVX2-NEXT:    vmovdqa (%rsi), %ymm4
2130; AVX2-NEXT:    vmovdqa (%rdx), %ymm1
2131; AVX2-NEXT:    vmovdqa (%rcx), %ymm2
2132; AVX2-NEXT:    vmovdqa (%r8), %ymm0
2133; AVX2-NEXT:    vpshufb {{.*#+}} ymm5 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,u,27,u,u,26,u,28,u,30,u,u,29,u,31,u]
2134; AVX2-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm1[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14]
2135; AVX2-NEXT:    vpshufd {{.*#+}} ymm6 = ymm6[2,2,3,3,6,6,7,7]
2136; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [255,0,255,0,0,255,0,255,0,255,0,0,255,0,255,0,255,0,255,0,0,255,0,255,0,255,0,0,255,0,255,0]
2137; AVX2-NEXT:    # ymm7 = mem[0,1,0,1]
2138; AVX2-NEXT:    vpblendvb %ymm7, %ymm5, %ymm6, %ymm5
2139; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3]
2140; AVX2-NEXT:    vpshufb {{.*#+}} ymm6 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30]
2141; AVX2-NEXT:    vpshufhw {{.*#+}} ymm7 = ymm3[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15]
2142; AVX2-NEXT:    vpshufd {{.*#+}} ymm7 = ymm7[2,2,3,3,6,6,7,7]
2143; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [255,0,0,255,0,255,0,0,0,0,255,0,255,0,0,255,255,0,0,255,0,255,0,0,0,0,255,0,255,0,0,255]
2144; AVX2-NEXT:    # ymm8 = mem[0,1,0,1]
2145; AVX2-NEXT:    vpblendvb %ymm8, %ymm6, %ymm7, %ymm6
2146; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3]
2147; AVX2-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u]
2148; AVX2-NEXT:    vpblendvb %ymm7, %ymm5, %ymm6, %ymm5
2149; AVX2-NEXT:    vpshufd {{.*#+}} ymm6 = ymm0[2,2,3,3,6,6,7,7]
2150; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3]
2151; AVX2-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0]
2152; AVX2-NEXT:    vpblendvb %ymm7, %ymm5, %ymm6, %ymm5
2153; AVX2-NEXT:    vmovdqa (%rsi), %xmm7
2154; AVX2-NEXT:    vmovdqa (%rdi), %xmm8
2155; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
2156; AVX2-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
2157; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1]
2158; AVX2-NEXT:    vmovdqa (%rdx), %xmm9
2159; AVX2-NEXT:    vmovdqa (%rcx), %xmm10
2160; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
2161; AVX2-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
2162; AVX2-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,0,1,1]
2163; AVX2-NEXT:    vmovdqa {{.*#+}} ymm12 = [255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255]
2164; AVX2-NEXT:    vpblendvb %ymm12, %ymm6, %ymm11, %ymm6
2165; AVX2-NEXT:    vmovdqa (%r8), %xmm11
2166; AVX2-NEXT:    vpshufd {{.*#+}} xmm12 = xmm11[0,0,1,1]
2167; AVX2-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1]
2168; AVX2-NEXT:    vmovdqa {{.*#+}} ymm13 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
2169; AVX2-NEXT:    vpblendvb %ymm13, %ymm6, %ymm12, %ymm6
2170; AVX2-NEXT:    vpshufb {{.*#+}} xmm10 = zero,xmm10[6],zero,xmm10[8,u],zero,xmm10[7],zero,xmm10[9],zero,xmm10[11,u],zero,xmm10[10],zero,xmm10[12]
2171; AVX2-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[6],zero,xmm9[8],zero,xmm9[u,7],zero,xmm9[9],zero,xmm9[11],zero,xmm9[u,10],zero,xmm9[12],zero
2172; AVX2-NEXT:    vpor %xmm10, %xmm9, %xmm9
2173; AVX2-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1]
2174; AVX2-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[8],zero,xmm8[u,7],zero,xmm8[9],zero,xmm8[u],zero,xmm8[u,10],zero,xmm8[12],zero,xmm8[u,11]
2175; AVX2-NEXT:    vpshufb {{.*#+}} xmm7 = zero,xmm7[8,u],zero,xmm7[7],zero,xmm7[9,u,11,u],zero,xmm7[10],zero,xmm7[12,u],zero
2176; AVX2-NEXT:    vpor %xmm7, %xmm8, %xmm7
2177; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,0,1,1]
2178; AVX2-NEXT:    vmovdqa {{.*#+}} ymm8 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255]
2179; AVX2-NEXT:    vpblendvb %ymm8, %ymm9, %ymm7, %ymm7
2180; AVX2-NEXT:    vpshufd {{.*#+}} xmm8 = xmm11[1,1,2,2]
2181; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,1,1,1]
2182; AVX2-NEXT:    vmovdqa {{.*#+}} ymm9 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
2183; AVX2-NEXT:    vpblendvb %ymm9, %ymm7, %ymm8, %ymm7
2184; AVX2-NEXT:    vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[21],zero,ymm3[19,20],zero,ymm3[22],zero,ymm3[24],zero,ymm3[22,23],zero,ymm3[25],zero,ymm3[23]
2185; AVX2-NEXT:    vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero
2186; AVX2-NEXT:    vpor %ymm8, %ymm9, %ymm8
2187; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3]
2188; AVX2-NEXT:    vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero
2189; AVX2-NEXT:    vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[19],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25]
2190; AVX2-NEXT:    vpor %ymm9, %ymm10, %ymm9
2191; AVX2-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3]
2192; AVX2-NEXT:    vmovdqa {{.*#+}} ymm10 = [255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0]
2193; AVX2-NEXT:    vpblendvb %ymm10, %ymm8, %ymm9, %ymm8
2194; AVX2-NEXT:    vpshufd {{.*#+}} ymm9 = ymm0[0,2,1,1,4,6,5,5]
2195; AVX2-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,3,3,2]
2196; AVX2-NEXT:    vmovdqa {{.*#+}} ymm10 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255]
2197; AVX2-NEXT:    vpblendvb %ymm10, %ymm8, %ymm9, %ymm8
2198; AVX2-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [3,3,3,0,4,4,4,4]
2199; AVX2-NEXT:    vpermd %ymm3, %ymm9, %ymm3
2200; AVX2-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u]
2201; AVX2-NEXT:    vmovdqa {{.*#+}} ymm9 = [u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255]
2202; AVX2-NEXT:    vpblendvb %ymm9, %ymm3, %ymm4, %ymm3
2203; AVX2-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[13],zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,ymm2[18],zero,zero
2204; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm1[13],zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,ymm1[18],zero,zero,zero
2205; AVX2-NEXT:    vpor %ymm2, %ymm1, %ymm1
2206; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255]
2207; AVX2-NEXT:    vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
2208; AVX2-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [3,3,3,3,0,4,4,4]
2209; AVX2-NEXT:    vpermd %ymm0, %ymm2, %ymm0
2210; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
2211; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
2212; AVX2-NEXT:    vmovdqa %ymm0, 64(%r9)
2213; AVX2-NEXT:    vmovdqa %ymm8, 96(%r9)
2214; AVX2-NEXT:    vmovdqa %ymm5, 128(%r9)
2215; AVX2-NEXT:    vmovdqa %ymm7, 32(%r9)
2216; AVX2-NEXT:    vmovdqa %ymm6, (%r9)
2217; AVX2-NEXT:    vzeroupper
2218; AVX2-NEXT:    retq
2219;
2220; AVX2-FP-LABEL: store_i8_stride5_vf32:
2221; AVX2-FP:       # %bb.0:
2222; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm3
2223; AVX2-FP-NEXT:    vmovdqa (%rsi), %ymm4
2224; AVX2-FP-NEXT:    vmovdqa (%rdx), %ymm1
2225; AVX2-FP-NEXT:    vmovdqa (%rcx), %ymm2
2226; AVX2-FP-NEXT:    vmovdqa (%r8), %ymm0
2227; AVX2-FP-NEXT:    vmovdqa (%rsi), %xmm6
2228; AVX2-FP-NEXT:    vmovdqa (%rdi), %xmm7
2229; AVX2-FP-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
2230; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
2231; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1]
2232; AVX2-FP-NEXT:    vmovdqa (%rdx), %xmm8
2233; AVX2-FP-NEXT:    vmovdqa (%rcx), %xmm9
2234; AVX2-FP-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
2235; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
2236; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1]
2237; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm11 = [255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255]
2238; AVX2-FP-NEXT:    vpblendvb %ymm11, %ymm5, %ymm10, %ymm5
2239; AVX2-FP-NEXT:    vmovdqa (%r8), %xmm10
2240; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm10[0,0,1,1]
2241; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1]
2242; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm12 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
2243; AVX2-FP-NEXT:    vpblendvb %ymm12, %ymm5, %ymm11, %ymm5
2244; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm9 = zero,xmm9[6],zero,xmm9[8,u],zero,xmm9[7],zero,xmm9[9],zero,xmm9[11,u],zero,xmm9[10],zero,xmm9[12]
2245; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[6],zero,xmm8[8],zero,xmm8[u,7],zero,xmm8[9],zero,xmm8[11],zero,xmm8[u,10],zero,xmm8[12],zero
2246; AVX2-FP-NEXT:    vpor %xmm9, %xmm8, %xmm8
2247; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1]
2248; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[8],zero,xmm7[u,7],zero,xmm7[9],zero,xmm7[u],zero,xmm7[u,10],zero,xmm7[12],zero,xmm7[u,11]
2249; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,xmm6[8,u],zero,xmm6[7],zero,xmm6[9,u,11,u],zero,xmm6[10],zero,xmm6[12,u],zero
2250; AVX2-FP-NEXT:    vpor %xmm7, %xmm6, %xmm6
2251; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1]
2252; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255]
2253; AVX2-FP-NEXT:    vpblendvb %ymm7, %ymm8, %ymm6, %ymm6
2254; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm10[1,1,2,2]
2255; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,1,1,1]
2256; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm8 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
2257; AVX2-FP-NEXT:    vpblendvb %ymm8, %ymm6, %ymm7, %ymm6
2258; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[21],zero,ymm3[19,20],zero,ymm3[22],zero,ymm3[24],zero,ymm3[22,23],zero,ymm3[25],zero,ymm3[23]
2259; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero
2260; AVX2-FP-NEXT:    vpor %ymm7, %ymm8, %ymm7
2261; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3]
2262; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero
2263; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[19],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25]
2264; AVX2-FP-NEXT:    vpor %ymm8, %ymm9, %ymm8
2265; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3]
2266; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm9 = [255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0]
2267; AVX2-FP-NEXT:    vpblendvb %ymm9, %ymm7, %ymm8, %ymm7
2268; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm0[0,2,1,1,4,6,5,5]
2269; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,3,3,2]
2270; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm9 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255]
2271; AVX2-FP-NEXT:    vpblendvb %ymm9, %ymm7, %ymm8, %ymm7
2272; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,ymm1[29,26],zero,ymm1[28],zero,ymm1[30],zero,ymm1[28,29],zero,ymm1[31],zero,ymm1[29]
2273; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm2[27],zero,zero,ymm2[26],zero,ymm2[28],zero,ymm2[30],zero,zero,ymm2[29],zero,ymm2[31],zero
2274; AVX2-FP-NEXT:    vpor %ymm8, %ymm9, %ymm8
2275; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3]
2276; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[29,26],zero,ymm3[28],zero,ymm3[26,27,28,29],zero,ymm3[31],zero,ymm3[29,30],zero
2277; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm4[26],zero,ymm4[28],zero,zero,zero,zero,ymm4[29],zero,ymm4[31],zero,zero,ymm4[30]
2278; AVX2-FP-NEXT:    vpor %ymm9, %ymm10, %ymm9
2279; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3]
2280; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm10 = [255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u]
2281; AVX2-FP-NEXT:    vpblendvb %ymm10, %ymm8, %ymm9, %ymm8
2282; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm0[2,2,3,3,6,6,7,7]
2283; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3]
2284; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm10 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0]
2285; AVX2-FP-NEXT:    vpblendvb %ymm10, %ymm8, %ymm9, %ymm8
2286; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [3,3,3,0,4,4,4,4]
2287; AVX2-FP-NEXT:    vpermd %ymm3, %ymm9, %ymm3
2288; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u]
2289; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm9 = [u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255]
2290; AVX2-FP-NEXT:    vpblendvb %ymm9, %ymm3, %ymm4, %ymm3
2291; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[13],zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,ymm2[18],zero,zero
2292; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm1[13],zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,ymm1[18],zero,zero,zero
2293; AVX2-FP-NEXT:    vpor %ymm2, %ymm1, %ymm1
2294; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255]
2295; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
2296; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [3,3,3,3,0,4,4,4]
2297; AVX2-FP-NEXT:    vpermd %ymm0, %ymm2, %ymm0
2298; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
2299; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
2300; AVX2-FP-NEXT:    vmovdqa %ymm0, 64(%r9)
2301; AVX2-FP-NEXT:    vmovdqa %ymm8, 128(%r9)
2302; AVX2-FP-NEXT:    vmovdqa %ymm7, 96(%r9)
2303; AVX2-FP-NEXT:    vmovdqa %ymm6, 32(%r9)
2304; AVX2-FP-NEXT:    vmovdqa %ymm5, (%r9)
2305; AVX2-FP-NEXT:    vzeroupper
2306; AVX2-FP-NEXT:    retq
2307;
2308; AVX2-FCP-LABEL: store_i8_stride5_vf32:
2309; AVX2-FCP:       # %bb.0:
2310; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm3
2311; AVX2-FCP-NEXT:    vmovdqa (%rsi), %ymm4
2312; AVX2-FCP-NEXT:    vmovdqa (%rdx), %ymm1
2313; AVX2-FCP-NEXT:    vmovdqa (%rcx), %ymm2
2314; AVX2-FCP-NEXT:    vmovdqa (%r8), %ymm0
2315; AVX2-FCP-NEXT:    vmovdqa (%rsi), %xmm6
2316; AVX2-FCP-NEXT:    vmovdqa (%rdi), %xmm7
2317; AVX2-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
2318; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
2319; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1]
2320; AVX2-FCP-NEXT:    vmovdqa (%rdx), %xmm8
2321; AVX2-FCP-NEXT:    vmovdqa (%rcx), %xmm9
2322; AVX2-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
2323; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
2324; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1]
2325; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255]
2326; AVX2-FCP-NEXT:    vpblendvb %ymm11, %ymm5, %ymm10, %ymm5
2327; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,0,0,1,1]
2328; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm10, %ymm10
2329; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
2330; AVX2-FCP-NEXT:    vpblendvb %ymm11, %ymm5, %ymm10, %ymm5
2331; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = zero,xmm9[6],zero,xmm9[8,u],zero,xmm9[7],zero,xmm9[9],zero,xmm9[11,u],zero,xmm9[10],zero,xmm9[12]
2332; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[6],zero,xmm8[8],zero,xmm8[u,7],zero,xmm8[9],zero,xmm8[11],zero,xmm8[u,10],zero,xmm8[12],zero
2333; AVX2-FCP-NEXT:    vpor %xmm9, %xmm8, %xmm8
2334; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1]
2335; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[8],zero,xmm7[u,7],zero,xmm7[9],zero,xmm7[u],zero,xmm7[u,10],zero,xmm7[12],zero,xmm7[u,11]
2336; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,xmm6[8,u],zero,xmm6[7],zero,xmm6[9,u,11,u],zero,xmm6[10],zero,xmm6[12,u],zero
2337; AVX2-FCP-NEXT:    vpor %xmm7, %xmm6, %xmm6
2338; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1]
2339; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255]
2340; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm8, %ymm6, %ymm6
2341; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [1,1,2,2,2,2,2,2]
2342; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm7, %ymm7
2343; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
2344; AVX2-FCP-NEXT:    vpblendvb %ymm8, %ymm6, %ymm7, %ymm6
2345; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[21],zero,ymm3[19,20],zero,ymm3[22],zero,ymm3[24],zero,ymm3[22,23],zero,ymm3[25],zero,ymm3[23]
2346; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero
2347; AVX2-FCP-NEXT:    vpor %ymm7, %ymm8, %ymm7
2348; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3]
2349; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero
2350; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[19],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25]
2351; AVX2-FCP-NEXT:    vpor %ymm8, %ymm9, %ymm8
2352; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3]
2353; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0]
2354; AVX2-FCP-NEXT:    vpblendvb %ymm9, %ymm7, %ymm8, %ymm7
2355; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [4,6,5,5,5,5,4,6]
2356; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm8, %ymm8
2357; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255]
2358; AVX2-FCP-NEXT:    vpblendvb %ymm9, %ymm7, %ymm8, %ymm7
2359; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,ymm1[29,26],zero,ymm1[28],zero,ymm1[30],zero,ymm1[28,29],zero,ymm1[31],zero,ymm1[29]
2360; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm2[27],zero,zero,ymm2[26],zero,ymm2[28],zero,ymm2[30],zero,zero,ymm2[29],zero,ymm2[31],zero
2361; AVX2-FCP-NEXT:    vpor %ymm8, %ymm9, %ymm8
2362; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3]
2363; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[29,26],zero,ymm3[28],zero,ymm3[26,27,28,29],zero,ymm3[31],zero,ymm3[29,30],zero
2364; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm4[26],zero,ymm4[28],zero,zero,zero,zero,ymm4[29],zero,ymm4[31],zero,zero,ymm4[30]
2365; AVX2-FCP-NEXT:    vpor %ymm9, %ymm10, %ymm9
2366; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3]
2367; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u]
2368; AVX2-FCP-NEXT:    vpblendvb %ymm10, %ymm8, %ymm9, %ymm8
2369; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [6,6,6,6,7,7,7,7]
2370; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm9, %ymm9
2371; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0]
2372; AVX2-FCP-NEXT:    vpblendvb %ymm10, %ymm8, %ymm9, %ymm8
2373; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [3,3,3,0,4,4,4,4]
2374; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm9, %ymm3
2375; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u]
2376; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255]
2377; AVX2-FCP-NEXT:    vpblendvb %ymm9, %ymm3, %ymm4, %ymm3
2378; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[13],zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,ymm2[18],zero,zero
2379; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm1[13],zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,ymm1[18],zero,zero,zero
2380; AVX2-FCP-NEXT:    vpor %ymm2, %ymm1, %ymm1
2381; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255]
2382; AVX2-FCP-NEXT:    vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
2383; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [3,3,3,3,0,4,4,4]
2384; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm2, %ymm0
2385; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
2386; AVX2-FCP-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
2387; AVX2-FCP-NEXT:    vmovdqa %ymm0, 64(%r9)
2388; AVX2-FCP-NEXT:    vmovdqa %ymm8, 128(%r9)
2389; AVX2-FCP-NEXT:    vmovdqa %ymm7, 96(%r9)
2390; AVX2-FCP-NEXT:    vmovdqa %ymm6, 32(%r9)
2391; AVX2-FCP-NEXT:    vmovdqa %ymm5, (%r9)
2392; AVX2-FCP-NEXT:    vzeroupper
2393; AVX2-FCP-NEXT:    retq
2394;
2395; AVX512-LABEL: store_i8_stride5_vf32:
2396; AVX512:       # %bb.0:
2397; AVX512-NEXT:    vmovdqa (%rdi), %ymm3
2398; AVX512-NEXT:    vmovdqa (%rsi), %ymm4
2399; AVX512-NEXT:    vmovdqa (%rdx), %ymm1
2400; AVX512-NEXT:    vmovdqa (%rcx), %ymm2
2401; AVX512-NEXT:    vmovdqa (%r8), %ymm0
2402; AVX512-NEXT:    vmovdqa (%rdi), %xmm5
2403; AVX512-NEXT:    vpshufb {{.*#+}} xmm6 = xmm5[8],zero,xmm5[u,7],zero,xmm5[9],zero,xmm5[u],zero,xmm5[u,10],zero,xmm5[12],zero,xmm5[u,11]
2404; AVX512-NEXT:    vmovdqa (%rsi), %xmm7
2405; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = zero,xmm7[8,u],zero,xmm7[7],zero,xmm7[9,u,11,u],zero,xmm7[10],zero,xmm7[12,u],zero
2406; AVX512-NEXT:    vpor %xmm6, %xmm8, %xmm6
2407; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1]
2408; AVX512-NEXT:    vmovdqa (%rcx), %xmm8
2409; AVX512-NEXT:    vpshufb {{.*#+}} xmm9 = zero,xmm8[6],zero,xmm8[8,u],zero,xmm8[7],zero,xmm8[9],zero,xmm8[11,u],zero,xmm8[10],zero,xmm8[12]
2410; AVX512-NEXT:    vmovdqa (%rdx), %xmm10
2411; AVX512-NEXT:    vpshufb {{.*#+}} xmm11 = xmm10[6],zero,xmm10[8],zero,xmm10[u,7],zero,xmm10[9],zero,xmm10[11],zero,xmm10[u,10],zero,xmm10[12],zero
2412; AVX512-NEXT:    vpor %xmm9, %xmm11, %xmm9
2413; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1]
2414; AVX512-NEXT:    vmovdqa {{.*#+}} ymm11 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255]
2415; AVX512-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm6 ^ (ymm11 & (ymm9 ^ ymm6))
2416; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm6
2417; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3],xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7]
2418; AVX512-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
2419; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1]
2420; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
2421; AVX512-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
2422; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1]
2423; AVX512-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255]
2424; AVX512-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm8 ^ (ymm7 & (ymm5 ^ ymm8))
2425; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7]
2426; AVX512-NEXT:    vmovdqa (%r8), %xmm6
2427; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0]
2428; AVX512-NEXT:    vpermd %zmm6, %zmm8, %zmm6
2429; AVX512-NEXT:    vpternlogd {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5))
2430; AVX512-NEXT:    vpshufb {{.*#+}} ymm5 = ymm2[u,u,u],zero,ymm2[13,u,u,u],zero,ymm2[14,u,u,u],zero,ymm2[15,u,u,u],zero,ymm2[16,u,u,u],zero,ymm2[17,u,u,u],zero,ymm2[18,u,u]
2431; AVX512-NEXT:    vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,13],zero,ymm1[u,u,u,14],zero,ymm1[u,u,u,15],zero,ymm1[u,u,u,16],zero,ymm1[u,u,u,17],zero,ymm1[u,u,u,18],zero,ymm1[u,u]
2432; AVX512-NEXT:    vpor %ymm5, %ymm8, %ymm5
2433; AVX512-NEXT:    vpshufb {{.*#+}} ymm8 = ymm4[u],zero,ymm4[13,u,u,u],zero,ymm4[14,u,u,u],zero,ymm4[15,u,u,u],zero,ymm4[16,u,u,u],zero,ymm4[17,u,u,u],zero,ymm4[18,u,u,u],zero
2434; AVX512-NEXT:    vpshufb {{.*#+}} ymm9 = ymm3[u,13],zero,ymm3[u,u,u,14],zero,ymm3[u,u,u,15],zero,ymm3[u,u,u,16],zero,ymm3[u,u,u,17],zero,ymm3[u,u,u,18],zero,ymm3[u,u,u,19]
2435; AVX512-NEXT:    vpor %ymm8, %ymm9, %ymm8
2436; AVX512-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm5 ^ (ymm11 & (ymm8 ^ ymm5))
2437; AVX512-NEXT:    vpshufb {{.*#+}} ymm5 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[21],zero,zero,ymm3[20],zero,ymm3[22],zero,ymm3[24],zero,zero,ymm3[23],zero,ymm3[25],zero,zero
2438; AVX512-NEXT:    vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm4[21,u],zero,ymm4[20],zero,ymm4[22],zero,ymm4[24,u],zero,ymm4[23],zero,ymm4[25,u]
2439; AVX512-NEXT:    vpor %ymm5, %ymm9, %ymm5
2440; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3]
2441; AVX512-NEXT:    vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero
2442; AVX512-NEXT:    vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm1[21],zero,ymm1[21,20],zero,ymm1[22],zero,ymm1[24],zero,ymm1[22,23],zero,ymm1[25]
2443; AVX512-NEXT:    vpor %ymm9, %ymm10, %ymm9
2444; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3]
2445; AVX512-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm9 ^ (mem & (ymm9 ^ ymm5))
2446; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm5
2447; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7]
2448; AVX512-NEXT:    vpshufb {{.*#+}} ymm8 = ymm0[12],zero,zero,zero,zero,ymm0[13],zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,ymm0[18],zero
2449; AVX512-NEXT:    vpshufd {{.*#+}} ymm9 = ymm0[0,2,1,1,4,6,5,5]
2450; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,3,3,2]
2451; AVX512-NEXT:    vmovdqa {{.*#+}} ymm10 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255]
2452; AVX512-NEXT:    vpandn %ymm9, %ymm10, %ymm9
2453; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm8, %zmm8
2454; AVX512-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm5 & mem)
2455; AVX512-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[26],zero,ymm3[28],zero,zero,ymm3[27],zero,ymm3[29],zero,ymm3[31],zero,zero,ymm3[30],zero
2456; AVX512-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u],zero,ymm4[26],zero,ymm4[28,u],zero,ymm4[u],zero,ymm4[29],zero,ymm4[31,u],zero,ymm4[30]
2457; AVX512-NEXT:    vpor %ymm3, %ymm4, %ymm3
2458; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3]
2459; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,ymm1[26],zero,ymm1[28],zero,ymm1[30],zero,zero,ymm1[29],zero,ymm1[31],zero,zero
2460; AVX512-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm2[27,u],zero,ymm2[26],zero,ymm2[28],zero,ymm2[30,u],zero,ymm2[29],zero,ymm2[31,u]
2461; AVX512-NEXT:    vpor %ymm1, %ymm2, %ymm1
2462; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3]
2463; AVX512-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm3 ^ (ymm7 & (ymm1 ^ ymm3))
2464; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
2465; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
2466; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
2467; AVX512-NEXT:    vmovdqa %ymm0, 128(%r9)
2468; AVX512-NEXT:    vmovdqa64 %zmm8, 64(%r9)
2469; AVX512-NEXT:    vmovdqa64 %zmm6, (%r9)
2470; AVX512-NEXT:    vzeroupper
2471; AVX512-NEXT:    retq
2472;
2473; AVX512-FCP-LABEL: store_i8_stride5_vf32:
2474; AVX512-FCP:       # %bb.0:
2475; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm2
2476; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm3
2477; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm0
2478; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm1
2479; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm4
2480; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm4[8],zero,xmm4[u,7],zero,xmm4[9],zero,xmm4[u],zero,xmm4[u,10],zero,xmm4[12],zero,xmm4[u,11]
2481; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm6
2482; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,xmm6[8,u],zero,xmm6[7],zero,xmm6[9,u,11,u],zero,xmm6[10],zero,xmm6[12,u],zero
2483; AVX512-FCP-NEXT:    vpor %xmm5, %xmm7, %xmm5
2484; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1]
2485; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm7
2486; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = zero,xmm7[6],zero,xmm7[8,u],zero,xmm7[7],zero,xmm7[9],zero,xmm7[11,u],zero,xmm7[10],zero,xmm7[12]
2487; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm9
2488; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm9[6],zero,xmm9[8],zero,xmm9[u,7],zero,xmm9[9],zero,xmm9[11],zero,xmm9[u,10],zero,xmm9[12],zero
2489; AVX512-FCP-NEXT:    vpor %xmm8, %xmm10, %xmm8
2490; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1]
2491; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255]
2492; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm5 ^ (ymm10 & (ymm8 ^ ymm5))
2493; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm5
2494; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7]
2495; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
2496; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,0,1,1]
2497; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
2498; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
2499; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1]
2500; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255]
2501; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm7 ^ (ymm6 & (ymm4 ^ ymm7))
2502; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm4[0,1,2,3],zmm5[4,5,6,7]
2503; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = mem[0,1,2,3,0,1,2,3]
2504; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0]
2505; AVX512-FCP-NEXT:    vpermd %zmm4, %zmm7, %zmm7
2506; AVX512-FCP-NEXT:    vpternlogd {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm5))
2507; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm1[u,u,u],zero,ymm1[13,u,u,u],zero,ymm1[14,u,u,u],zero,ymm1[15,u,u,u],zero,ymm1[16,u,u,u],zero,ymm1[17,u,u,u],zero,ymm1[18,u,u]
2508; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,13],zero,ymm0[u,u,u,14],zero,ymm0[u,u,u,15],zero,ymm0[u,u,u,16],zero,ymm0[u,u,u,17],zero,ymm0[u,u,u,18],zero,ymm0[u,u]
2509; AVX512-FCP-NEXT:    vpor %ymm5, %ymm8, %ymm5
2510; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm3[u],zero,ymm3[13,u,u,u],zero,ymm3[14,u,u,u],zero,ymm3[15,u,u,u],zero,ymm3[16,u,u,u],zero,ymm3[17,u,u,u],zero,ymm3[18,u,u,u],zero
2511; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm2[u,13],zero,ymm2[u,u,u,14],zero,ymm2[u,u,u,15],zero,ymm2[u,u,u,16],zero,ymm2[u,u,u,17],zero,ymm2[u,u,u,18],zero,ymm2[u,u,u,19]
2512; AVX512-FCP-NEXT:    vpor %ymm8, %ymm9, %ymm8
2513; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm5 ^ (ymm10 & (ymm8 ^ ymm5))
2514; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero,ymm2[25],zero,zero
2515; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm3[21,u],zero,ymm3[20],zero,ymm3[22],zero,ymm3[24,u],zero,ymm3[23],zero,ymm3[25,u]
2516; AVX512-FCP-NEXT:    vpor %ymm5, %ymm9, %ymm5
2517; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3]
2518; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[19],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero
2519; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm0[21],zero,ymm0[21,20],zero,ymm0[22],zero,ymm0[24],zero,ymm0[22,23],zero,ymm0[25]
2520; AVX512-FCP-NEXT:    vpor %ymm9, %ymm10, %ymm9
2521; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3]
2522; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm9 ^ (mem & (ymm9 ^ ymm5))
2523; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm5
2524; AVX512-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7]
2525; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [4,0,5,5,5,5,0,6]
2526; AVX512-FCP-NEXT:    vpermd %ymm4, %ymm8, %ymm8
2527; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255]
2528; AVX512-FCP-NEXT:    vpandn %ymm8, %ymm9, %ymm8
2529; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm4[12],zero,zero,zero,zero,ymm4[13],zero,zero,zero,zero,ymm4[14],zero,zero,zero,zero,ymm4[15],zero,zero,zero,zero,ymm4[16],zero,zero,zero,zero,ymm4[17],zero,zero,zero,zero,ymm4[18],zero
2530; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm9, %zmm8
2531; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm5 & mem)
2532; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[26],zero,ymm2[28],zero,zero,ymm2[27],zero,ymm2[29],zero,ymm2[31],zero,zero,ymm2[30],zero
2533; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u],zero,ymm3[26],zero,ymm3[28,u],zero,ymm3[u],zero,ymm3[29],zero,ymm3[31,u],zero,ymm3[30]
2534; AVX512-FCP-NEXT:    vpor %ymm2, %ymm3, %ymm2
2535; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3]
2536; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,zero,ymm0[26],zero,ymm0[28],zero,ymm0[30],zero,zero,ymm0[29],zero,ymm0[31],zero,zero
2537; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm1[27,u],zero,ymm1[26],zero,ymm1[28],zero,ymm1[30,u],zero,ymm1[29],zero,ymm1[31,u]
2538; AVX512-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
2539; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
2540; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm6 & (ymm0 ^ ymm2))
2541; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [6,6,6,0,7,7,7,7]
2542; AVX512-FCP-NEXT:    vpermd %ymm4, %ymm1, %ymm1
2543; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
2544; AVX512-FCP-NEXT:    vmovdqa %ymm1, 128(%r9)
2545; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, 64(%r9)
2546; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, (%r9)
2547; AVX512-FCP-NEXT:    vzeroupper
2548; AVX512-FCP-NEXT:    retq
2549;
2550; AVX512DQ-LABEL: store_i8_stride5_vf32:
2551; AVX512DQ:       # %bb.0:
2552; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm3
2553; AVX512DQ-NEXT:    vmovdqa (%rsi), %ymm4
2554; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm1
2555; AVX512DQ-NEXT:    vmovdqa (%rcx), %ymm2
2556; AVX512DQ-NEXT:    vmovdqa (%r8), %ymm0
2557; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm5
2558; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm6 = xmm5[8],zero,xmm5[u,7],zero,xmm5[9],zero,xmm5[u],zero,xmm5[u,10],zero,xmm5[12],zero,xmm5[u,11]
2559; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm7
2560; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm8 = zero,xmm7[8,u],zero,xmm7[7],zero,xmm7[9,u,11,u],zero,xmm7[10],zero,xmm7[12,u],zero
2561; AVX512DQ-NEXT:    vpor %xmm6, %xmm8, %xmm6
2562; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1]
2563; AVX512DQ-NEXT:    vmovdqa (%rcx), %xmm8
2564; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm9 = zero,xmm8[6],zero,xmm8[8,u],zero,xmm8[7],zero,xmm8[9],zero,xmm8[11,u],zero,xmm8[10],zero,xmm8[12]
2565; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm10
2566; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm11 = xmm10[6],zero,xmm10[8],zero,xmm10[u,7],zero,xmm10[9],zero,xmm10[11],zero,xmm10[u,10],zero,xmm10[12],zero
2567; AVX512DQ-NEXT:    vpor %xmm9, %xmm11, %xmm9
2568; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1]
2569; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm11 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255]
2570; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm6 ^ (ymm11 & (ymm9 ^ ymm6))
2571; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm6
2572; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3],xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7]
2573; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
2574; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1]
2575; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
2576; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
2577; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1]
2578; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255]
2579; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm5 = ymm8 ^ (ymm7 & (ymm5 ^ ymm8))
2580; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7]
2581; AVX512DQ-NEXT:    vmovdqa (%r8), %xmm6
2582; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0]
2583; AVX512DQ-NEXT:    vpermd %zmm6, %zmm8, %zmm6
2584; AVX512DQ-NEXT:    vpternlogd {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5))
2585; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm5 = ymm2[u,u,u],zero,ymm2[13,u,u,u],zero,ymm2[14,u,u,u],zero,ymm2[15,u,u,u],zero,ymm2[16,u,u,u],zero,ymm2[17,u,u,u],zero,ymm2[18,u,u]
2586; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,13],zero,ymm1[u,u,u,14],zero,ymm1[u,u,u,15],zero,ymm1[u,u,u,16],zero,ymm1[u,u,u,17],zero,ymm1[u,u,u,18],zero,ymm1[u,u]
2587; AVX512DQ-NEXT:    vpor %ymm5, %ymm8, %ymm5
2588; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm8 = ymm4[u],zero,ymm4[13,u,u,u],zero,ymm4[14,u,u,u],zero,ymm4[15,u,u,u],zero,ymm4[16,u,u,u],zero,ymm4[17,u,u,u],zero,ymm4[18,u,u,u],zero
2589; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm9 = ymm3[u,13],zero,ymm3[u,u,u,14],zero,ymm3[u,u,u,15],zero,ymm3[u,u,u,16],zero,ymm3[u,u,u,17],zero,ymm3[u,u,u,18],zero,ymm3[u,u,u,19]
2590; AVX512DQ-NEXT:    vpor %ymm8, %ymm9, %ymm8
2591; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm5 ^ (ymm11 & (ymm8 ^ ymm5))
2592; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm5 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[21],zero,zero,ymm3[20],zero,ymm3[22],zero,ymm3[24],zero,zero,ymm3[23],zero,ymm3[25],zero,zero
2593; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm4[21,u],zero,ymm4[20],zero,ymm4[22],zero,ymm4[24,u],zero,ymm4[23],zero,ymm4[25,u]
2594; AVX512DQ-NEXT:    vpor %ymm5, %ymm9, %ymm5
2595; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3]
2596; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero
2597; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm1[21],zero,ymm1[21,20],zero,ymm1[22],zero,ymm1[24],zero,ymm1[22,23],zero,ymm1[25]
2598; AVX512DQ-NEXT:    vpor %ymm9, %ymm10, %ymm9
2599; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3]
2600; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm9 ^ (mem & (ymm9 ^ ymm5))
2601; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm5
2602; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7]
2603; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm8 = ymm0[12],zero,zero,zero,zero,ymm0[13],zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,ymm0[18],zero
2604; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm9 = ymm0[0,2,1,1,4,6,5,5]
2605; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,3,3,2]
2606; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm10 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255]
2607; AVX512DQ-NEXT:    vpandn %ymm9, %ymm10, %ymm9
2608; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm8, %zmm8
2609; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm5 & mem)
2610; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[26],zero,ymm3[28],zero,zero,ymm3[27],zero,ymm3[29],zero,ymm3[31],zero,zero,ymm3[30],zero
2611; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u],zero,ymm4[26],zero,ymm4[28,u],zero,ymm4[u],zero,ymm4[29],zero,ymm4[31,u],zero,ymm4[30]
2612; AVX512DQ-NEXT:    vpor %ymm3, %ymm4, %ymm3
2613; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3]
2614; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,ymm1[26],zero,ymm1[28],zero,ymm1[30],zero,zero,ymm1[29],zero,ymm1[31],zero,zero
2615; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm2[27,u],zero,ymm2[26],zero,ymm2[28],zero,ymm2[30,u],zero,ymm2[29],zero,ymm2[31,u]
2616; AVX512DQ-NEXT:    vpor %ymm1, %ymm2, %ymm1
2617; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3]
2618; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm3 ^ (ymm7 & (ymm1 ^ ymm3))
2619; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
2620; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
2621; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
2622; AVX512DQ-NEXT:    vmovdqa %ymm0, 128(%r9)
2623; AVX512DQ-NEXT:    vmovdqa64 %zmm8, 64(%r9)
2624; AVX512DQ-NEXT:    vmovdqa64 %zmm6, (%r9)
2625; AVX512DQ-NEXT:    vzeroupper
2626; AVX512DQ-NEXT:    retq
2627;
2628; AVX512DQ-FCP-LABEL: store_i8_stride5_vf32:
2629; AVX512DQ-FCP:       # %bb.0:
2630; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm2
2631; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm3
2632; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm0
2633; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm1
2634; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm4
2635; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm4[8],zero,xmm4[u,7],zero,xmm4[9],zero,xmm4[u],zero,xmm4[u,10],zero,xmm4[12],zero,xmm4[u,11]
2636; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm6
2637; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,xmm6[8,u],zero,xmm6[7],zero,xmm6[9,u,11,u],zero,xmm6[10],zero,xmm6[12,u],zero
2638; AVX512DQ-FCP-NEXT:    vpor %xmm5, %xmm7, %xmm5
2639; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1]
2640; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm7
2641; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = zero,xmm7[6],zero,xmm7[8,u],zero,xmm7[7],zero,xmm7[9],zero,xmm7[11,u],zero,xmm7[10],zero,xmm7[12]
2642; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm9
2643; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm9[6],zero,xmm9[8],zero,xmm9[u,7],zero,xmm9[9],zero,xmm9[11],zero,xmm9[u,10],zero,xmm9[12],zero
2644; AVX512DQ-FCP-NEXT:    vpor %xmm8, %xmm10, %xmm8
2645; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1]
2646; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255]
2647; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm5 ^ (ymm10 & (ymm8 ^ ymm5))
2648; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm5
2649; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7]
2650; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
2651; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,0,1,1]
2652; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
2653; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
2654; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1]
2655; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255]
2656; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm7 ^ (ymm6 & (ymm4 ^ ymm7))
2657; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm4[0,1,2,3],zmm5[4,5,6,7]
2658; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = mem[0,1,2,3,0,1,2,3]
2659; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0]
2660; AVX512DQ-FCP-NEXT:    vpermd %zmm4, %zmm7, %zmm7
2661; AVX512DQ-FCP-NEXT:    vpternlogd {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm5))
2662; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm1[u,u,u],zero,ymm1[13,u,u,u],zero,ymm1[14,u,u,u],zero,ymm1[15,u,u,u],zero,ymm1[16,u,u,u],zero,ymm1[17,u,u,u],zero,ymm1[18,u,u]
2663; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,13],zero,ymm0[u,u,u,14],zero,ymm0[u,u,u,15],zero,ymm0[u,u,u,16],zero,ymm0[u,u,u,17],zero,ymm0[u,u,u,18],zero,ymm0[u,u]
2664; AVX512DQ-FCP-NEXT:    vpor %ymm5, %ymm8, %ymm5
2665; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm3[u],zero,ymm3[13,u,u,u],zero,ymm3[14,u,u,u],zero,ymm3[15,u,u,u],zero,ymm3[16,u,u,u],zero,ymm3[17,u,u,u],zero,ymm3[18,u,u,u],zero
2666; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm2[u,13],zero,ymm2[u,u,u,14],zero,ymm2[u,u,u,15],zero,ymm2[u,u,u,16],zero,ymm2[u,u,u,17],zero,ymm2[u,u,u,18],zero,ymm2[u,u,u,19]
2667; AVX512DQ-FCP-NEXT:    vpor %ymm8, %ymm9, %ymm8
2668; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm5 ^ (ymm10 & (ymm8 ^ ymm5))
2669; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero,ymm2[25],zero,zero
2670; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm3[21,u],zero,ymm3[20],zero,ymm3[22],zero,ymm3[24,u],zero,ymm3[23],zero,ymm3[25,u]
2671; AVX512DQ-FCP-NEXT:    vpor %ymm5, %ymm9, %ymm5
2672; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3]
2673; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[19],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero
2674; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm0[21],zero,ymm0[21,20],zero,ymm0[22],zero,ymm0[24],zero,ymm0[22,23],zero,ymm0[25]
2675; AVX512DQ-FCP-NEXT:    vpor %ymm9, %ymm10, %ymm9
2676; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3]
2677; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm9 = ymm9 ^ (mem & (ymm9 ^ ymm5))
2678; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm5
2679; AVX512DQ-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7]
2680; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [4,0,5,5,5,5,0,6]
2681; AVX512DQ-FCP-NEXT:    vpermd %ymm4, %ymm8, %ymm8
2682; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255]
2683; AVX512DQ-FCP-NEXT:    vpandn %ymm8, %ymm9, %ymm8
2684; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm4[12],zero,zero,zero,zero,ymm4[13],zero,zero,zero,zero,ymm4[14],zero,zero,zero,zero,ymm4[15],zero,zero,zero,zero,ymm4[16],zero,zero,zero,zero,ymm4[17],zero,zero,zero,zero,ymm4[18],zero
2685; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm9, %zmm8
2686; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm5 & mem)
2687; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[26],zero,ymm2[28],zero,zero,ymm2[27],zero,ymm2[29],zero,ymm2[31],zero,zero,ymm2[30],zero
2688; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u],zero,ymm3[26],zero,ymm3[28,u],zero,ymm3[u],zero,ymm3[29],zero,ymm3[31,u],zero,ymm3[30]
2689; AVX512DQ-FCP-NEXT:    vpor %ymm2, %ymm3, %ymm2
2690; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3]
2691; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,zero,ymm0[26],zero,ymm0[28],zero,ymm0[30],zero,zero,ymm0[29],zero,ymm0[31],zero,zero
2692; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm1[27,u],zero,ymm1[26],zero,ymm1[28],zero,ymm1[30,u],zero,ymm1[29],zero,ymm1[31,u]
2693; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
2694; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
2695; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm6 & (ymm0 ^ ymm2))
2696; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [6,6,6,0,7,7,7,7]
2697; AVX512DQ-FCP-NEXT:    vpermd %ymm4, %ymm1, %ymm1
2698; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
2699; AVX512DQ-FCP-NEXT:    vmovdqa %ymm1, 128(%r9)
2700; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, 64(%r9)
2701; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, (%r9)
2702; AVX512DQ-FCP-NEXT:    vzeroupper
2703; AVX512DQ-FCP-NEXT:    retq
2704;
2705; AVX512BW-LABEL: store_i8_stride5_vf32:
2706; AVX512BW:       # %bb.0:
2707; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm4
2708; AVX512BW-NEXT:    vmovdqa (%rsi), %ymm5
2709; AVX512BW-NEXT:    vmovdqa (%rdx), %ymm1
2710; AVX512BW-NEXT:    vmovdqa (%rcx), %ymm2
2711; AVX512BW-NEXT:    vmovdqa (%r8), %ymm0
2712; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm3
2713; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm6 = xmm3[8],zero,xmm3[u,7],zero,xmm3[9],zero,xmm3[u],zero,xmm3[u,10],zero,xmm3[12],zero,xmm3[u,11]
2714; AVX512BW-NEXT:    vmovdqa (%rsi), %xmm7
2715; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm8 = zero,xmm7[8,u],zero,xmm7[7],zero,xmm7[9,u,11,u],zero,xmm7[10],zero,xmm7[12,u],zero
2716; AVX512BW-NEXT:    vpor %xmm6, %xmm8, %xmm6
2717; AVX512BW-NEXT:    vmovdqa (%rdx), %xmm8
2718; AVX512BW-NEXT:    vmovdqa (%rcx), %xmm9
2719; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
2720; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
2721; AVX512BW-NEXT:    vinserti32x4 $2, %xmm6, %zmm10, %zmm6
2722; AVX512BW-NEXT:    vpermq {{.*#+}} zmm6 = zmm6[0,0,1,1,4,4,5,5]
2723; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm9 = zero,xmm9[6],zero,xmm9[8,u],zero,xmm9[7],zero,xmm9[9],zero,xmm9[11,u],zero,xmm9[10],zero,xmm9[12]
2724; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[6],zero,xmm8[8],zero,xmm8[u,7],zero,xmm8[9],zero,xmm8[11],zero,xmm8[u,10],zero,xmm8[12],zero
2725; AVX512BW-NEXT:    vpor %xmm9, %xmm8, %xmm8
2726; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
2727; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
2728; AVX512BW-NEXT:    vinserti32x4 $2, %xmm8, %zmm3, %zmm3
2729; AVX512BW-NEXT:    vpermq {{.*#+}} zmm3 = zmm3[0,0,1,1,4,4,5,5]
2730; AVX512BW-NEXT:    movabsq $3570337559743967628, %rax # imm = 0x318C631818C6318C
2731; AVX512BW-NEXT:    kmovq %rax, %k1
2732; AVX512BW-NEXT:    vmovdqu8 %zmm6, %zmm3 {%k1}
2733; AVX512BW-NEXT:    vmovdqa (%r8), %xmm6
2734; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2]
2735; AVX512BW-NEXT:    vpermd %zmm6, %zmm7, %zmm6
2736; AVX512BW-NEXT:    movabsq $595056260442243600, %rax # imm = 0x842108421084210
2737; AVX512BW-NEXT:    kmovq %rax, %k1
2738; AVX512BW-NEXT:    vmovdqu8 %zmm6, %zmm3 {%k1}
2739; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,ymm1[12,13],zero,zero,zero,zero,ymm1[14],zero,zero,zero,ymm1[14,15],zero,zero,zero,zero,ymm1[16],zero,zero,zero,ymm1[16,17],zero,zero,zero,zero,ymm1[18],zero,zero,zero
2740; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm2[13],zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,ymm2[18],zero,zero
2741; AVX512BW-NEXT:    vpor %ymm6, %ymm7, %ymm6
2742; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm7 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm1[21],zero,ymm1[21,20],zero,ymm1[22],zero,ymm1[24],zero,ymm1[22,23],zero,ymm1[25]
2743; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero
2744; AVX512BW-NEXT:    vpor %ymm7, %ymm8, %ymm7
2745; AVX512BW-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3]
2746; AVX512BW-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm6
2747; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [3,3,3,0,4,4,4,4]
2748; AVX512BW-NEXT:    vpermd %ymm4, %ymm7, %ymm7
2749; AVX512BW-NEXT:    movl $138547332, %eax # imm = 0x8421084
2750; AVX512BW-NEXT:    kmovd %eax, %k1
2751; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm7 {%k1} = ymm5[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u]
2752; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm8 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm5[21],zero,zero,ymm5[20],zero,ymm5[22],zero,ymm5[24],zero,zero,ymm5[23],zero,ymm5[25],zero
2753; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero,zero
2754; AVX512BW-NEXT:    vpor %ymm8, %ymm9, %ymm8
2755; AVX512BW-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3]
2756; AVX512BW-NEXT:    vinserti64x4 $1, %ymm8, %zmm7, %zmm7
2757; AVX512BW-NEXT:    movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318
2758; AVX512BW-NEXT:    kmovq %rax, %k1
2759; AVX512BW-NEXT:    vmovdqu8 %zmm6, %zmm7 {%k1}
2760; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [3,3,3,3,0,4,4,4]
2761; AVX512BW-NEXT:    vpermd %ymm0, %ymm6, %ymm6
2762; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm8 = ymm0[0,2,1,1,4,6,5,5]
2763; AVX512BW-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,3,3,2]
2764; AVX512BW-NEXT:    vinserti64x4 $1, %ymm8, %zmm6, %zmm6
2765; AVX512BW-NEXT:    movabsq $1190112520884487201, %rax # imm = 0x1084210842108421
2766; AVX512BW-NEXT:    kmovq %rax, %k1
2767; AVX512BW-NEXT:    vmovdqu8 %zmm6, %zmm7 {%k1}
2768; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30]
2769; AVX512BW-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15]
2770; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,3,3,6,6,7,7]
2771; AVX512BW-NEXT:    movl $1251232404, %eax # imm = 0x4A944A94
2772; AVX512BW-NEXT:    kmovd %eax, %k1
2773; AVX512BW-NEXT:    vmovdqu8 %ymm4, %ymm5 {%k1}
2774; AVX512BW-NEXT:    vpermq {{.*#+}} ymm4 = ymm5[2,2,3,3]
2775; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,u,27,u,u,26,u,28,u,30,u,u,29,u,31,u]
2776; AVX512BW-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14]
2777; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,2,3,3,6,6,7,7]
2778; AVX512BW-NEXT:    movl $693250386, %eax # imm = 0x29522952
2779; AVX512BW-NEXT:    kmovd %eax, %k1
2780; AVX512BW-NEXT:    vmovdqu8 %ymm1, %ymm2 {%k1}
2781; AVX512BW-NEXT:    vpermq {{.*#+}} ymm1 = ymm2[2,2,3,3]
2782; AVX512BW-NEXT:    movl $415641996, %eax # imm = 0x18C6318C
2783; AVX512BW-NEXT:    kmovd %eax, %k1
2784; AVX512BW-NEXT:    vmovdqu8 %ymm4, %ymm1 {%k1}
2785; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
2786; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
2787; AVX512BW-NEXT:    movl $-2078209982, %eax # imm = 0x84210842
2788; AVX512BW-NEXT:    kmovd %eax, %k1
2789; AVX512BW-NEXT:    vmovdqu8 %ymm0, %ymm1 {%k1}
2790; AVX512BW-NEXT:    vmovdqa %ymm1, 128(%r9)
2791; AVX512BW-NEXT:    vmovdqa64 %zmm7, 64(%r9)
2792; AVX512BW-NEXT:    vmovdqa64 %zmm3, (%r9)
2793; AVX512BW-NEXT:    vzeroupper
2794; AVX512BW-NEXT:    retq
2795;
2796; AVX512BW-FCP-LABEL: store_i8_stride5_vf32:
2797; AVX512BW-FCP:       # %bb.0:
2798; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %ymm1
2799; AVX512BW-FCP-NEXT:    vmovdqa (%rsi), %ymm3
2800; AVX512BW-FCP-NEXT:    vmovdqa (%rdx), %ymm0
2801; AVX512BW-FCP-NEXT:    vmovdqa (%rcx), %ymm2
2802; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %xmm4
2803; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm4[8],zero,xmm4[u,7],zero,xmm4[9],zero,xmm4[u],zero,xmm4[u,10],zero,xmm4[12],zero,xmm4[u,11]
2804; AVX512BW-FCP-NEXT:    vmovdqa (%rsi), %xmm6
2805; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,xmm6[8,u],zero,xmm6[7],zero,xmm6[9,u,11,u],zero,xmm6[10],zero,xmm6[12,u],zero
2806; AVX512BW-FCP-NEXT:    vpor %xmm5, %xmm7, %xmm5
2807; AVX512BW-FCP-NEXT:    vmovdqa (%rdx), %xmm7
2808; AVX512BW-FCP-NEXT:    vmovdqa (%rcx), %xmm8
2809; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
2810; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
2811; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm5, %zmm9, %zmm5
2812; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm5 = zmm5[0,0,1,1,4,4,5,5]
2813; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = zero,xmm8[6],zero,xmm8[8,u],zero,xmm8[7],zero,xmm8[9],zero,xmm8[11,u],zero,xmm8[10],zero,xmm8[12]
2814; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[6],zero,xmm7[8],zero,xmm7[u,7],zero,xmm7[9],zero,xmm7[11],zero,xmm7[u,10],zero,xmm7[12],zero
2815; AVX512BW-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm7
2816; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
2817; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
2818; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm7, %zmm4, %zmm4
2819; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm4 = zmm4[0,0,1,1,4,4,5,5]
2820; AVX512BW-FCP-NEXT:    movabsq $3570337559743967628, %rax # imm = 0x318C631818C6318C
2821; AVX512BW-FCP-NEXT:    kmovq %rax, %k1
2822; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm5, %zmm4 {%k1}
2823; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3]
2824; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10]
2825; AVX512BW-FCP-NEXT:    vpermd %zmm5, %zmm6, %zmm6
2826; AVX512BW-FCP-NEXT:    movabsq $595056260442243600, %rax # imm = 0x842108421084210
2827; AVX512BW-FCP-NEXT:    kmovq %rax, %k1
2828; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm6, %zmm4 {%k1}
2829; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,ymm0[12,13],zero,zero,zero,zero,ymm0[14],zero,zero,zero,ymm0[14,15],zero,zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,ymm0[18],zero,zero,zero
2830; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm2[13],zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,ymm2[18],zero,zero
2831; AVX512BW-FCP-NEXT:    vpor %ymm6, %ymm7, %ymm6
2832; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm0[21],zero,ymm0[21,20],zero,ymm0[22],zero,ymm0[24],zero,ymm0[22,23],zero,ymm0[25]
2833; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero
2834; AVX512BW-FCP-NEXT:    vpor %ymm7, %ymm8, %ymm7
2835; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3]
2836; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm6
2837; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [3,3,3,0,4,4,4,4]
2838; AVX512BW-FCP-NEXT:    vpermd %ymm1, %ymm7, %ymm7
2839; AVX512BW-FCP-NEXT:    movl $138547332, %eax # imm = 0x8421084
2840; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
2841; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm7 {%k1} = ymm3[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u]
2842; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm3[21],zero,zero,ymm3[20],zero,ymm3[22],zero,ymm3[24],zero,zero,ymm3[23],zero,ymm3[25],zero
2843; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25],zero,zero
2844; AVX512BW-FCP-NEXT:    vpor %ymm8, %ymm9, %ymm8
2845; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3]
2846; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm7, %zmm7
2847; AVX512BW-FCP-NEXT:    movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318
2848; AVX512BW-FCP-NEXT:    kmovq %rax, %k1
2849; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm6, %zmm7 {%k1}
2850; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm6 = [3,3,3,3,0,4,4,4,12,14,13,13,13,13,12,14]
2851; AVX512BW-FCP-NEXT:    vpermd %zmm5, %zmm6, %zmm6
2852; AVX512BW-FCP-NEXT:    movabsq $1190112520884487201, %rax # imm = 0x1084210842108421
2853; AVX512BW-FCP-NEXT:    kmovq %rax, %k1
2854; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm6, %zmm7 {%k1}
2855; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm3[26],zero,ymm3[28],zero,zero,zero,zero,ymm3[29],zero,ymm3[31],zero,zero,ymm3[30]
2856; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[26],zero,ymm1[28],zero,zero,ymm1[27],zero,ymm1[29],zero,ymm1[31],zero,zero,ymm1[30],zero
2857; AVX512BW-FCP-NEXT:    vpor %ymm3, %ymm1, %ymm1
2858; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3]
2859; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm2[27],zero,zero,ymm2[26],zero,ymm2[28],zero,ymm2[30],zero,zero,ymm2[29],zero,ymm2[31],zero
2860; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,zero,ymm0[26],zero,ymm0[28],zero,ymm0[30],zero,zero,ymm0[29],zero,ymm0[31],zero,zero
2861; AVX512BW-FCP-NEXT:    vpor %ymm2, %ymm0, %ymm0
2862; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
2863; AVX512BW-FCP-NEXT:    movl $415641996, %eax # imm = 0x18C6318C
2864; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
2865; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm1, %ymm0 {%k1}
2866; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [6,6,6,6,7,7,7,7]
2867; AVX512BW-FCP-NEXT:    vpermd %ymm5, %ymm1, %ymm1
2868; AVX512BW-FCP-NEXT:    movl $-2078209982, %eax # imm = 0x84210842
2869; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
2870; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm1, %ymm0 {%k1}
2871; AVX512BW-FCP-NEXT:    vmovdqa %ymm0, 128(%r9)
2872; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, 64(%r9)
2873; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, (%r9)
2874; AVX512BW-FCP-NEXT:    vzeroupper
2875; AVX512BW-FCP-NEXT:    retq
2876;
2877; AVX512DQ-BW-LABEL: store_i8_stride5_vf32:
2878; AVX512DQ-BW:       # %bb.0:
2879; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %ymm4
2880; AVX512DQ-BW-NEXT:    vmovdqa (%rsi), %ymm5
2881; AVX512DQ-BW-NEXT:    vmovdqa (%rdx), %ymm1
2882; AVX512DQ-BW-NEXT:    vmovdqa (%rcx), %ymm2
2883; AVX512DQ-BW-NEXT:    vmovdqa (%r8), %ymm0
2884; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %xmm3
2885; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm6 = xmm3[8],zero,xmm3[u,7],zero,xmm3[9],zero,xmm3[u],zero,xmm3[u,10],zero,xmm3[12],zero,xmm3[u,11]
2886; AVX512DQ-BW-NEXT:    vmovdqa (%rsi), %xmm7
2887; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm8 = zero,xmm7[8,u],zero,xmm7[7],zero,xmm7[9,u,11,u],zero,xmm7[10],zero,xmm7[12,u],zero
2888; AVX512DQ-BW-NEXT:    vpor %xmm6, %xmm8, %xmm6
2889; AVX512DQ-BW-NEXT:    vmovdqa (%rdx), %xmm8
2890; AVX512DQ-BW-NEXT:    vmovdqa (%rcx), %xmm9
2891; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
2892; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
2893; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm6, %zmm10, %zmm6
2894; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm6 = zmm6[0,0,1,1,4,4,5,5]
2895; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm9 = zero,xmm9[6],zero,xmm9[8,u],zero,xmm9[7],zero,xmm9[9],zero,xmm9[11,u],zero,xmm9[10],zero,xmm9[12]
2896; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[6],zero,xmm8[8],zero,xmm8[u,7],zero,xmm8[9],zero,xmm8[11],zero,xmm8[u,10],zero,xmm8[12],zero
2897; AVX512DQ-BW-NEXT:    vpor %xmm9, %xmm8, %xmm8
2898; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
2899; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
2900; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm8, %zmm3, %zmm3
2901; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm3 = zmm3[0,0,1,1,4,4,5,5]
2902; AVX512DQ-BW-NEXT:    movabsq $3570337559743967628, %rax # imm = 0x318C631818C6318C
2903; AVX512DQ-BW-NEXT:    kmovq %rax, %k1
2904; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm6, %zmm3 {%k1}
2905; AVX512DQ-BW-NEXT:    vmovdqa (%r8), %xmm6
2906; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2]
2907; AVX512DQ-BW-NEXT:    vpermd %zmm6, %zmm7, %zmm6
2908; AVX512DQ-BW-NEXT:    movabsq $595056260442243600, %rax # imm = 0x842108421084210
2909; AVX512DQ-BW-NEXT:    kmovq %rax, %k1
2910; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm6, %zmm3 {%k1}
2911; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,ymm1[12,13],zero,zero,zero,zero,ymm1[14],zero,zero,zero,ymm1[14,15],zero,zero,zero,zero,ymm1[16],zero,zero,zero,ymm1[16,17],zero,zero,zero,zero,ymm1[18],zero,zero,zero
2912; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm2[13],zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,ymm2[18],zero,zero
2913; AVX512DQ-BW-NEXT:    vpor %ymm6, %ymm7, %ymm6
2914; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm7 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm1[21],zero,ymm1[21,20],zero,ymm1[22],zero,ymm1[24],zero,ymm1[22,23],zero,ymm1[25]
2915; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero
2916; AVX512DQ-BW-NEXT:    vpor %ymm7, %ymm8, %ymm7
2917; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3]
2918; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm6
2919; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [3,3,3,0,4,4,4,4]
2920; AVX512DQ-BW-NEXT:    vpermd %ymm4, %ymm7, %ymm7
2921; AVX512DQ-BW-NEXT:    movl $138547332, %eax # imm = 0x8421084
2922; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
2923; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm7 {%k1} = ymm5[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u]
2924; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm8 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm5[21],zero,zero,ymm5[20],zero,ymm5[22],zero,ymm5[24],zero,zero,ymm5[23],zero,ymm5[25],zero
2925; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero,zero
2926; AVX512DQ-BW-NEXT:    vpor %ymm8, %ymm9, %ymm8
2927; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3]
2928; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm8, %zmm7, %zmm7
2929; AVX512DQ-BW-NEXT:    movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318
2930; AVX512DQ-BW-NEXT:    kmovq %rax, %k1
2931; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm6, %zmm7 {%k1}
2932; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [3,3,3,3,0,4,4,4]
2933; AVX512DQ-BW-NEXT:    vpermd %ymm0, %ymm6, %ymm6
2934; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm8 = ymm0[0,2,1,1,4,6,5,5]
2935; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,3,3,2]
2936; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm8, %zmm6, %zmm6
2937; AVX512DQ-BW-NEXT:    movabsq $1190112520884487201, %rax # imm = 0x1084210842108421
2938; AVX512DQ-BW-NEXT:    kmovq %rax, %k1
2939; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm6, %zmm7 {%k1}
2940; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30]
2941; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15]
2942; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,3,3,6,6,7,7]
2943; AVX512DQ-BW-NEXT:    movl $1251232404, %eax # imm = 0x4A944A94
2944; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
2945; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm4, %ymm5 {%k1}
2946; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm4 = ymm5[2,2,3,3]
2947; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,u,27,u,u,26,u,28,u,30,u,u,29,u,31,u]
2948; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14]
2949; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,2,3,3,6,6,7,7]
2950; AVX512DQ-BW-NEXT:    movl $693250386, %eax # imm = 0x29522952
2951; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
2952; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm1, %ymm2 {%k1}
2953; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm1 = ymm2[2,2,3,3]
2954; AVX512DQ-BW-NEXT:    movl $415641996, %eax # imm = 0x18C6318C
2955; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
2956; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm4, %ymm1 {%k1}
2957; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
2958; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
2959; AVX512DQ-BW-NEXT:    movl $-2078209982, %eax # imm = 0x84210842
2960; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
2961; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm0, %ymm1 {%k1}
2962; AVX512DQ-BW-NEXT:    vmovdqa %ymm1, 128(%r9)
2963; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, 64(%r9)
2964; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, (%r9)
2965; AVX512DQ-BW-NEXT:    vzeroupper
2966; AVX512DQ-BW-NEXT:    retq
2967;
2968; AVX512DQ-BW-FCP-LABEL: store_i8_stride5_vf32:
2969; AVX512DQ-BW-FCP:       # %bb.0:
2970; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %ymm1
2971; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rsi), %ymm3
2972; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdx), %ymm0
2973; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rcx), %ymm2
2974; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %xmm4
2975; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm4[8],zero,xmm4[u,7],zero,xmm4[9],zero,xmm4[u],zero,xmm4[u,10],zero,xmm4[12],zero,xmm4[u,11]
2976; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rsi), %xmm6
2977; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = zero,xmm6[8,u],zero,xmm6[7],zero,xmm6[9,u,11,u],zero,xmm6[10],zero,xmm6[12,u],zero
2978; AVX512DQ-BW-FCP-NEXT:    vpor %xmm5, %xmm7, %xmm5
2979; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdx), %xmm7
2980; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rcx), %xmm8
2981; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
2982; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
2983; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm5, %zmm9, %zmm5
2984; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm5 = zmm5[0,0,1,1,4,4,5,5]
2985; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = zero,xmm8[6],zero,xmm8[8,u],zero,xmm8[7],zero,xmm8[9],zero,xmm8[11,u],zero,xmm8[10],zero,xmm8[12]
2986; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[6],zero,xmm7[8],zero,xmm7[u,7],zero,xmm7[9],zero,xmm7[11],zero,xmm7[u,10],zero,xmm7[12],zero
2987; AVX512DQ-BW-FCP-NEXT:    vpor %xmm7, %xmm8, %xmm7
2988; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
2989; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
2990; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm7, %zmm4, %zmm4
2991; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm4 = zmm4[0,0,1,1,4,4,5,5]
2992; AVX512DQ-BW-FCP-NEXT:    movabsq $3570337559743967628, %rax # imm = 0x318C631818C6318C
2993; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k1
2994; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm5, %zmm4 {%k1}
2995; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3]
2996; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10]
2997; AVX512DQ-BW-FCP-NEXT:    vpermd %zmm5, %zmm6, %zmm6
2998; AVX512DQ-BW-FCP-NEXT:    movabsq $595056260442243600, %rax # imm = 0x842108421084210
2999; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k1
3000; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm6, %zmm4 {%k1}
3001; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,ymm0[12,13],zero,zero,zero,zero,ymm0[14],zero,zero,zero,ymm0[14,15],zero,zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,ymm0[18],zero,zero,zero
3002; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm2[13],zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,ymm2[18],zero,zero
3003; AVX512DQ-BW-FCP-NEXT:    vpor %ymm6, %ymm7, %ymm6
3004; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm0[21],zero,ymm0[21,20],zero,ymm0[22],zero,ymm0[24],zero,ymm0[22,23],zero,ymm0[25]
3005; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero
3006; AVX512DQ-BW-FCP-NEXT:    vpor %ymm7, %ymm8, %ymm7
3007; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3]
3008; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm6
3009; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [3,3,3,0,4,4,4,4]
3010; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm1, %ymm7, %ymm7
3011; AVX512DQ-BW-FCP-NEXT:    movl $138547332, %eax # imm = 0x8421084
3012; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
3013; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm7 {%k1} = ymm3[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u]
3014; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm3[21],zero,zero,ymm3[20],zero,ymm3[22],zero,ymm3[24],zero,zero,ymm3[23],zero,ymm3[25],zero
3015; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25],zero,zero
3016; AVX512DQ-BW-FCP-NEXT:    vpor %ymm8, %ymm9, %ymm8
3017; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3]
3018; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm7, %zmm7
3019; AVX512DQ-BW-FCP-NEXT:    movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318
3020; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k1
3021; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm6, %zmm7 {%k1}
3022; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm6 = [3,3,3,3,0,4,4,4,12,14,13,13,13,13,12,14]
3023; AVX512DQ-BW-FCP-NEXT:    vpermd %zmm5, %zmm6, %zmm6
3024; AVX512DQ-BW-FCP-NEXT:    movabsq $1190112520884487201, %rax # imm = 0x1084210842108421
3025; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k1
3026; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm6, %zmm7 {%k1}
3027; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm3[26],zero,ymm3[28],zero,zero,zero,zero,ymm3[29],zero,ymm3[31],zero,zero,ymm3[30]
3028; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[26],zero,ymm1[28],zero,zero,ymm1[27],zero,ymm1[29],zero,ymm1[31],zero,zero,ymm1[30],zero
3029; AVX512DQ-BW-FCP-NEXT:    vpor %ymm3, %ymm1, %ymm1
3030; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3]
3031; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm2[27],zero,zero,ymm2[26],zero,ymm2[28],zero,ymm2[30],zero,zero,ymm2[29],zero,ymm2[31],zero
3032; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,zero,ymm0[26],zero,ymm0[28],zero,ymm0[30],zero,zero,ymm0[29],zero,ymm0[31],zero,zero
3033; AVX512DQ-BW-FCP-NEXT:    vpor %ymm2, %ymm0, %ymm0
3034; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
3035; AVX512DQ-BW-FCP-NEXT:    movl $415641996, %eax # imm = 0x18C6318C
3036; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
3037; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm1, %ymm0 {%k1}
3038; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [6,6,6,6,7,7,7,7]
3039; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm5, %ymm1, %ymm1
3040; AVX512DQ-BW-FCP-NEXT:    movl $-2078209982, %eax # imm = 0x84210842
3041; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
3042; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm1, %ymm0 {%k1}
3043; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm0, 128(%r9)
3044; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, 64(%r9)
3045; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, (%r9)
3046; AVX512DQ-BW-FCP-NEXT:    vzeroupper
3047; AVX512DQ-BW-FCP-NEXT:    retq
3048  %in.vec0 = load <32 x i8>, ptr %in.vecptr0, align 64
3049  %in.vec1 = load <32 x i8>, ptr %in.vecptr1, align 64
3050  %in.vec2 = load <32 x i8>, ptr %in.vecptr2, align 64
3051  %in.vec3 = load <32 x i8>, ptr %in.vecptr3, align 64
3052  %in.vec4 = load <32 x i8>, ptr %in.vecptr4, align 64
3053  %1 = shufflevector <32 x i8> %in.vec0, <32 x i8> %in.vec1, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
3054  %2 = shufflevector <32 x i8> %in.vec2, <32 x i8> %in.vec3, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
3055  %3 = shufflevector <64 x i8> %1, <64 x i8> %2, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
3056  %4 = shufflevector <32 x i8> %in.vec4, <32 x i8> poison, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3057  %5 = shufflevector <128 x i8> %3, <128 x i8> %4, <160 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159>
3058  %interleaved.vec = shufflevector <160 x i8> %5, <160 x i8> poison, <160 x i32> <i32 0, i32 32, i32 64, i32 96, i32 128, i32 1, i32 33, i32 65, i32 97, i32 129, i32 2, i32 34, i32 66, i32 98, i32 130, i32 3, i32 35, i32 67, i32 99, i32 131, i32 4, i32 36, i32 68, i32 100, i32 132, i32 5, i32 37, i32 69, i32 101, i32 133, i32 6, i32 38, i32 70, i32 102, i32 134, i32 7, i32 39, i32 71, i32 103, i32 135, i32 8, i32 40, i32 72, i32 104, i32 136, i32 9, i32 41, i32 73, i32 105, i32 137, i32 10, i32 42, i32 74, i32 106, i32 138, i32 11, i32 43, i32 75, i32 107, i32 139, i32 12, i32 44, i32 76, i32 108, i32 140, i32 13, i32 45, i32 77, i32 109, i32 141, i32 14, i32 46, i32 78, i32 110, i32 142, i32 15, i32 47, i32 79, i32 111, i32 143, i32 16, i32 48, i32 80, i32 112, i32 144, i32 17, i32 49, i32 81, i32 113, i32 145, i32 18, i32 50, i32 82, i32 114, i32 146, i32 19, i32 51, i32 83, i32 115, i32 147, i32 20, i32 52, i32 84, i32 116, i32 148, i32 21, i32 53, i32 85, i32 117, i32 149, i32 22, i32 54, i32 86, i32 118, i32 150, i32 23, i32 55, i32 87, i32 119, i32 151, i32 24, i32 56, i32 88, i32 120, i32 152, i32 25, i32 57, i32 89, i32 121, i32 153, i32 26, i32 58, i32 90, i32 122, i32 154, i32 27, i32 59, i32 91, i32 123, i32 155, i32 28, i32 60, i32 92, i32 124, i32 156, i32 29, i32 61, i32 93, i32 125, i32 157, i32 30, i32 62, i32 94, i32 126, i32 158, i32 31, i32 63, i32 95, i32 127, i32 159>
3059  store <160 x i8> %interleaved.vec, ptr %out.vec, align 64
3060  ret void
3061}
3062
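; Illustrative sketch only (not part of the autogenerated checks, and the names
; are hypothetical): the vf64 test below stores a 5-way byte interleave of five
; 64-element inputs, as the shufflevector interleave mask in the IR shows. A
; scalar C loop with the same memory layout would look roughly like:
;
;   void store_i8_stride5(const char *a, const char *b, const char *c,
;                         const char *d, const char *e, char *out, int n) {
;     for (int i = 0; i < n; ++i) {      /* one group of 5 bytes per iteration */
;       out[5*i + 0] = a[i];
;       out[5*i + 1] = b[i];
;       out[5*i + 2] = c[i];
;       out[5*i + 3] = d[i];
;       out[5*i + 4] = e[i];
;     }
;   }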
3063define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind {
3064; SSE-LABEL: store_i8_stride5_vf64:
3065; SSE:       # %bb.0:
3066; SSE-NEXT:    subq $504, %rsp # imm = 0x1F8
3067; SSE-NEXT:    movdqa (%rdi), %xmm7
3068; SSE-NEXT:    movdqa (%rsi), %xmm9
3069; SSE-NEXT:    movdqa 16(%rsi), %xmm14
3070; SSE-NEXT:    movdqa (%rdx), %xmm0
3071; SSE-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
3072; SSE-NEXT:    movdqa 16(%rdx), %xmm11
3073; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3074; SSE-NEXT:    movdqa (%rcx), %xmm10
3075; SSE-NEXT:    movdqa 16(%rcx), %xmm6
3076; SSE-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3077; SSE-NEXT:    movdqa (%r8), %xmm13
3078; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
3079; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
3080; SSE-NEXT:    movdqa {{.*#+}} xmm12 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
3081; SSE-NEXT:    pand %xmm12, %xmm0
3082; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm10[2,1,2,3]
3083; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3084; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,0,3]
3085; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
3086; SSE-NEXT:    movdqa %xmm12, %xmm4
3087; SSE-NEXT:    pandn %xmm1, %xmm4
3088; SSE-NEXT:    por %xmm0, %xmm4
3089; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255]
3090; SSE-NEXT:    pand %xmm8, %xmm4
3091; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,1,2,2]
3092; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
3093; SSE-NEXT:    pand %xmm2, %xmm0
3094; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm9[0,1,2,1]
3095; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
3096; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
3097; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,1,0,3,4,5,6,7]
3098; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,1,0]
3099; SSE-NEXT:    movdqa %xmm2, %xmm5
3100; SSE-NEXT:    pandn %xmm1, %xmm5
3101; SSE-NEXT:    por %xmm0, %xmm5
3102; SSE-NEXT:    movdqa %xmm8, %xmm0
3103; SSE-NEXT:    pandn %xmm5, %xmm0
3104; SSE-NEXT:    por %xmm4, %xmm0
3105; SSE-NEXT:    movdqa {{.*#+}} xmm15 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255]
3106; SSE-NEXT:    pand %xmm15, %xmm0
3107; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm13[1,1,2,2]
3108; SSE-NEXT:    movdqa %xmm15, %xmm3
3109; SSE-NEXT:    pandn %xmm1, %xmm3
3110; SSE-NEXT:    por %xmm0, %xmm3
3111; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3112; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm11[3,3,3,3,4,5,6,7]
3113; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
3114; SSE-NEXT:    pand %xmm12, %xmm0
3115; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm6[2,1,2,3]
3116; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3117; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,0,3]
3118; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
3119; SSE-NEXT:    movdqa %xmm12, %xmm5
3120; SSE-NEXT:    pandn %xmm1, %xmm5
3121; SSE-NEXT:    por %xmm0, %xmm5
3122; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm14[0,1,2,1]
3123; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
3124; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
3125; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,0,3,4,5,6,7]
3126; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
3127; SSE-NEXT:    movdqa %xmm2, %xmm1
3128; SSE-NEXT:    pandn %xmm0, %xmm1
3129; SSE-NEXT:    movdqa 16(%rdi), %xmm0
3130; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3131; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
3132; SSE-NEXT:    pand %xmm2, %xmm0
3133; SSE-NEXT:    por %xmm0, %xmm1
3134; SSE-NEXT:    movdqa %xmm8, %xmm0
3135; SSE-NEXT:    pandn %xmm1, %xmm0
3136; SSE-NEXT:    pand %xmm8, %xmm5
3137; SSE-NEXT:    por %xmm5, %xmm0
3138; SSE-NEXT:    movdqa 16(%r8), %xmm1
3139; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3140; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,2,2]
3141; SSE-NEXT:    movdqa %xmm15, %xmm3
3142; SSE-NEXT:    pandn %xmm1, %xmm3
3143; SSE-NEXT:    pand %xmm15, %xmm0
3144; SSE-NEXT:    por %xmm0, %xmm3
3145; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3146; SSE-NEXT:    movdqa 32(%rcx), %xmm0
3147; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3148; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
3149; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3150; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,0,3]
3151; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
3152; SSE-NEXT:    movdqa %xmm12, %xmm1
3153; SSE-NEXT:    pandn %xmm0, %xmm1
3154; SSE-NEXT:    movdqa 32(%rdx), %xmm0
3155; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3156; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
3157; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
3158; SSE-NEXT:    pand %xmm12, %xmm0
3159; SSE-NEXT:    por %xmm0, %xmm1
3160; SSE-NEXT:    movdqa 32(%rsi), %xmm11
3161; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm11[0,1,2,1]
3162; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
3163; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
3164; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,0,3,4,5,6,7]
3165; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
3166; SSE-NEXT:    movdqa %xmm2, %xmm5
3167; SSE-NEXT:    pandn %xmm0, %xmm5
3168; SSE-NEXT:    movdqa 32(%rdi), %xmm0
3169; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3170; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
3171; SSE-NEXT:    pand %xmm2, %xmm0
3172; SSE-NEXT:    por %xmm0, %xmm5
3173; SSE-NEXT:    movdqa %xmm8, %xmm0
3174; SSE-NEXT:    pandn %xmm5, %xmm0
3175; SSE-NEXT:    pand %xmm8, %xmm1
3176; SSE-NEXT:    por %xmm1, %xmm0
3177; SSE-NEXT:    movdqa 32(%r8), %xmm1
3178; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3179; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,2,2]
3180; SSE-NEXT:    movdqa %xmm15, %xmm3
3181; SSE-NEXT:    pandn %xmm1, %xmm3
3182; SSE-NEXT:    pand %xmm15, %xmm0
3183; SSE-NEXT:    por %xmm0, %xmm3
3184; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3185; SSE-NEXT:    movdqa 48(%rcx), %xmm0
3186; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3187; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
3188; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
3189; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,0,3]
3190; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
3191; SSE-NEXT:    movdqa %xmm12, %xmm1
3192; SSE-NEXT:    pandn %xmm0, %xmm1
3193; SSE-NEXT:    movdqa 48(%rdx), %xmm0
3194; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3195; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
3196; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
3197; SSE-NEXT:    pand %xmm12, %xmm0
3198; SSE-NEXT:    por %xmm0, %xmm1
3199; SSE-NEXT:    movdqa 48(%rsi), %xmm0
3200; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3201; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
3202; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
3203; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
3204; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,0,3,4,5,6,7]
3205; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
3206; SSE-NEXT:    movdqa %xmm2, %xmm5
3207; SSE-NEXT:    pandn %xmm0, %xmm5
3208; SSE-NEXT:    movdqa 48(%rdi), %xmm0
3209; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3210; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
3211; SSE-NEXT:    pand %xmm2, %xmm0
3212; SSE-NEXT:    por %xmm0, %xmm5
3213; SSE-NEXT:    movdqa %xmm8, %xmm0
3214; SSE-NEXT:    pandn %xmm5, %xmm0
3215; SSE-NEXT:    pand %xmm8, %xmm1
3216; SSE-NEXT:    por %xmm1, %xmm0
3217; SSE-NEXT:    movdqa 48(%r8), %xmm1
3218; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3219; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,2,2]
3220; SSE-NEXT:    movdqa %xmm15, %xmm5
3221; SSE-NEXT:    pandn %xmm1, %xmm5
3222; SSE-NEXT:    pand %xmm15, %xmm0
3223; SSE-NEXT:    por %xmm0, %xmm5
3224; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3225; SSE-NEXT:    movdqa %xmm10, %xmm0
3226; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
3227; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3228; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
3229; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,0,4,5,6,7]
3230; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
3231; SSE-NEXT:    movdqa %xmm15, %xmm1
3232; SSE-NEXT:    pandn %xmm0, %xmm1
3233; SSE-NEXT:    movdqa (%rsp), %xmm3 # 16-byte Reload
3234; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm3[1,2,2,3,4,5,6,7]
3235; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
3236; SSE-NEXT:    pand %xmm15, %xmm0
3237; SSE-NEXT:    por %xmm0, %xmm1
3238; SSE-NEXT:    movdqa {{.*#+}} xmm6 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255]
3239; SSE-NEXT:    movdqa %xmm6, %xmm0
3240; SSE-NEXT:    pandn %xmm1, %xmm0
3241; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3242; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1]
3243; SSE-NEXT:    movdqa %xmm2, %xmm5
3244; SSE-NEXT:    pandn %xmm1, %xmm5
3245; SSE-NEXT:    movdqa %xmm9, %xmm1
3246; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
3247; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3248; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
3249; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,1,0,3,4,5,6,7]
3250; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6]
3251; SSE-NEXT:    pand %xmm2, %xmm1
3252; SSE-NEXT:    por %xmm5, %xmm1
3253; SSE-NEXT:    pand %xmm6, %xmm1
3254; SSE-NEXT:    por %xmm0, %xmm1
3255; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
3256; SSE-NEXT:    movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3257; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm13[0,0,1,1]
3258; SSE-NEXT:    movdqa %xmm4, %xmm0
3259; SSE-NEXT:    pandn %xmm5, %xmm0
3260; SSE-NEXT:    pand %xmm4, %xmm1
3261; SSE-NEXT:    por %xmm1, %xmm0
3262; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3263; SSE-NEXT:    punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
3264; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3265; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm9[0,1,2,1]
3266; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7]
3267; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,7]
3268; SSE-NEXT:    movdqa %xmm4, %xmm5
3269; SSE-NEXT:    pandn %xmm1, %xmm5
3270; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm7[2,2,3,3]
3271; SSE-NEXT:    pand %xmm4, %xmm1
3272; SSE-NEXT:    por %xmm1, %xmm5
3273; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255]
3274; SSE-NEXT:    movdqa %xmm8, %xmm7
3275; SSE-NEXT:    pandn %xmm5, %xmm7
3276; SSE-NEXT:    pshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,5,6,6,7]
3277; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2]
3278; SSE-NEXT:    movdqa %xmm2, %xmm9
3279; SSE-NEXT:    pandn %xmm5, %xmm9
3280; SSE-NEXT:    punpckhbw {{.*#+}} xmm10 = xmm10[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
3281; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3282; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm10[0,1,2,1]
3283; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[1,1,2,3,4,5,6,7]
3284; SSE-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,4]
3285; SSE-NEXT:    pand %xmm2, %xmm5
3286; SSE-NEXT:    por %xmm9, %xmm5
3287; SSE-NEXT:    pand %xmm8, %xmm5
3288; SSE-NEXT:    por %xmm7, %xmm5
3289; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm13[2,2,2,2]
3290; SSE-NEXT:    movdqa %xmm12, %xmm0
3291; SSE-NEXT:    pandn %xmm7, %xmm0
3292; SSE-NEXT:    pand %xmm12, %xmm5
3293; SSE-NEXT:    por %xmm5, %xmm0
3294; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3295; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
3296; SSE-NEXT:    movdqa %xmm13, %xmm0
3297; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7]
3298; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3299; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[2,1,2,3]
3300; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,0,4,5,6,7]
3301; SSE-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
3302; SSE-NEXT:    movdqa %xmm15, %xmm7
3303; SSE-NEXT:    pandn %xmm5, %xmm7
3304; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
3305; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm10[1,2,2,3,4,5,6,7]
3306; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0]
3307; SSE-NEXT:    pand %xmm15, %xmm5
3308; SSE-NEXT:    por %xmm5, %xmm7
3309; SSE-NEXT:    movdqa %xmm6, %xmm5
3310; SSE-NEXT:    pandn %xmm7, %xmm5
3311; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3312; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1]
3313; SSE-NEXT:    movdqa %xmm2, %xmm9
3314; SSE-NEXT:    pandn %xmm7, %xmm9
3315; SSE-NEXT:    movdqa %xmm14, %xmm1
3316; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
3317; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3318; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[2,1,2,3]
3319; SSE-NEXT:    pshuflw {{.*#+}} xmm7 = xmm7[3,1,0,3,4,5,6,7]
3320; SSE-NEXT:    pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,6]
3321; SSE-NEXT:    pand %xmm2, %xmm7
3322; SSE-NEXT:    por %xmm9, %xmm7
3323; SSE-NEXT:    pand %xmm6, %xmm7
3324; SSE-NEXT:    por %xmm5, %xmm7
3325; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3326; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[0,0,1,1]
3327; SSE-NEXT:    movdqa %xmm4, %xmm3
3328; SSE-NEXT:    pandn %xmm5, %xmm3
3329; SSE-NEXT:    pand %xmm4, %xmm7
3330; SSE-NEXT:    por %xmm7, %xmm3
3331; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3332; SSE-NEXT:    punpckhbw {{.*#+}} xmm14 = xmm14[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
3333; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3334; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm14[0,1,2,1]
3335; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7]
3336; SSE-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,4,7]
3337; SSE-NEXT:    movdqa %xmm4, %xmm7
3338; SSE-NEXT:    pandn %xmm5, %xmm7
3339; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[2,2,3,3]
3340; SSE-NEXT:    pand %xmm4, %xmm5
3341; SSE-NEXT:    por %xmm5, %xmm7
3342; SSE-NEXT:    movdqa %xmm8, %xmm5
3343; SSE-NEXT:    pandn %xmm7, %xmm5
3344; SSE-NEXT:    pshufhw {{.*#+}} xmm7 = xmm10[0,1,2,3,5,6,6,7]
3345; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[2,2,2,2]
3346; SSE-NEXT:    movdqa %xmm2, %xmm9
3347; SSE-NEXT:    pandn %xmm7, %xmm9
3348; SSE-NEXT:    movdqa %xmm13, %xmm0
3349; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15]
3350; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3351; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[0,1,2,1]
3352; SSE-NEXT:    pshuflw {{.*#+}} xmm7 = xmm7[1,1,2,3,4,5,6,7]
3353; SSE-NEXT:    pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,4]
3354; SSE-NEXT:    pand %xmm2, %xmm7
3355; SSE-NEXT:    por %xmm9, %xmm7
3356; SSE-NEXT:    pand %xmm8, %xmm7
3357; SSE-NEXT:    por %xmm5, %xmm7
3358; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[2,2,2,2]
3359; SSE-NEXT:    movdqa %xmm12, %xmm0
3360; SSE-NEXT:    pandn %xmm5, %xmm0
3361; SSE-NEXT:    pand %xmm12, %xmm7
3362; SSE-NEXT:    por %xmm7, %xmm0
3363; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3364; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3365; SSE-NEXT:    movdqa %xmm3, %xmm0
3366; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
3367; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3368; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[2,1,2,3]
3369; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,0,4,5,6,7]
3370; SSE-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
3371; SSE-NEXT:    movdqa %xmm15, %xmm7
3372; SSE-NEXT:    pandn %xmm5, %xmm7
3373; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
3374; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm10[1,2,2,3,4,5,6,7]
3375; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0]
3376; SSE-NEXT:    pand %xmm15, %xmm5
3377; SSE-NEXT:    por %xmm5, %xmm7
3378; SSE-NEXT:    movdqa %xmm6, %xmm5
3379; SSE-NEXT:    pandn %xmm7, %xmm5
3380; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
3381; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm13[1,1,1,1]
3382; SSE-NEXT:    movdqa %xmm2, %xmm9
3383; SSE-NEXT:    pandn %xmm7, %xmm9
3384; SSE-NEXT:    movdqa %xmm11, %xmm14
3385; SSE-NEXT:    punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3],xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7]
3386; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm14[2,1,2,3]
3387; SSE-NEXT:    pshuflw {{.*#+}} xmm7 = xmm7[3,1,0,3,4,5,6,7]
3388; SSE-NEXT:    pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,6]
3389; SSE-NEXT:    pand %xmm2, %xmm7
3390; SSE-NEXT:    por %xmm9, %xmm7
3391; SSE-NEXT:    pand %xmm6, %xmm7
3392; SSE-NEXT:    por %xmm5, %xmm7
3393; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3394; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[0,0,1,1]
3395; SSE-NEXT:    movdqa %xmm4, %xmm1
3396; SSE-NEXT:    pandn %xmm5, %xmm1
3397; SSE-NEXT:    pand %xmm4, %xmm7
3398; SSE-NEXT:    por %xmm7, %xmm1
3399; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3400; SSE-NEXT:    punpckhbw {{.*#+}} xmm11 = xmm11[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
3401; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3402; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm11[0,1,2,1]
3403; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7]
3404; SSE-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,4,7]
3405; SSE-NEXT:    movdqa %xmm4, %xmm7
3406; SSE-NEXT:    pandn %xmm5, %xmm7
3407; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm13[2,2,3,3]
3408; SSE-NEXT:    pand %xmm4, %xmm5
3409; SSE-NEXT:    por %xmm5, %xmm7
3410; SSE-NEXT:    movdqa %xmm8, %xmm5
3411; SSE-NEXT:    pandn %xmm7, %xmm5
3412; SSE-NEXT:    pshufhw {{.*#+}} xmm7 = xmm10[0,1,2,3,5,6,6,7]
3413; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[2,2,2,2]
3414; SSE-NEXT:    movdqa %xmm2, %xmm9
3415; SSE-NEXT:    pandn %xmm7, %xmm9
3416; SSE-NEXT:    movdqa %xmm3, %xmm1
3417; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
3418; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3419; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[0,1,2,1]
3420; SSE-NEXT:    pshuflw {{.*#+}} xmm7 = xmm7[1,1,2,3,4,5,6,7]
3421; SSE-NEXT:    pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,4]
3422; SSE-NEXT:    pand %xmm2, %xmm7
3423; SSE-NEXT:    por %xmm9, %xmm7
3424; SSE-NEXT:    pand %xmm8, %xmm7
3425; SSE-NEXT:    por %xmm5, %xmm7
3426; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[2,2,2,2]
3427; SSE-NEXT:    movdqa %xmm12, %xmm0
3428; SSE-NEXT:    pandn %xmm5, %xmm0
3429; SSE-NEXT:    pand %xmm12, %xmm7
3430; SSE-NEXT:    por %xmm7, %xmm0
3431; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3432; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3433; SSE-NEXT:    movdqa %xmm0, %xmm1
3434; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
3435; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3436; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[2,1,2,3]
3437; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,0,4,5,6,7]
3438; SSE-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
3439; SSE-NEXT:    movdqa %xmm15, %xmm7
3440; SSE-NEXT:    pandn %xmm5, %xmm7
3441; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
3442; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm10[1,2,2,3,4,5,6,7]
3443; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0]
3444; SSE-NEXT:    pand %xmm15, %xmm5
3445; SSE-NEXT:    por %xmm5, %xmm7
3446; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
3447; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm11[1,1,1,1]
3448; SSE-NEXT:    movdqa %xmm2, %xmm9
3449; SSE-NEXT:    pandn %xmm5, %xmm9
3450; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3451; SSE-NEXT:    movdqa %xmm3, %xmm1
3452; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
3453; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3454; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[2,1,2,3]
3455; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[3,1,0,3,4,5,6,7]
3456; SSE-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,6]
3457; SSE-NEXT:    pand %xmm2, %xmm5
3458; SSE-NEXT:    por %xmm9, %xmm5
3459; SSE-NEXT:    pand %xmm6, %xmm5
3460; SSE-NEXT:    pandn %xmm7, %xmm6
3461; SSE-NEXT:    por %xmm5, %xmm6
3462; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
3463; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm9[0,0,1,1]
3464; SSE-NEXT:    movdqa %xmm4, %xmm1
3465; SSE-NEXT:    pandn %xmm5, %xmm1
3466; SSE-NEXT:    pand %xmm4, %xmm6
3467; SSE-NEXT:    por %xmm6, %xmm1
3468; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3469; SSE-NEXT:    movdqa %xmm3, %xmm1
3470; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
3471; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3472; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[0,1,2,1]
3473; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7]
3474; SSE-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,4,7]
3475; SSE-NEXT:    movdqa %xmm4, %xmm6
3476; SSE-NEXT:    pandn %xmm5, %xmm6
3477; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm11[2,2,3,3]
3478; SSE-NEXT:    pand %xmm4, %xmm5
3479; SSE-NEXT:    por %xmm5, %xmm6
3480; SSE-NEXT:    movdqa %xmm8, %xmm5
3481; SSE-NEXT:    pandn %xmm6, %xmm5
3482; SSE-NEXT:    pshufhw {{.*#+}} xmm6 = xmm10[0,1,2,3,5,6,6,7]
3483; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[2,2,2,2]
3484; SSE-NEXT:    movdqa %xmm2, %xmm7
3485; SSE-NEXT:    pandn %xmm6, %xmm7
3486; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
3487; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3488; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[0,1,2,1]
3489; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[1,1,2,3,4,5,6,7]
3490; SSE-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,4]
3491; SSE-NEXT:    pand %xmm2, %xmm6
3492; SSE-NEXT:    por %xmm7, %xmm6
3493; SSE-NEXT:    pand %xmm8, %xmm6
3494; SSE-NEXT:    por %xmm5, %xmm6
3495; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm9[2,2,2,2]
3496; SSE-NEXT:    movdqa %xmm12, %xmm0
3497; SSE-NEXT:    pandn %xmm5, %xmm0
3498; SSE-NEXT:    pand %xmm12, %xmm6
3499; SSE-NEXT:    por %xmm6, %xmm0
3500; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3501; SSE-NEXT:    pshuflw $225, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
3502; SSE-NEXT:    # xmm5 = mem[1,0,2,3,4,5,6,7]
3503; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,1,0,1]
3504; SSE-NEXT:    movdqa %xmm4, %xmm6
3505; SSE-NEXT:    pandn %xmm5, %xmm6
3506; SSE-NEXT:    movdqa (%rsp), %xmm3 # 16-byte Reload
3507; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm3[1,0,2,3,4,5,6,7]
3508; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0]
3509; SSE-NEXT:    pand %xmm4, %xmm5
3510; SSE-NEXT:    por %xmm5, %xmm6
3511; SSE-NEXT:    movdqa %xmm8, %xmm5
3512; SSE-NEXT:    pandn %xmm6, %xmm5
3513; SSE-NEXT:    pshuflw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
3514; SSE-NEXT:    # xmm6 = mem[0,1,2,2,4,5,6,7]
3515; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,0,1,3]
3516; SSE-NEXT:    movdqa %xmm12, %xmm7
3517; SSE-NEXT:    pandn %xmm6, %xmm7
3518; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
3519; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm9[0,0,0,0]
3520; SSE-NEXT:    pand %xmm12, %xmm6
3521; SSE-NEXT:    por %xmm6, %xmm7
3522; SSE-NEXT:    pand %xmm8, %xmm7
3523; SSE-NEXT:    por %xmm5, %xmm7
3524; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3525; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[0,0,0,0]
3526; SSE-NEXT:    movdqa %xmm2, %xmm1
3527; SSE-NEXT:    pandn %xmm5, %xmm1
3528; SSE-NEXT:    pand %xmm2, %xmm7
3529; SSE-NEXT:    por %xmm7, %xmm1
3530; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3531; SSE-NEXT:    pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
3532; SSE-NEXT:    # xmm5 = mem[0,1,2,3,5,7,6,7]
3533; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,3,2]
3534; SSE-NEXT:    movdqa %xmm2, %xmm6
3535; SSE-NEXT:    pandn %xmm5, %xmm6
3536; SSE-NEXT:    pshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,7,6,6,7]
3537; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2]
3538; SSE-NEXT:    pand %xmm2, %xmm5
3539; SSE-NEXT:    por %xmm5, %xmm6
3540; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255]
3541; SSE-NEXT:    movdqa %xmm1, %xmm5
3542; SSE-NEXT:    pandn %xmm6, %xmm5
3543; SSE-NEXT:    pshufhw $167, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
3544; SSE-NEXT:    # xmm6 = mem[0,1,2,3,7,5,6,6]
3545; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[2,3,2,2]
3546; SSE-NEXT:    movdqa %xmm15, %xmm7
3547; SSE-NEXT:    pandn %xmm6, %xmm7
3548; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm9[3,3,3,3]
3549; SSE-NEXT:    pand %xmm15, %xmm6
3550; SSE-NEXT:    por %xmm6, %xmm7
3551; SSE-NEXT:    pand %xmm1, %xmm7
3552; SSE-NEXT:    por %xmm5, %xmm7
3553; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[3,3,3,3]
3554; SSE-NEXT:    movdqa {{.*#+}} xmm9 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0]
3555; SSE-NEXT:    movdqa %xmm9, %xmm13
3556; SSE-NEXT:    pandn %xmm5, %xmm13
3557; SSE-NEXT:    pand %xmm9, %xmm7
3558; SSE-NEXT:    por %xmm7, %xmm13
3559; SSE-NEXT:    pshuflw $225, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
3560; SSE-NEXT:    # xmm5 = mem[1,0,2,3,4,5,6,7]
3561; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,1,0,1]
3562; SSE-NEXT:    movdqa %xmm4, %xmm6
3563; SSE-NEXT:    pandn %xmm5, %xmm6
3564; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
3565; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm11[1,0,2,3,4,5,6,7]
3566; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0]
3567; SSE-NEXT:    pand %xmm4, %xmm5
3568; SSE-NEXT:    por %xmm5, %xmm6
3569; SSE-NEXT:    movdqa %xmm8, %xmm5
3570; SSE-NEXT:    pandn %xmm6, %xmm5
3571; SSE-NEXT:    pshuflw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
3572; SSE-NEXT:    # xmm6 = mem[0,1,2,2,4,5,6,7]
3573; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,0,1,3]
3574; SSE-NEXT:    movdqa %xmm12, %xmm3
3575; SSE-NEXT:    pandn %xmm6, %xmm3
3576; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3577; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[0,0,0,0]
3578; SSE-NEXT:    pand %xmm12, %xmm6
3579; SSE-NEXT:    por %xmm6, %xmm3
3580; SSE-NEXT:    pand %xmm8, %xmm3
3581; SSE-NEXT:    por %xmm5, %xmm3
3582; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3583; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[0,0,0,0]
3584; SSE-NEXT:    movdqa %xmm2, %xmm10
3585; SSE-NEXT:    pandn %xmm5, %xmm10
3586; SSE-NEXT:    pand %xmm2, %xmm3
3587; SSE-NEXT:    por %xmm3, %xmm10
3588; SSE-NEXT:    pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
3589; SSE-NEXT:    # xmm3 = mem[0,1,2,3,5,7,6,7]
3590; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,3,2]
3591; SSE-NEXT:    movdqa %xmm2, %xmm5
3592; SSE-NEXT:    pandn %xmm3, %xmm5
3593; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm11[0,1,2,3,7,6,6,7]
3594; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2]
3595; SSE-NEXT:    pand %xmm2, %xmm3
3596; SSE-NEXT:    por %xmm3, %xmm5
3597; SSE-NEXT:    movdqa {{.*#+}} xmm7 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255]
3598; SSE-NEXT:    movdqa %xmm7, %xmm3
3599; SSE-NEXT:    pandn %xmm5, %xmm3
3600; SSE-NEXT:    pshufhw $167, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
3601; SSE-NEXT:    # xmm5 = mem[0,1,2,3,7,5,6,6]
3602; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[2,3,2,2]
3603; SSE-NEXT:    movdqa %xmm15, %xmm6
3604; SSE-NEXT:    pandn %xmm5, %xmm6
3605; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[3,3,3,3]
3606; SSE-NEXT:    pand %xmm15, %xmm5
3607; SSE-NEXT:    por %xmm5, %xmm6
3608; SSE-NEXT:    pand %xmm7, %xmm6
3609; SSE-NEXT:    por %xmm3, %xmm6
3610; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
3611; SSE-NEXT:    movdqa %xmm9, %xmm7
3612; SSE-NEXT:    pandn %xmm3, %xmm7
3613; SSE-NEXT:    pand %xmm9, %xmm6
3614; SSE-NEXT:    por %xmm6, %xmm7
3615; SSE-NEXT:    pshuflw $225, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
3616; SSE-NEXT:    # xmm3 = mem[1,0,2,3,4,5,6,7]
3617; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
3618; SSE-NEXT:    movdqa %xmm4, %xmm6
3619; SSE-NEXT:    pandn %xmm3, %xmm6
3620; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
3621; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm5[1,0,2,3,4,5,6,7]
3622; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
3623; SSE-NEXT:    pand %xmm4, %xmm3
3624; SSE-NEXT:    por %xmm3, %xmm6
3625; SSE-NEXT:    movdqa %xmm8, %xmm3
3626; SSE-NEXT:    pandn %xmm6, %xmm3
3627; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm14[0,1,2,2,4,5,6,7]
3628; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,0,1,3]
3629; SSE-NEXT:    movdqa %xmm12, %xmm11
3630; SSE-NEXT:    pandn %xmm6, %xmm11
3631; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3632; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[0,0,0,0]
3633; SSE-NEXT:    pand %xmm12, %xmm6
3634; SSE-NEXT:    por %xmm6, %xmm11
3635; SSE-NEXT:    pand %xmm8, %xmm11
3636; SSE-NEXT:    por %xmm3, %xmm11
3637; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3638; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0]
3639; SSE-NEXT:    movdqa %xmm2, %xmm14
3640; SSE-NEXT:    pandn %xmm3, %xmm14
3641; SSE-NEXT:    pand %xmm2, %xmm11
3642; SSE-NEXT:    por %xmm11, %xmm14
3643; SSE-NEXT:    pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
3644; SSE-NEXT:    # xmm3 = mem[0,1,2,3,5,7,6,7]
3645; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,3,2]
3646; SSE-NEXT:    movdqa %xmm2, %xmm6
3647; SSE-NEXT:    pandn %xmm3, %xmm6
3648; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,7,6,6,7]
3649; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2]
3650; SSE-NEXT:    pand %xmm2, %xmm3
3651; SSE-NEXT:    por %xmm3, %xmm6
3652; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255]
3653; SSE-NEXT:    movdqa %xmm5, %xmm3
3654; SSE-NEXT:    pandn %xmm6, %xmm3
3655; SSE-NEXT:    pshufhw $167, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
3656; SSE-NEXT:    # xmm6 = mem[0,1,2,3,7,5,6,6]
3657; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[2,3,2,2]
3658; SSE-NEXT:    movdqa %xmm15, %xmm11
3659; SSE-NEXT:    pandn %xmm6, %xmm11
3660; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[3,3,3,3]
3661; SSE-NEXT:    pand %xmm15, %xmm6
3662; SSE-NEXT:    por %xmm6, %xmm11
3663; SSE-NEXT:    pand %xmm5, %xmm11
3664; SSE-NEXT:    por %xmm3, %xmm11
3665; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3]
3666; SSE-NEXT:    movdqa %xmm9, %xmm6
3667; SSE-NEXT:    pandn %xmm3, %xmm6
3668; SSE-NEXT:    pand %xmm9, %xmm11
3669; SSE-NEXT:    por %xmm11, %xmm6
3670; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3671; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[1,0,2,3,4,5,6,7]
3672; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
3673; SSE-NEXT:    pand %xmm4, %xmm3
3674; SSE-NEXT:    pshuflw $225, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
3675; SSE-NEXT:    # xmm11 = mem[1,0,2,3,4,5,6,7]
3676; SSE-NEXT:    pshufd {{.*#+}} xmm11 = xmm11[0,1,0,1]
3677; SSE-NEXT:    pandn %xmm11, %xmm4
3678; SSE-NEXT:    por %xmm3, %xmm4
3679; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
3680; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[0,0,0,0]
3681; SSE-NEXT:    pand %xmm12, %xmm3
3682; SSE-NEXT:    pshuflw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
3683; SSE-NEXT:    # xmm11 = mem[0,1,2,2,4,5,6,7]
3684; SSE-NEXT:    pshufd {{.*#+}} xmm11 = xmm11[0,0,1,3]
3685; SSE-NEXT:    pandn %xmm11, %xmm12
3686; SSE-NEXT:    por %xmm3, %xmm12
3687; SSE-NEXT:    pand %xmm8, %xmm12
3688; SSE-NEXT:    pandn %xmm4, %xmm8
3689; SSE-NEXT:    por %xmm12, %xmm8
3690; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
3691; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[0,0,0,0]
3692; SSE-NEXT:    movdqa %xmm2, %xmm0
3693; SSE-NEXT:    pandn %xmm3, %xmm0
3694; SSE-NEXT:    pand %xmm2, %xmm8
3695; SSE-NEXT:    por %xmm8, %xmm0
3696; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,6,7]
3697; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
3698; SSE-NEXT:    pand %xmm2, %xmm1
3699; SSE-NEXT:    pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
3700; SSE-NEXT:    # xmm3 = mem[0,1,2,3,5,7,6,7]
3701; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,3,2]
3702; SSE-NEXT:    pandn %xmm3, %xmm2
3703; SSE-NEXT:    por %xmm1, %xmm2
3704; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[3,3,3,3]
3705; SSE-NEXT:    pand %xmm15, %xmm1
3706; SSE-NEXT:    pshufhw $167, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
3707; SSE-NEXT:    # xmm3 = mem[0,1,2,3,7,5,6,6]
3708; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,2,2]
3709; SSE-NEXT:    pandn %xmm3, %xmm15
3710; SSE-NEXT:    por %xmm1, %xmm15
3711; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255]
3712; SSE-NEXT:    pand %xmm1, %xmm15
3713; SSE-NEXT:    pandn %xmm2, %xmm1
3714; SSE-NEXT:    por %xmm15, %xmm1
3715; SSE-NEXT:    pand %xmm9, %xmm1
3716; SSE-NEXT:    movdqa %xmm1, %xmm2
3717; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[3,3,3,3]
3718; SSE-NEXT:    pandn %xmm1, %xmm9
3719; SSE-NEXT:    por %xmm2, %xmm9
3720; SSE-NEXT:    movdqa %xmm9, 304(%r9)
3721; SSE-NEXT:    movdqa %xmm0, 240(%r9)
3722; SSE-NEXT:    movdqa %xmm6, 224(%r9)
3723; SSE-NEXT:    movdqa %xmm14, 160(%r9)
3724; SSE-NEXT:    movdqa %xmm7, 144(%r9)
3725; SSE-NEXT:    movdqa %xmm10, 80(%r9)
3726; SSE-NEXT:    movdqa %xmm13, 64(%r9)
3727; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3728; SSE-NEXT:    movaps %xmm0, (%r9)
3729; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3730; SSE-NEXT:    movaps %xmm0, 288(%r9)
3731; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3732; SSE-NEXT:    movaps %xmm0, 256(%r9)
3733; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3734; SSE-NEXT:    movaps %xmm0, 208(%r9)
3735; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3736; SSE-NEXT:    movaps %xmm0, 176(%r9)
3737; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3738; SSE-NEXT:    movaps %xmm0, 128(%r9)
3739; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3740; SSE-NEXT:    movaps %xmm0, 96(%r9)
3741; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3742; SSE-NEXT:    movaps %xmm0, 48(%r9)
3743; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3744; SSE-NEXT:    movaps %xmm0, 16(%r9)
3745; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3746; SSE-NEXT:    movaps %xmm0, 272(%r9)
3747; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3748; SSE-NEXT:    movaps %xmm0, 192(%r9)
3749; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3750; SSE-NEXT:    movaps %xmm0, 112(%r9)
3751; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3752; SSE-NEXT:    movaps %xmm0, 32(%r9)
3753; SSE-NEXT:    addq $504, %rsp # imm = 0x1F8
3754; SSE-NEXT:    retq
3755;
3756; AVX-LABEL: store_i8_stride5_vf64:
3757; AVX:       # %bb.0:
3758; AVX-NEXT:    subq $104, %rsp
3759; AVX-NEXT:    vmovdqa 48(%rcx), %xmm0
3760; AVX-NEXT:    vmovddup {{.*#+}} xmm14 = [128,6,128,8,0,128,7,128,128,6,128,8,0,128,7,128]
3761; AVX-NEXT:    # xmm14 = mem[0,0]
3762; AVX-NEXT:    vpshufb %xmm14, %xmm0, %xmm2
3763; AVX-NEXT:    vmovdqa 48(%rdx), %xmm1
3764; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm1[6],zero,xmm1[u,u,u,7],zero,xmm1[u,u,u,8],zero,xmm1[u,u,u,9]
3765; AVX-NEXT:    vpor %xmm2, %xmm3, %xmm3
3766; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
3767; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm9[u,6,7,u,u,u,8,9,u,u,u,10,11,u,u,u]
3768; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
3769; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0]
3770; AVX-NEXT:    vandnps %ymm3, %ymm2, %ymm4
3771; AVX-NEXT:    vmovdqa 48(%rsi), %xmm3
3772; AVX-NEXT:    vmovddup {{.*#+}} xmm10 = [128,8,0,128,7,128,9,0,128,8,0,128,7,128,9,0]
3773; AVX-NEXT:    # xmm10 = mem[0,0]
3774; AVX-NEXT:    vpshufb %xmm10, %xmm3, %xmm6
3775; AVX-NEXT:    vmovdqa 48(%rdi), %xmm5
3776; AVX-NEXT:    vmovddup {{.*#+}} xmm12 = [8,128,0,7,128,9,128,0,8,128,0,7,128,9,128,0]
3777; AVX-NEXT:    # xmm12 = mem[0,0]
3778; AVX-NEXT:    vpshufb %xmm12, %xmm5, %xmm7
3779; AVX-NEXT:    vpor %xmm6, %xmm7, %xmm6
3780; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
3781; AVX-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[6,u,u,u,9,8,u,u,u,11,10,u,u,u,13,12]
3782; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm7, %ymm6
3783; AVX-NEXT:    vandps %ymm2, %ymm6, %ymm6
3784; AVX-NEXT:    vorps %ymm4, %ymm6, %ymm4
3785; AVX-NEXT:    vextractf128 $1, %ymm4, %xmm6
3786; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[0,1],zero,xmm6[3,4,5,6],zero,xmm6[8,9,10,11],zero,xmm6[13,14,15]
3787; AVX-NEXT:    vmovdqa 48(%r8), %xmm7
3788; AVX-NEXT:    vpshufb {{.*#+}} xmm8 = zero,zero,xmm7[6],zero,zero,zero,zero,xmm7[7],zero,zero,zero,zero,xmm7[8],zero,zero,zero
3789; AVX-NEXT:    vpor %xmm6, %xmm8, %xmm6
3790; AVX-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3791; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[0,1,2],zero,xmm4[4,5,6,7],zero,xmm4[9,10,11,12],zero,xmm4[14,15]
3792; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm7[3],zero,zero,zero,zero,xmm7[4],zero,zero,zero,zero,xmm7[5],zero,zero
3793; AVX-NEXT:    vpor %xmm6, %xmm4, %xmm4
3794; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3795; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
3796; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm4[u,10,11,u,u,u,12,13,u,u,u,14,15,u,u,u]
3797; AVX-NEXT:    vmovddup {{.*#+}} xmm8 = [7,0,4,5,8,9,0,6,7,0,4,5,8,9,0,6]
3798; AVX-NEXT:    # xmm8 = mem[0,0]
3799; AVX-NEXT:    vpshufb %xmm8, %xmm4, %xmm4
3800; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm4, %ymm4
3801; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
3802; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
3803; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,10,11,u,u,u,12,13,u,u,u,14,15,u]
3804; AVX-NEXT:    vmovddup {{.*#+}} xmm15 = [2,7,6,0,5,4,9,8,2,7,6,0,5,4,9,8]
3805; AVX-NEXT:    # xmm15 = mem[0,0]
3806; AVX-NEXT:    vpshufb %xmm15, %xmm0, %xmm0
3807; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
3808; AVX-NEXT:    vmovaps {{.*#+}} ymm11 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255]
3809; AVX-NEXT:    vandnps %ymm4, %ymm11, %ymm1
3810; AVX-NEXT:    vandps %ymm0, %ymm11, %ymm0
3811; AVX-NEXT:    vorps %ymm1, %ymm0, %ymm0
3812; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
3813; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4],zero,xmm1[6,7,8,9],zero,xmm1[11,12,13,14],zero
3814; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm7[12],zero,zero,zero,zero,xmm7[13],zero,zero,zero,zero,xmm7[14],zero,zero,zero,zero,xmm7[15]
3815; AVX-NEXT:    vpor %xmm4, %xmm1, %xmm1
3816; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3817; AVX-NEXT:    vmovdqa {{.*#+}} xmm13 = [0,128,2,3,4,5,128,7,8,9,10,128,12,13,14,15]
3818; AVX-NEXT:    vpshufb %xmm13, %xmm0, %xmm0
3819; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [128,9,128,128,128,128,10,128,128,128,128,11,128,128,128,128]
3820; AVX-NEXT:    vpshufb %xmm2, %xmm7, %xmm1
3821; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
3822; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3823; AVX-NEXT:    vmovdqa 32(%rsi), %xmm0
3824; AVX-NEXT:    vmovdqa 32(%rdi), %xmm4
3825; AVX-NEXT:    vpshufb %xmm10, %xmm0, %xmm1
3826; AVX-NEXT:    vpshufb %xmm12, %xmm4, %xmm6
3827; AVX-NEXT:    vpor %xmm1, %xmm6, %xmm1
3828; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm12 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
3829; AVX-NEXT:    vpshufb %xmm8, %xmm12, %xmm6
3830; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm1, %ymm6
3831; AVX-NEXT:    vmovdqa 32(%rcx), %xmm1
3832; AVX-NEXT:    vmovdqa 32(%rdx), %xmm8
3833; AVX-NEXT:    vpshufb %xmm14, %xmm1, %xmm10
3834; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = xmm8[6],zero,xmm8[u,u,u,7],zero,xmm8[u,u,u,8],zero,xmm8[u,u,u,9]
3835; AVX-NEXT:    vpor %xmm10, %xmm14, %xmm10
3836; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm14 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15]
3837; AVX-NEXT:    vpshufb %xmm15, %xmm14, %xmm14
3838; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm10, %ymm14
3839; AVX-NEXT:    vmovaps {{.*#+}} ymm10 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255]
3840; AVX-NEXT:    vandnps %ymm6, %ymm10, %ymm6
3841; AVX-NEXT:    vandps %ymm10, %ymm14, %ymm14
3842; AVX-NEXT:    vorps %ymm6, %ymm14, %ymm14
3843; AVX-NEXT:    vextractf128 $1, %ymm14, %xmm6
3844; AVX-NEXT:    vpshufb %xmm13, %xmm6, %xmm15
3845; AVX-NEXT:    vmovdqa 32(%r8), %xmm6
3846; AVX-NEXT:    vpshufb %xmm2, %xmm6, %xmm13
3847; AVX-NEXT:    vpor %xmm13, %xmm15, %xmm2
3848; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3849; AVX-NEXT:    vpshufb {{.*#+}} xmm13 = xmm14[0,1],zero,xmm14[3,4,5,6],zero,xmm14[8,9,10,11],zero,xmm14[13,14,15]
3850; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,xmm6[6],zero,zero,zero,zero,xmm6[7],zero,zero,zero,zero,xmm6[8],zero,zero,zero
3851; AVX-NEXT:    vpor %xmm14, %xmm13, %xmm2
3852; AVX-NEXT:    vmovdqa %xmm2, (%rsp) # 16-byte Spill
3853; AVX-NEXT:    vmovddup {{.*#+}} xmm14 = [3,0,0,1,4,5,0,2,3,0,0,1,4,5,0,2]
3854; AVX-NEXT:    # xmm14 = mem[0,0]
3855; AVX-NEXT:    vpshufb %xmm14, %xmm9, %xmm2
3856; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm13 = xmm8[8],xmm1[8],xmm8[9],xmm1[9],xmm8[10],xmm1[10],xmm8[11],xmm1[11],xmm8[12],xmm1[12],xmm8[13],xmm1[13],xmm8[14],xmm1[14],xmm8[15],xmm1[15]
3857; AVX-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,10,11,u,u,u,12,13,u,u,u,14,15,u]
3858; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm13, %ymm2
3859; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
3860; AVX-NEXT:    vmovddup {{.*#+}} xmm13 = [0,1,4,5,0,2,3,6,0,1,4,5,0,2,3,6]
3861; AVX-NEXT:    # xmm13 = mem[0,0]
3862; AVX-NEXT:    vpshufb %xmm13, %xmm3, %xmm3
3863; AVX-NEXT:    vmovddup {{.*#+}} xmm15 = [0,10,11,14,15,0,12,13,0,10,11,14,15,0,12,13]
3864; AVX-NEXT:    # xmm15 = mem[0,0]
3865; AVX-NEXT:    vpshufb %xmm15, %xmm12, %xmm5
3866; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
3867; AVX-NEXT:    vandnps %ymm2, %ymm10, %ymm2
3868; AVX-NEXT:    vandps %ymm3, %ymm10, %ymm3
3869; AVX-NEXT:    vorps %ymm2, %ymm3, %ymm2
3870; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm3
3871; AVX-NEXT:    vmovdqa {{.*#+}} xmm12 = [0,1,2,3,128,5,6,7,8,128,10,11,12,13,128,15]
3872; AVX-NEXT:    vpshufb %xmm12, %xmm3, %xmm3
3873; AVX-NEXT:    vmovdqa {{.*#+}} xmm10 = [128,128,128,128,0,128,128,128,128,1,128,128,128,128,2,128]
3874; AVX-NEXT:    vpshufb %xmm10, %xmm7, %xmm5
3875; AVX-NEXT:    vmovdqa %xmm10, %xmm7
3876; AVX-NEXT:    vpor %xmm5, %xmm3, %xmm3
3877; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3878; AVX-NEXT:    vmovdqa {{.*#+}} xmm5 = [128,1,2,3,4,128,6,7,8,9,128,11,12,13,14,128]
3879; AVX-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
3880; AVX-NEXT:    vmovdqa {{.*#+}} xmm10 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15]
3881; AVX-NEXT:    vpshufb %xmm10, %xmm6, %xmm3
3882; AVX-NEXT:    vpor %xmm3, %xmm2, %xmm2
3883; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3884; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7]
3885; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[u,6,7,u,u,u,8,9,u,u,u,10,11,u,u,u]
3886; AVX-NEXT:    vpshufb %xmm14, %xmm1, %xmm1
3887; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
3888; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
3889; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
3890; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[6,u,u,u,9,8,u,u,u,11,10,u,u,u,13,12]
3891; AVX-NEXT:    vpshufb %xmm13, %xmm0, %xmm0
3892; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
3893; AVX-NEXT:    vandnps %ymm1, %ymm11, %ymm1
3894; AVX-NEXT:    vandps %ymm0, %ymm11, %ymm0
3895; AVX-NEXT:    vorps %ymm1, %ymm0, %ymm0
3896; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
3897; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,2],zero,xmm1[4,5,6,7],zero,xmm1[9,10,11,12],zero,xmm1[14,15]
3898; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[3],zero,zero,zero,zero,xmm6[4],zero,zero,zero,zero,xmm6[5],zero,zero
3899; AVX-NEXT:    vpor %xmm2, %xmm1, %xmm1
3900; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3901; AVX-NEXT:    vpshufb %xmm12, %xmm0, %xmm0
3902; AVX-NEXT:    vpshufb %xmm7, %xmm6, %xmm1
3903; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
3904; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3905; AVX-NEXT:    vmovdqa 16(%rsi), %xmm8
3906; AVX-NEXT:    vmovdqa 16(%rdi), %xmm6
3907; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15]
3908; AVX-NEXT:    vpshufb %xmm15, %xmm0, %xmm1
3909; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,4,5,u,u,u,6,7,u,u,u,8,9,u,u]
3910; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
3911; AVX-NEXT:    vmovdqa 16(%rcx), %xmm1
3912; AVX-NEXT:    vmovdqa 16(%rdx), %xmm2
3913; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
3914; AVX-NEXT:    vmovddup {{.*#+}} xmm12 = [12,13,0,10,11,14,15,0,12,13,0,10,11,14,15,0]
3915; AVX-NEXT:    # xmm12 = mem[0,0]
3916; AVX-NEXT:    vpshufb %xmm12, %xmm3, %xmm3
3917; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
3918; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[2,u,u,u,5,4,u,u,u,7,6,u,u,u,9,8]
3919; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
3920; AVX-NEXT:    vandnps %ymm0, %ymm11, %ymm0
3921; AVX-NEXT:    vandps %ymm3, %ymm11, %ymm3
3922; AVX-NEXT:    vorps %ymm0, %ymm3, %ymm3
3923; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm0
3924; AVX-NEXT:    vpshufb %xmm5, %xmm0, %xmm4
3925; AVX-NEXT:    vmovdqa %xmm5, %xmm11
3926; AVX-NEXT:    vmovdqa 16(%r8), %xmm0
3927; AVX-NEXT:    vpshufb %xmm10, %xmm0, %xmm7
3928; AVX-NEXT:    vpor %xmm7, %xmm4, %xmm4
3929; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3930; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0],zero,xmm3[2,3,4,5],zero,xmm3[7,8,9,10],zero,xmm3[12,13,14,15]
3931; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = zero,xmm0[9],zero,zero,zero,zero,xmm0[10],zero,zero,zero,zero,xmm0[11],zero,zero,zero,zero
3932; AVX-NEXT:    vpor %xmm4, %xmm3, %xmm3
3933; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3934; AVX-NEXT:    vmovdqa (%rcx), %xmm9
3935; AVX-NEXT:    vmovdqa (%rdx), %xmm7
3936; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15]
3937; AVX-NEXT:    vpshufb %xmm12, %xmm3, %xmm3
3938; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
3939; AVX-NEXT:    vpshufb %xmm14, %xmm4, %xmm5
3940; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm3, %ymm12
3941; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
3942; AVX-NEXT:    vpshufb %xmm13, %xmm3, %xmm13
3943; AVX-NEXT:    vmovdqa (%rsi), %xmm5
3944; AVX-NEXT:    vmovdqa (%rdi), %xmm3
3945; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
3946; AVX-NEXT:    vpshufb %xmm15, %xmm10, %xmm14
3947; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm14, %ymm13
3948; AVX-NEXT:    vmovaps {{.*#+}} ymm14 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255]
3949; AVX-NEXT:    vandnps %ymm12, %ymm14, %ymm12
3950; AVX-NEXT:    vandps %ymm14, %ymm13, %ymm13
3951; AVX-NEXT:    vorps %ymm12, %ymm13, %ymm12
3952; AVX-NEXT:    vextractf128 $1, %ymm12, %xmm13
3953; AVX-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[0,1,2,3],zero,xmm13[5,6,7,8],zero,xmm13[10,11,12,13],zero,xmm13[15]
3954; AVX-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,xmm0[2],zero
3955; AVX-NEXT:    vpor %xmm14, %xmm13, %xmm13
3956; AVX-NEXT:    vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3957; AVX-NEXT:    vpshufb %xmm11, %xmm12, %xmm12
3958; AVX-NEXT:    vmovdqa (%r8), %xmm13
3959; AVX-NEXT:    vpshufb {{.*#+}} xmm15 = xmm13[12],zero,zero,zero,zero,xmm13[13],zero,zero,zero,zero,xmm13[14],zero,zero,zero,zero,xmm13[15]
3960; AVX-NEXT:    vpor %xmm15, %xmm12, %xmm11
3961; AVX-NEXT:    vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3962; AVX-NEXT:    vmovddup {{.*#+}} xmm11 = [128,6,128,8,0,128,7,128,128,6,128,8,0,128,7,128]
3963; AVX-NEXT:    # xmm11 = mem[0,0]
3964; AVX-NEXT:    vpshufb %xmm11, %xmm1, %xmm1
3965; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[6],zero,xmm2[u,u,u,7],zero,xmm2[u,u,u,8],zero,xmm2[u,u,u,9]
3966; AVX-NEXT:    vpor %xmm1, %xmm2, %xmm1
3967; AVX-NEXT:    vmovddup {{.*#+}} xmm11 = [0,6,7,10,11,0,8,9,0,6,7,10,11,0,8,9]
3968; AVX-NEXT:    # xmm11 = mem[0,0]
3969; AVX-NEXT:    vpshufb %xmm11, %xmm4, %xmm2
3970; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
3971; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm8[u,u,u],zero,xmm8[7,u,u,u],zero,xmm8[8,u,u,u],zero,xmm8[9,u]
3972; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,7],zero,xmm6[u,u,u,8],zero,xmm6[u,u,u,9],zero,xmm6[u]
3973; AVX-NEXT:    vpor %xmm2, %xmm4, %xmm2
3974; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
3975; AVX-NEXT:    vmovddup {{.*#+}} xmm6 = [6,11,10,0,9,8,13,12,6,11,10,0,9,8,13,12]
3976; AVX-NEXT:    # xmm6 = mem[0,0]
3977; AVX-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
3978; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm4, %ymm2
3979; AVX-NEXT:    vmovaps {{.*#+}} ymm4 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0]
3980; AVX-NEXT:    vandnps %ymm1, %ymm4, %ymm1
3981; AVX-NEXT:    vandps %ymm4, %ymm2, %ymm2
3982; AVX-NEXT:    vorps %ymm1, %ymm2, %ymm2
3983; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
3984; AVX-NEXT:    vmovdqa {{.*#+}} xmm14 = [0,1,128,3,4,5,6,128,8,9,10,11,128,13,14,15]
3985; AVX-NEXT:    vpshufb %xmm14, %xmm1, %xmm1
3986; AVX-NEXT:    vmovdqa {{.*#+}} xmm15 = [128,128,6,128,128,128,128,7,128,128,128,128,8,128,128,128]
3987; AVX-NEXT:    vpshufb %xmm15, %xmm0, %xmm4
3988; AVX-NEXT:    vpor %xmm4, %xmm1, %xmm12
3989; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,128,4,5,6,7,128,9,10,11,12,128,14,15]
3990; AVX-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
3991; AVX-NEXT:    vmovdqa {{.*#+}} xmm8 = [128,128,128,3,128,128,128,128,4,128,128,128,128,5,128,128]
3992; AVX-NEXT:    vpshufb %xmm8, %xmm0, %xmm0
3993; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
3994; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7]
3995; AVX-NEXT:    vpshufb %xmm11, %xmm2, %xmm4
3996; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,0,1,u,u,u,2,3,u,u,u,4,5,u,u]
3997; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
3998; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
3999; AVX-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
4000; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
4001; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[0,1,u,u,u,2,3,u,u,u,4,5,u,u,u,6]
4002; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm6, %ymm4
4003; AVX-NEXT:    vmovaps {{.*#+}} ymm6 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255]
4004; AVX-NEXT:    vandnps %ymm2, %ymm6, %ymm2
4005; AVX-NEXT:    vandps %ymm6, %ymm4, %ymm4
4006; AVX-NEXT:    vorps %ymm2, %ymm4, %ymm4
4007; AVX-NEXT:    vextractf128 $1, %ymm4, %xmm2
4008; AVX-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
4009; AVX-NEXT:    vpshufb %xmm8, %xmm13, %xmm6
4010; AVX-NEXT:    vpor %xmm6, %xmm2, %xmm2
4011; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3],zero,xmm4[5,6,7,8],zero,xmm4[10,11,12,13],zero,xmm4[15]
4012; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm13[0],zero,zero,zero,zero,xmm13[1],zero,zero,zero,zero,xmm13[2],zero
4013; AVX-NEXT:    vpor %xmm6, %xmm4, %xmm4
4014; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,u],zero,xmm5[7,u,u,u],zero,xmm5[8,u,u,u],zero,xmm5[9,u]
4015; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,7],zero,xmm3[u,u,u,8],zero,xmm3[u,u,u,9],zero,xmm3[u]
4016; AVX-NEXT:    vpor %xmm5, %xmm3, %xmm3
4017; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm10[u,u,4,5,u,u,u,6,7,u,u,u,8,9,u,u]
4018; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm3, %ymm3
4019; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = zero,xmm9[6,u,u,u],zero,xmm9[7,u,u,u],zero,xmm9[8,u,u,u],zero
4020; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm7[6],zero,xmm7[u,u,u,7],zero,xmm7[u,u,u,8],zero,xmm7[u,u,u,9]
4021; AVX-NEXT:    vpor %xmm5, %xmm6, %xmm5
4022; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15]
4023; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[2,u,u,u,5,4,u,u,u,7,6,u,u,u,9,8]
4024; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm5, %ymm5
4025; AVX-NEXT:    vmovaps {{.*#+}} ymm1 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255]
4026; AVX-NEXT:    vandnps %ymm3, %ymm1, %ymm3
4027; AVX-NEXT:    vandps %ymm1, %ymm5, %ymm5
4028; AVX-NEXT:    vorps %ymm3, %ymm5, %ymm3
4029; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm5
4030; AVX-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[0],zero,xmm5[2,3,4,5],zero,xmm5[7,8,9,10],zero,xmm5[12,13,14,15]
4031; AVX-NEXT:    vpshufb {{.*#+}} xmm6 = zero,xmm13[9],zero,zero,zero,zero,xmm13[10],zero,zero,zero,zero,xmm13[11],zero,zero,zero,zero
4032; AVX-NEXT:    vpor %xmm6, %xmm5, %xmm5
4033; AVX-NEXT:    vpshufb %xmm14, %xmm3, %xmm3
4034; AVX-NEXT:    vpshufb %xmm15, %xmm13, %xmm6
4035; AVX-NEXT:    vpor %xmm6, %xmm3, %xmm3
4036; AVX-NEXT:    vmovdqa %xmm3, 32(%r9)
4037; AVX-NEXT:    vmovdqa %xmm5, 48(%r9)
4038; AVX-NEXT:    vmovdqa %xmm4, (%r9)
4039; AVX-NEXT:    vmovdqa %xmm2, 16(%r9)
4040; AVX-NEXT:    vmovdqa %xmm0, 96(%r9)
4041; AVX-NEXT:    vmovdqa %xmm12, 112(%r9)
4042; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4043; AVX-NEXT:    vmovaps %xmm0, 64(%r9)
4044; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4045; AVX-NEXT:    vmovaps %xmm0, 80(%r9)
4046; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4047; AVX-NEXT:    vmovaps %xmm0, 128(%r9)
4048; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4049; AVX-NEXT:    vmovaps %xmm0, 144(%r9)
4050; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4051; AVX-NEXT:    vmovaps %xmm0, 160(%r9)
4052; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4053; AVX-NEXT:    vmovaps %xmm0, 176(%r9)
4054; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4055; AVX-NEXT:    vmovaps %xmm0, 224(%r9)
4056; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4057; AVX-NEXT:    vmovaps %xmm0, 240(%r9)
4058; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
4059; AVX-NEXT:    vmovaps %xmm0, 192(%r9)
4060; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4061; AVX-NEXT:    vmovaps %xmm0, 208(%r9)
4062; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4063; AVX-NEXT:    vmovaps %xmm0, 288(%r9)
4064; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4065; AVX-NEXT:    vmovaps %xmm0, 304(%r9)
4066; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4067; AVX-NEXT:    vmovaps %xmm0, 256(%r9)
4068; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4069; AVX-NEXT:    vmovaps %xmm0, 272(%r9)
4070; AVX-NEXT:    addq $104, %rsp
4071; AVX-NEXT:    vzeroupper
4072; AVX-NEXT:    retq
4073;
4074; AVX2-LABEL: store_i8_stride5_vf64:
4075; AVX2:       # %bb.0:
4076; AVX2-NEXT:    subq $248, %rsp
4077; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm13
4078; AVX2-NEXT:    vmovdqa (%rcx), %xmm1
4079; AVX2-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4080; AVX2-NEXT:    vmovdqa 32(%rcx), %xmm7
4081; AVX2-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4082; AVX2-NEXT:    vmovdqa {{.*#+}} xmm0 = [128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12]
4083; AVX2-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
4084; AVX2-NEXT:    vmovdqa (%rdx), %xmm3
4085; AVX2-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4086; AVX2-NEXT:    vmovdqa 32(%rdx), %xmm10
4087; AVX2-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4088; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128]
4089; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
4090; AVX2-NEXT:    vpor %xmm1, %xmm3, %xmm1
4091; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1]
4092; AVX2-NEXT:    vmovdqa (%rdi), %xmm5
4093; AVX2-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4094; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11]
4095; AVX2-NEXT:    vpshufb %xmm3, %xmm5, %xmm5
4096; AVX2-NEXT:    vmovdqa (%rsi), %xmm6
4097; AVX2-NEXT:    vmovdqa {{.*#+}} xmm8 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128]
4098; AVX2-NEXT:    vpshufb %xmm8, %xmm6, %xmm9
4099; AVX2-NEXT:    vpor %xmm5, %xmm9, %xmm5
4100; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1]
4101; AVX2-NEXT:    vmovdqa {{.*#+}} ymm9 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255]
4102; AVX2-NEXT:    vpblendvb %ymm9, %ymm1, %ymm5, %ymm1
4103; AVX2-NEXT:    vmovdqa (%r8), %xmm5
4104; AVX2-NEXT:    vmovdqa %xmm5, (%rsp) # 16-byte Spill
4105; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
4106; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,1,1,1]
4107; AVX2-NEXT:    vmovdqa {{.*#+}} ymm12 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
4108; AVX2-NEXT:    vpblendvb %ymm12, %ymm1, %ymm5, %ymm1
4109; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4110; AVX2-NEXT:    vmovdqa 32(%rdi), %xmm5
4111; AVX2-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4112; AVX2-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
4113; AVX2-NEXT:    vpshufb %xmm2, %xmm10, %xmm1
4114; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
4115; AVX2-NEXT:    vmovdqa 32(%rsi), %xmm2
4116; AVX2-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4117; AVX2-NEXT:    vpshufb %xmm3, %xmm5, %xmm1
4118; AVX2-NEXT:    vpshufb %xmm8, %xmm2, %xmm2
4119; AVX2-NEXT:    vpor %xmm1, %xmm2, %xmm1
4120; AVX2-NEXT:    vmovdqa 32(%rsi), %ymm11
4121; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
4122; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1]
4123; AVX2-NEXT:    vpblendvb %ymm9, %ymm0, %ymm1, %ymm0
4124; AVX2-NEXT:    vmovdqa 32(%r8), %xmm1
4125; AVX2-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4126; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2]
4127; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,1]
4128; AVX2-NEXT:    vpblendvb %ymm12, %ymm0, %ymm1, %ymm0
4129; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4130; AVX2-NEXT:    vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,20,128,22,128,24,128,22,23,128,25,128,23]
4131; AVX2-NEXT:    vpshufb %ymm15, %ymm13, %ymm1
4132; AVX2-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4133; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128]
4134; AVX2-NEXT:    # ymm4 = mem[0,1,0,1]
4135; AVX2-NEXT:    vpshufb %ymm4, %ymm11, %ymm3
4136; AVX2-NEXT:    vpor %ymm1, %ymm3, %ymm1
4137; AVX2-NEXT:    vmovdqa 32(%rdx), %ymm12
4138; AVX2-NEXT:    vmovdqa 32(%rcx), %ymm14
4139; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128]
4140; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
4141; AVX2-NEXT:    vpshufb %ymm3, %ymm14, %ymm8
4142; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25]
4143; AVX2-NEXT:    vpshufb %ymm5, %ymm12, %ymm10
4144; AVX2-NEXT:    vpor %ymm8, %ymm10, %ymm8
4145; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3]
4146; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3]
4147; AVX2-NEXT:    vmovdqa {{.*#+}} ymm10 = [255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0]
4148; AVX2-NEXT:    vpblendvb %ymm10, %ymm1, %ymm8, %ymm2
4149; AVX2-NEXT:    vmovdqa (%rdi), %ymm9
4150; AVX2-NEXT:    vpshufb %ymm15, %ymm9, %ymm1
4151; AVX2-NEXT:    vmovdqa (%rsi), %ymm15
4152; AVX2-NEXT:    vpshufb %ymm4, %ymm15, %ymm4
4153; AVX2-NEXT:    vpor %ymm1, %ymm4, %ymm4
4154; AVX2-NEXT:    vmovdqa (%rcx), %ymm7
4155; AVX2-NEXT:    vpshufb %ymm3, %ymm7, %ymm0
4156; AVX2-NEXT:    vmovdqa (%rdx), %ymm3
4157; AVX2-NEXT:    vpshufb %ymm5, %ymm3, %ymm5
4158; AVX2-NEXT:    vpor %ymm0, %ymm5, %ymm0
4159; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3]
4160; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
4161; AVX2-NEXT:    vpblendvb %ymm10, %ymm4, %ymm0, %ymm0
4162; AVX2-NEXT:    vmovdqa 32(%r8), %ymm10
4163; AVX2-NEXT:    vpshufd {{.*#+}} ymm4 = ymm10[0,2,1,1,4,6,5,5]
4164; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,3,3,2]
4165; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255]
4166; AVX2-NEXT:    vpblendvb %ymm5, %ymm2, %ymm4, %ymm1
4167; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4168; AVX2-NEXT:    vmovdqa (%r8), %ymm8
4169; AVX2-NEXT:    vpshufd {{.*#+}} ymm4 = ymm8[0,2,1,1,4,6,5,5]
4170; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,3,3,2]
4171; AVX2-NEXT:    vpblendvb %ymm5, %ymm0, %ymm4, %ymm0
4172; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4173; AVX2-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [3,3,3,0,4,4,4,4]
4174; AVX2-NEXT:    vpermd %ymm13, %ymm2, %ymm4
4175; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14]
4176; AVX2-NEXT:    vpshufb %ymm5, %ymm11, %ymm0
4177; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255]
4178; AVX2-NEXT:    vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
4179; AVX2-NEXT:    vpermd %ymm9, %ymm2, %ymm2
4180; AVX2-NEXT:    vpshufb %ymm5, %ymm15, %ymm4
4181; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm4, %ymm1
4182; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128]
4183; AVX2-NEXT:    vpshufb %ymm2, %ymm14, %ymm4
4184; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128]
4185; AVX2-NEXT:    vpshufb %ymm5, %ymm12, %ymm13
4186; AVX2-NEXT:    vpor %ymm4, %ymm13, %ymm4
4187; AVX2-NEXT:    vmovdqa {{.*#+}} ymm13 = [u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255]
4188; AVX2-NEXT:    vpblendvb %ymm13, %ymm0, %ymm4, %ymm0
4189; AVX2-NEXT:    vpshufb %ymm2, %ymm7, %ymm2
4190; AVX2-NEXT:    vpshufb %ymm5, %ymm3, %ymm4
4191; AVX2-NEXT:    vpor %ymm2, %ymm4, %ymm2
4192; AVX2-NEXT:    vpblendvb %ymm13, %ymm1, %ymm2, %ymm1
4193; AVX2-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [3,3,3,3,0,4,4,4]
4194; AVX2-NEXT:    vpermd %ymm10, %ymm2, %ymm4
4195; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
4196; AVX2-NEXT:    vpblendvb %ymm5, %ymm0, %ymm4, %ymm0
4197; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4198; AVX2-NEXT:    vpermd %ymm8, %ymm2, %ymm0
4199; AVX2-NEXT:    vpblendvb %ymm5, %ymm1, %ymm0, %ymm0
4200; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4201; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4202; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
4203; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4204; AVX2-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
4205; AVX2-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
4206; AVX2-NEXT:    vmovdqa {{.*#+}} xmm13 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
4207; AVX2-NEXT:    vpshufb %xmm13, %xmm0, %xmm0
4208; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[0,0,1,1]
4209; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
4210; AVX2-NEXT:    vpshufb %xmm6, %xmm1, %xmm0
4211; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,0,1,1]
4212; AVX2-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255]
4213; AVX2-NEXT:    vpblendvb %ymm0, %ymm2, %ymm1, %ymm4
4214; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4215; AVX2-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
4216; AVX2-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
4217; AVX2-NEXT:    vpshufb %xmm13, %xmm1, %xmm1
4218; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4219; AVX2-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
4220; AVX2-NEXT:    # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
4221; AVX2-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
4222; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1]
4223; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1]
4224; AVX2-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
4225; AVX2-NEXT:    vpshufd $80, (%rsp), %xmm1 # 16-byte Folded Reload
4226; AVX2-NEXT:    # xmm1 = mem[0,0,1,1]
4227; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1]
4228; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
4229; AVX2-NEXT:    vpblendvb %ymm2, %ymm4, %ymm1, %ymm4
4230; AVX2-NEXT:    vpshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4231; AVX2-NEXT:    # xmm1 = mem[0,0,1,1]
4232; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1]
4233; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm6
4234; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12]
4235; AVX2-NEXT:    vpshufb %ymm0, %ymm14, %ymm1
4236; AVX2-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14]
4237; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,2,3,3,6,6,7,7]
4238; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [255,0,255,0,0,255,0,255,0,255,0,0,255,0,255,0,255,0,255,0,0,255,0,255,0,255,0,0,255,0,255,0]
4239; AVX2-NEXT:    # ymm5 = mem[0,1,0,1]
4240; AVX2-NEXT:    vpblendvb %ymm5, %ymm1, %ymm2, %ymm1
4241; AVX2-NEXT:    vpshufb %ymm0, %ymm7, %ymm0
4242; AVX2-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14]
4243; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,2,3,3,6,6,7,7]
4244; AVX2-NEXT:    vpblendvb %ymm5, %ymm0, %ymm2, %ymm0
4245; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14]
4246; AVX2-NEXT:    vpshufb %ymm2, %ymm11, %ymm3
4247; AVX2-NEXT:    vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
4248; AVX2-NEXT:    # ymm5 = mem[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15]
4249; AVX2-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[2,2,3,3,6,6,7,7]
4250; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [255,0,0,255,0,255,0,0,0,0,255,0,255,0,0,255,255,0,0,255,0,255,0,0,0,0,255,0,255,0,0,255]
4251; AVX2-NEXT:    # ymm7 = mem[0,1,0,1]
4252; AVX2-NEXT:    vpblendvb %ymm7, %ymm3, %ymm5, %ymm3
4253; AVX2-NEXT:    vpshufb %ymm2, %ymm15, %ymm2
4254; AVX2-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm9[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15]
4255; AVX2-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[2,2,3,3,6,6,7,7]
4256; AVX2-NEXT:    vpblendvb %ymm7, %ymm2, %ymm5, %ymm2
4257; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3]
4258; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3]
4259; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u]
4260; AVX2-NEXT:    vpblendvb %ymm5, %ymm1, %ymm3, %ymm1
4261; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
4262; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3]
4263; AVX2-NEXT:    vpblendvb %ymm5, %ymm0, %ymm2, %ymm0
4264; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm10[2,2,3,3,6,6,7,7]
4265; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3]
4266; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0]
4267; AVX2-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
4268; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm8[2,2,3,3,6,6,7,7]
4269; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3]
4270; AVX2-NEXT:    vpblendvb %ymm3, %ymm0, %ymm2, %ymm0
4271; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4272; AVX2-NEXT:    vmovaps %ymm2, 64(%r9)
4273; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4274; AVX2-NEXT:    vmovaps %ymm2, 224(%r9)
4275; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4276; AVX2-NEXT:    vmovaps %ymm2, 96(%r9)
4277; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4278; AVX2-NEXT:    vmovaps %ymm2, 256(%r9)
4279; AVX2-NEXT:    vmovdqa %ymm0, 128(%r9)
4280; AVX2-NEXT:    vmovdqa %ymm6, 160(%r9)
4281; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4282; AVX2-NEXT:    vmovaps %ymm0, 192(%r9)
4283; AVX2-NEXT:    vmovdqa %ymm1, 288(%r9)
4284; AVX2-NEXT:    vmovdqa %ymm4, (%r9)
4285; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4286; AVX2-NEXT:    vmovaps %ymm0, 32(%r9)
4287; AVX2-NEXT:    addq $248, %rsp
4288; AVX2-NEXT:    vzeroupper
4289; AVX2-NEXT:    retq
4290;
4291; AVX2-FP-LABEL: store_i8_stride5_vf64:
4292; AVX2-FP:       # %bb.0:
4293; AVX2-FP-NEXT:    subq $200, %rsp
4294; AVX2-FP-NEXT:    vmovdqa 32(%rdx), %ymm12
4295; AVX2-FP-NEXT:    vmovdqa (%rcx), %xmm1
4296; AVX2-FP-NEXT:    vmovdqa %xmm1, (%rsp) # 16-byte Spill
4297; AVX2-FP-NEXT:    vmovdqa 32(%rcx), %xmm8
4298; AVX2-FP-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4299; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm0 = [128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12]
4300; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
4301; AVX2-FP-NEXT:    vmovdqa (%rdx), %xmm3
4302; AVX2-FP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4303; AVX2-FP-NEXT:    vmovdqa 32(%rdx), %xmm9
4304; AVX2-FP-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4305; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128]
4306; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
4307; AVX2-FP-NEXT:    vpor %xmm1, %xmm3, %xmm1
4308; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1]
4309; AVX2-FP-NEXT:    vmovdqa (%rdi), %xmm4
4310; AVX2-FP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4311; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11]
4312; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm4, %xmm4
4313; AVX2-FP-NEXT:    vmovdqa (%rsi), %xmm6
4314; AVX2-FP-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4315; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm5 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128]
4316; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm6, %xmm6
4317; AVX2-FP-NEXT:    vpor %xmm4, %xmm6, %xmm4
4318; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1]
4319; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm6 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255]
4320; AVX2-FP-NEXT:    vpblendvb %ymm6, %ymm1, %ymm4, %ymm1
4321; AVX2-FP-NEXT:    vmovdqa (%r8), %xmm4
4322; AVX2-FP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4323; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2]
4324; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1]
4325; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
4326; AVX2-FP-NEXT:    vpblendvb %ymm7, %ymm1, %ymm4, %ymm1
4327; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4328; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %xmm4
4329; AVX2-FP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4330; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm8, %xmm0
4331; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm9, %xmm1
4332; AVX2-FP-NEXT:    vpor %xmm0, %xmm1, %xmm0
4333; AVX2-FP-NEXT:    vmovdqa 32(%rsi), %xmm2
4334; AVX2-FP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4335; AVX2-FP-NEXT:    vpshufb %xmm3, %xmm4, %xmm1
4336; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
4337; AVX2-FP-NEXT:    vpor %xmm1, %xmm2, %xmm1
4338; AVX2-FP-NEXT:    vmovdqa 32(%rcx), %ymm14
4339; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
4340; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1]
4341; AVX2-FP-NEXT:    vpblendvb %ymm6, %ymm0, %ymm1, %ymm0
4342; AVX2-FP-NEXT:    vmovdqa 32(%r8), %xmm1
4343; AVX2-FP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4344; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2]
4345; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,1]
4346; AVX2-FP-NEXT:    vpblendvb %ymm7, %ymm0, %ymm1, %ymm0
4347; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4348; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,29,26,128,28,128,30,128,28,29,128,31,128,29]
4349; AVX2-FP-NEXT:    vpshufb %ymm0, %ymm12, %ymm1
4350; AVX2-FP-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128]
4351; AVX2-FP-NEXT:    # ymm8 = mem[0,1,0,1]
4352; AVX2-FP-NEXT:    vpshufb %ymm8, %ymm14, %ymm3
4353; AVX2-FP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4354; AVX2-FP-NEXT:    vpor %ymm1, %ymm3, %ymm1
4355; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm4
4356; AVX2-FP-NEXT:    vmovdqa 32(%rsi), %ymm11
4357; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,29,26,128,28,128,26,27,28,29,128,31,128,29,30,128]
4358; AVX2-FP-NEXT:    vpshufb %ymm5, %ymm4, %ymm6
4359; AVX2-FP-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30]
4360; AVX2-FP-NEXT:    # ymm9 = mem[0,1,0,1]
4361; AVX2-FP-NEXT:    vpshufb %ymm9, %ymm11, %ymm7
4362; AVX2-FP-NEXT:    vpor %ymm6, %ymm7, %ymm6
4363; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3]
4364; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3]
4365; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm10 = [255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u]
4366; AVX2-FP-NEXT:    vpblendvb %ymm10, %ymm1, %ymm6, %ymm3
4367; AVX2-FP-NEXT:    vmovdqa (%rdx), %ymm13
4368; AVX2-FP-NEXT:    vpshufb %ymm0, %ymm13, %ymm0
4369; AVX2-FP-NEXT:    vmovdqa (%rcx), %ymm7
4370; AVX2-FP-NEXT:    vpshufb %ymm8, %ymm7, %ymm1
4371; AVX2-FP-NEXT:    vpor %ymm0, %ymm1, %ymm8
4372; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm2
4373; AVX2-FP-NEXT:    vpshufb %ymm5, %ymm2, %ymm5
4374; AVX2-FP-NEXT:    vmovdqa (%rsi), %ymm1
4375; AVX2-FP-NEXT:    vpshufb %ymm9, %ymm1, %ymm9
4376; AVX2-FP-NEXT:    vpor %ymm5, %ymm9, %ymm5
4377; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3]
4378; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3]
4379; AVX2-FP-NEXT:    vpblendvb %ymm10, %ymm8, %ymm5, %ymm8
4380; AVX2-FP-NEXT:    vmovdqa 32(%r8), %ymm5
4381; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm5[2,2,3,3,6,6,7,7]
4382; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3]
4383; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm10 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0]
4384; AVX2-FP-NEXT:    vpblendvb %ymm10, %ymm3, %ymm9, %ymm0
4385; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4386; AVX2-FP-NEXT:    vmovdqa (%r8), %ymm3
4387; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm3[2,2,3,3,6,6,7,7]
4388; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3]
4389; AVX2-FP-NEXT:    vpblendvb %ymm10, %ymm8, %ymm9, %ymm0
4390; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4391; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,20,128,22,128,24,128,22,23,128,25,128,23]
4392; AVX2-FP-NEXT:    vpshufb %ymm8, %ymm4, %ymm9
4393; AVX2-FP-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128]
4394; AVX2-FP-NEXT:    # ymm10 = mem[0,1,0,1]
4395; AVX2-FP-NEXT:    vpshufb %ymm10, %ymm11, %ymm15
4396; AVX2-FP-NEXT:    vpor %ymm9, %ymm15, %ymm9
4397; AVX2-FP-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128]
4398; AVX2-FP-NEXT:    # ymm15 = mem[0,1,0,1]
4399; AVX2-FP-NEXT:    vpshufb %ymm15, %ymm14, %ymm0
4400; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25]
4401; AVX2-FP-NEXT:    vpshufb %ymm6, %ymm12, %ymm14
4402; AVX2-FP-NEXT:    vpor %ymm0, %ymm14, %ymm0
4403; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3]
4404; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
4405; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm14 = [255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0]
4406; AVX2-FP-NEXT:    vpblendvb %ymm14, %ymm9, %ymm0, %ymm0
4407; AVX2-FP-NEXT:    vpshufb %ymm8, %ymm2, %ymm8
4408; AVX2-FP-NEXT:    vpshufb %ymm10, %ymm1, %ymm9
4409; AVX2-FP-NEXT:    vpor %ymm8, %ymm9, %ymm8
4410; AVX2-FP-NEXT:    vpshufb %ymm15, %ymm7, %ymm9
4411; AVX2-FP-NEXT:    vpshufb %ymm6, %ymm13, %ymm6
4412; AVX2-FP-NEXT:    vpor %ymm6, %ymm9, %ymm6
4413; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3]
4414; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3]
4415; AVX2-FP-NEXT:    vpblendvb %ymm14, %ymm8, %ymm6, %ymm6
4416; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm5[0,2,1,1,4,6,5,5]
4417; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,3,3,2]
4418; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm9 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255]
4419; AVX2-FP-NEXT:    vpblendvb %ymm9, %ymm0, %ymm8, %ymm10
4420; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm3[0,2,1,1,4,6,5,5]
4421; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,3,2]
4422; AVX2-FP-NEXT:    vpblendvb %ymm9, %ymm6, %ymm0, %ymm9
4423; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [3,3,3,0,4,4,4,4]
4424; AVX2-FP-NEXT:    vpermd %ymm4, %ymm0, %ymm4
4425; AVX2-FP-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14]
4426; AVX2-FP-NEXT:    vpshufb %ymm6, %ymm11, %ymm8
4427; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm11 = [u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255]
4428; AVX2-FP-NEXT:    vpblendvb %ymm11, %ymm4, %ymm8, %ymm4
4429; AVX2-FP-NEXT:    vpermd %ymm2, %ymm0, %ymm0
4430; AVX2-FP-NEXT:    vpshufb %ymm6, %ymm1, %ymm1
4431; AVX2-FP-NEXT:    vpblendvb %ymm11, %ymm0, %ymm1, %ymm0
4432; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128]
4433; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4434; AVX2-FP-NEXT:    vpshufb %ymm1, %ymm2, %ymm2
4435; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm6 = [128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128]
4436; AVX2-FP-NEXT:    vpshufb %ymm6, %ymm12, %ymm8
4437; AVX2-FP-NEXT:    vpor %ymm2, %ymm8, %ymm2
4438; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm8 = [u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255]
4439; AVX2-FP-NEXT:    vpblendvb %ymm8, %ymm4, %ymm2, %ymm2
4440; AVX2-FP-NEXT:    vpshufb %ymm1, %ymm7, %ymm1
4441; AVX2-FP-NEXT:    vpshufb %ymm6, %ymm13, %ymm4
4442; AVX2-FP-NEXT:    vpor %ymm1, %ymm4, %ymm1
4443; AVX2-FP-NEXT:    vpblendvb %ymm8, %ymm0, %ymm1, %ymm1
4444; AVX2-FP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [3,3,3,3,0,4,4,4]
4445; AVX2-FP-NEXT:    vpermd %ymm5, %ymm4, %ymm0
4446; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
4447; AVX2-FP-NEXT:    vpblendvb %ymm5, %ymm2, %ymm0, %ymm0
4448; AVX2-FP-NEXT:    vpermd %ymm3, %ymm4, %ymm2
4449; AVX2-FP-NEXT:    vpblendvb %ymm5, %ymm1, %ymm2, %ymm1
4450; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4451; AVX2-FP-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
4452; AVX2-FP-NEXT:    # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
4453; AVX2-FP-NEXT:    vmovdqa (%rsp), %xmm3 # 16-byte Reload
4454; AVX2-FP-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
4455; AVX2-FP-NEXT:    # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
4456; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
4457; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
4458; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1]
4459; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm5 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
4460; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
4461; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1]
4462; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm6 = [255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255]
4463; AVX2-FP-NEXT:    vpblendvb %ymm6, %ymm2, %ymm3, %ymm2
4464; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4465; AVX2-FP-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
4466; AVX2-FP-NEXT:    # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
4467; AVX2-FP-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
4468; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4469; AVX2-FP-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
4470; AVX2-FP-NEXT:    # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3],xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7]
4471; AVX2-FP-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
4472; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1]
4473; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1]
4474; AVX2-FP-NEXT:    vpblendvb %ymm6, %ymm3, %ymm4, %ymm3
4475; AVX2-FP-NEXT:    vpshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4476; AVX2-FP-NEXT:    # xmm4 = mem[0,0,1,1]
4477; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1]
4478; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
4479; AVX2-FP-NEXT:    vpblendvb %ymm5, %ymm2, %ymm4, %ymm2
4480; AVX2-FP-NEXT:    vpshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4481; AVX2-FP-NEXT:    # xmm4 = mem[0,0,1,1]
4482; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1]
4483; AVX2-FP-NEXT:    vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
4484; AVX2-FP-NEXT:    vmovdqa %ymm1, 64(%r9)
4485; AVX2-FP-NEXT:    vmovdqa %ymm0, 224(%r9)
4486; AVX2-FP-NEXT:    vmovdqa %ymm9, 96(%r9)
4487; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4488; AVX2-FP-NEXT:    vmovaps %ymm0, 128(%r9)
4489; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4490; AVX2-FP-NEXT:    vmovaps %ymm0, 288(%r9)
4491; AVX2-FP-NEXT:    vmovdqa %ymm10, 256(%r9)
4492; AVX2-FP-NEXT:    vmovdqa %ymm3, 160(%r9)
4493; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4494; AVX2-FP-NEXT:    vmovaps %ymm0, 192(%r9)
4495; AVX2-FP-NEXT:    vmovdqa %ymm2, (%r9)
4496; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4497; AVX2-FP-NEXT:    vmovaps %ymm0, 32(%r9)
4498; AVX2-FP-NEXT:    addq $200, %rsp
4499; AVX2-FP-NEXT:    vzeroupper
4500; AVX2-FP-NEXT:    retq
4501;
4502; AVX2-FCP-LABEL: store_i8_stride5_vf64:
4503; AVX2-FCP:       # %bb.0:
4504; AVX2-FCP-NEXT:    subq $168, %rsp
4505; AVX2-FCP-NEXT:    vmovdqa 32(%rdx), %ymm14
4506; AVX2-FCP-NEXT:    vmovdqa 32(%rcx), %ymm15
4507; AVX2-FCP-NEXT:    vmovdqa (%r8), %ymm11
4508; AVX2-FCP-NEXT:    vmovdqa (%rcx), %xmm1
4509; AVX2-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4510; AVX2-FCP-NEXT:    vmovdqa 32(%rcx), %xmm6
4511; AVX2-FCP-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4512; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12]
4513; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
4514; AVX2-FCP-NEXT:    vmovdqa (%rdx), %xmm3
4515; AVX2-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4516; AVX2-FCP-NEXT:    vmovdqa 32(%rdx), %xmm8
4517; AVX2-FCP-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4518; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128]
4519; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
4520; AVX2-FCP-NEXT:    vpor %xmm1, %xmm3, %xmm1
4521; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1]
4522; AVX2-FCP-NEXT:    vmovdqa (%rdi), %xmm4
4523; AVX2-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4524; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11]
4525; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm4, %xmm4
4526; AVX2-FCP-NEXT:    vmovdqa (%rsi), %xmm7
4527; AVX2-FCP-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4528; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128]
4529; AVX2-FCP-NEXT:    vpshufb %xmm5, %xmm7, %xmm7
4530; AVX2-FCP-NEXT:    vpor %xmm4, %xmm7, %xmm4
4531; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1]
4532; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255]
4533; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm1, %ymm4, %ymm1
4534; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %xmm4
4535; AVX2-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4536; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
4537; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm8, %xmm2
4538; AVX2-FCP-NEXT:    vpor %xmm0, %xmm2, %xmm0
4539; AVX2-FCP-NEXT:    vmovdqa 32(%rsi), %xmm6
4540; AVX2-FCP-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4541; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm4, %xmm2
4542; AVX2-FCP-NEXT:    vpshufb %xmm5, %xmm6, %xmm3
4543; AVX2-FCP-NEXT:    vpor %xmm2, %xmm3, %xmm2
4544; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,1,2,2,2,2,2,2]
4545; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
4546; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1]
4547; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm0, %ymm2, %ymm0
4548; AVX2-FCP-NEXT:    vpermd %ymm11, %ymm3, %ymm2
4549; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
4550; AVX2-FCP-NEXT:    vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
4551; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4552; AVX2-FCP-NEXT:    vmovdqa 32(%r8), %ymm13
4553; AVX2-FCP-NEXT:    vpermd %ymm13, %ymm3, %ymm1
4554; AVX2-FCP-NEXT:    vpblendvb %ymm4, %ymm0, %ymm1, %ymm0
4555; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4556; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,29,26,128,28,128,30,128,28,29,128,31,128,29]
4557; AVX2-FCP-NEXT:    vpshufb %ymm0, %ymm14, %ymm1
4558; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128]
4559; AVX2-FCP-NEXT:    # ymm10 = mem[0,1,0,1]
4560; AVX2-FCP-NEXT:    vpshufb %ymm10, %ymm15, %ymm2
4561; AVX2-FCP-NEXT:    vmovdqu %ymm15, (%rsp) # 32-byte Spill
4562; AVX2-FCP-NEXT:    vpor %ymm1, %ymm2, %ymm1
4563; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
4564; AVX2-FCP-NEXT:    vmovdqa 32(%rsi), %ymm3
4565; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,29,26,128,28,128,26,27,28,29,128,31,128,29,30,128]
4566; AVX2-FCP-NEXT:    vpshufb %ymm9, %ymm2, %ymm4
4567; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30]
4568; AVX2-FCP-NEXT:    # ymm8 = mem[0,1,0,1]
4569; AVX2-FCP-NEXT:    vpshufb %ymm8, %ymm3, %ymm5
4570; AVX2-FCP-NEXT:    vpor %ymm4, %ymm5, %ymm4
4571; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3]
4572; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3]
4573; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u]
4574; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm1, %ymm4, %ymm4
4575; AVX2-FCP-NEXT:    vmovdqa (%rdx), %ymm12
4576; AVX2-FCP-NEXT:    vpshufb %ymm0, %ymm12, %ymm0
4577; AVX2-FCP-NEXT:    vmovdqa (%rcx), %ymm5
4578; AVX2-FCP-NEXT:    vpshufb %ymm10, %ymm5, %ymm1
4579; AVX2-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm10
4580; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm1
4581; AVX2-FCP-NEXT:    vpshufb %ymm9, %ymm1, %ymm9
4582; AVX2-FCP-NEXT:    vmovdqa (%rsi), %ymm0
4583; AVX2-FCP-NEXT:    vpshufb %ymm8, %ymm0, %ymm8
4584; AVX2-FCP-NEXT:    vpor %ymm9, %ymm8, %ymm8
4585; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm10[2,2,3,3]
4586; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3]
4587; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm9, %ymm8, %ymm7
4588; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [6,6,6,6,7,7,7,7]
4589; AVX2-FCP-NEXT:    vpermd %ymm13, %ymm8, %ymm9
4590; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0]
4591; AVX2-FCP-NEXT:    vpblendvb %ymm10, %ymm4, %ymm9, %ymm4
4592; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4593; AVX2-FCP-NEXT:    vpermd %ymm11, %ymm8, %ymm4
4594; AVX2-FCP-NEXT:    vpblendvb %ymm10, %ymm7, %ymm4, %ymm4
4595; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4596; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,20,128,22,128,24,128,22,23,128,25,128,23]
4597; AVX2-FCP-NEXT:    vpshufb %ymm6, %ymm2, %ymm7
4598; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128]
4599; AVX2-FCP-NEXT:    # ymm8 = mem[0,1,0,1]
4600; AVX2-FCP-NEXT:    vpshufb %ymm8, %ymm3, %ymm9
4601; AVX2-FCP-NEXT:    vpor %ymm7, %ymm9, %ymm7
4602; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128]
4603; AVX2-FCP-NEXT:    # ymm9 = mem[0,1,0,1]
4604; AVX2-FCP-NEXT:    vpshufb %ymm9, %ymm15, %ymm10
4605; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25]
4606; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm14, %ymm15
4607; AVX2-FCP-NEXT:    vpor %ymm10, %ymm15, %ymm10
4608; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3]
4609; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,2,3,3]
4610; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm15 = [255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0]
4611; AVX2-FCP-NEXT:    vpblendvb %ymm15, %ymm7, %ymm10, %ymm7
4612; AVX2-FCP-NEXT:    vpshufb %ymm6, %ymm1, %ymm6
4613; AVX2-FCP-NEXT:    vpshufb %ymm8, %ymm0, %ymm8
4614; AVX2-FCP-NEXT:    vpor %ymm6, %ymm8, %ymm6
4615; AVX2-FCP-NEXT:    vpshufb %ymm9, %ymm5, %ymm8
4616; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm12, %ymm4
4617; AVX2-FCP-NEXT:    vpor %ymm4, %ymm8, %ymm4
4618; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3]
4619; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3]
4620; AVX2-FCP-NEXT:    vpblendvb %ymm15, %ymm6, %ymm4, %ymm4
4621; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [4,6,5,5,5,5,4,6]
4622; AVX2-FCP-NEXT:    vpermd %ymm13, %ymm6, %ymm8
4623; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255]
4624; AVX2-FCP-NEXT:    vpblendvb %ymm10, %ymm7, %ymm8, %ymm9
4625; AVX2-FCP-NEXT:    vpermd %ymm11, %ymm6, %ymm6
4626; AVX2-FCP-NEXT:    vpblendvb %ymm10, %ymm4, %ymm6, %ymm7
4627; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [3,3,3,0,4,4,4,4]
4628; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm4, %ymm2
4629; AVX2-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14]
4630; AVX2-FCP-NEXT:    vpshufb %ymm6, %ymm3, %ymm3
4631; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255]
4632; AVX2-FCP-NEXT:    vpblendvb %ymm8, %ymm2, %ymm3, %ymm2
4633; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm4, %ymm1
4634; AVX2-FCP-NEXT:    vpshufb %ymm6, %ymm0, %ymm0
4635; AVX2-FCP-NEXT:    vpblendvb %ymm8, %ymm1, %ymm0, %ymm0
4636; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128]
4637; AVX2-FCP-NEXT:    vmovdqu (%rsp), %ymm3 # 32-byte Reload
4638; AVX2-FCP-NEXT:    vpshufb %ymm1, %ymm3, %ymm3
4639; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128]
4640; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm14, %ymm6
4641; AVX2-FCP-NEXT:    vpor %ymm3, %ymm6, %ymm3
4642; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255]
4643; AVX2-FCP-NEXT:    vpblendvb %ymm6, %ymm2, %ymm3, %ymm2
4644; AVX2-FCP-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
4645; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm12, %ymm3
4646; AVX2-FCP-NEXT:    vpor %ymm1, %ymm3, %ymm1
4647; AVX2-FCP-NEXT:    vpblendvb %ymm6, %ymm0, %ymm1, %ymm1
4648; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [3,3,3,3,0,4,4,4]
4649; AVX2-FCP-NEXT:    vpermd %ymm13, %ymm3, %ymm0
4650; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
4651; AVX2-FCP-NEXT:    vpblendvb %ymm4, %ymm2, %ymm0, %ymm0
4652; AVX2-FCP-NEXT:    vpermd %ymm11, %ymm3, %ymm2
4653; AVX2-FCP-NEXT:    vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
4654; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4655; AVX2-FCP-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
4656; AVX2-FCP-NEXT:    # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
4657; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4658; AVX2-FCP-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
4659; AVX2-FCP-NEXT:    # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
4660; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
4661; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
4662; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1]
4663; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
4664; AVX2-FCP-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
4665; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1]
4666; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255]
4667; AVX2-FCP-NEXT:    vpblendvb %ymm6, %ymm2, %ymm3, %ymm2
4668; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4669; AVX2-FCP-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
4670; AVX2-FCP-NEXT:    # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
4671; AVX2-FCP-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
4672; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4673; AVX2-FCP-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
4674; AVX2-FCP-NEXT:    # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3],xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7]
4675; AVX2-FCP-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
4676; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1]
4677; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1]
4678; AVX2-FCP-NEXT:    vpblendvb %ymm6, %ymm3, %ymm4, %ymm3
4679; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,1,1]
4680; AVX2-FCP-NEXT:    vpermd %ymm11, %ymm4, %ymm5
4681; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
4682; AVX2-FCP-NEXT:    vpblendvb %ymm6, %ymm2, %ymm5, %ymm2
4683; AVX2-FCP-NEXT:    vpermd %ymm13, %ymm4, %ymm4
4684; AVX2-FCP-NEXT:    vpblendvb %ymm6, %ymm3, %ymm4, %ymm3
4685; AVX2-FCP-NEXT:    vmovdqa %ymm1, 64(%r9)
4686; AVX2-FCP-NEXT:    vmovdqa %ymm0, 224(%r9)
4687; AVX2-FCP-NEXT:    vmovdqa %ymm7, 96(%r9)
4688; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4689; AVX2-FCP-NEXT:    vmovaps %ymm0, 128(%r9)
4690; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4691; AVX2-FCP-NEXT:    vmovaps %ymm0, 288(%r9)
4692; AVX2-FCP-NEXT:    vmovdqa %ymm9, 256(%r9)
4693; AVX2-FCP-NEXT:    vmovdqa %ymm3, 160(%r9)
4694; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4695; AVX2-FCP-NEXT:    vmovaps %ymm0, 192(%r9)
4696; AVX2-FCP-NEXT:    vmovdqa %ymm2, (%r9)
4697; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4698; AVX2-FCP-NEXT:    vmovaps %ymm0, 32(%r9)
4699; AVX2-FCP-NEXT:    addq $168, %rsp
4700; AVX2-FCP-NEXT:    vzeroupper
4701; AVX2-FCP-NEXT:    retq
4702;
4703; AVX512-LABEL: store_i8_stride5_vf64:
4704; AVX512:       # %bb.0:
4705; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm11
4706; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128,128]
4707; AVX512-NEXT:    vpshufb %ymm1, %ymm11, %ymm0
4708; AVX512-NEXT:    vmovdqa64 %ymm1, %ymm18
4709; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm5
4710; AVX512-NEXT:    vmovdqa {{.*#+}} ymm2 = [12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19]
4711; AVX512-NEXT:    vpshufb %ymm2, %ymm5, %ymm1
4712; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm19
4713; AVX512-NEXT:    vporq %ymm0, %ymm1, %ymm20
4714; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm12
4715; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11]
4716; AVX512-NEXT:    vpshufb %xmm1, %xmm12, %xmm0
4717; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm28
4718; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm10
4719; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128]
4720; AVX512-NEXT:    vpshufb %xmm2, %xmm10, %xmm1
4721; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm29
4722; AVX512-NEXT:    vporq %xmm0, %xmm1, %xmm21
4723; AVX512-NEXT:    vmovdqa 32(%rcx), %ymm15
4724; AVX512-NEXT:    vmovdqa {{.*#+}} ymm8 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128]
4725; AVX512-NEXT:    vpshufb %ymm8, %ymm15, %ymm0
4726; AVX512-NEXT:    vmovdqa 32(%rdx), %ymm13
4727; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [u,u,12,13,128,u,u,u,14,128,u,u,14,15,128,u,u,u,16,128,u,u,16,17,128,u,u,u,18,128,u,u]
4728; AVX512-NEXT:    vpshufb %ymm3, %ymm13, %ymm1
4729; AVX512-NEXT:    vporq %ymm0, %ymm1, %ymm22
4730; AVX512-NEXT:    vmovdqa 32(%rcx), %xmm6
4731; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12]
4732; AVX512-NEXT:    vpshufb %xmm1, %xmm6, %xmm0
4733; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm30
4734; AVX512-NEXT:    vmovdqa 32(%rdx), %xmm7
4735; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128]
4736; AVX512-NEXT:    vpshufb %xmm2, %xmm7, %xmm1
4737; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm31
4738; AVX512-NEXT:    vporq %xmm0, %xmm1, %xmm23
4739; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128]
4740; AVX512-NEXT:    # ymm9 = mem[0,1,0,1]
4741; AVX512-NEXT:    vpshufb %ymm9, %ymm5, %ymm0
4742; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128]
4743; AVX512-NEXT:    # ymm4 = mem[0,1,0,1]
4744; AVX512-NEXT:    vpshufb %ymm4, %ymm5, %ymm1
4745; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm24
4746; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30,27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30]
4747; AVX512-NEXT:    # ymm5 = mem[0,1,0,1]
4748; AVX512-NEXT:    vpshufb %ymm5, %ymm11, %ymm1
4749; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0,19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0]
4750; AVX512-NEXT:    # ymm2 = mem[0,1,0,1]
4751; AVX512-NEXT:    vpshufb %ymm2, %ymm11, %ymm11
4752; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm11, %zmm26
4753; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128]
4754; AVX512-NEXT:    # ymm11 = mem[0,1,0,1]
4755; AVX512-NEXT:    vpshufb %ymm11, %ymm13, %ymm1
4756; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128]
4757; AVX512-NEXT:    # ymm0 = mem[0,1,0,1]
4758; AVX512-NEXT:    vpshufb %ymm0, %ymm15, %ymm14
4759; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm14, %zmm25
4760; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0,25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0]
4761; AVX512-NEXT:    # ymm1 = mem[0,1,0,1]
4762; AVX512-NEXT:    vpshufb %ymm1, %ymm15, %ymm14
4763; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25]
4764; AVX512-NEXT:    # ymm15 = mem[0,1,0,1]
4765; AVX512-NEXT:    vpshufb %ymm15, %ymm13, %ymm13
4766; AVX512-NEXT:    vinserti64x4 $1, %ymm14, %zmm13, %zmm27
4767; AVX512-NEXT:    vmovdqa (%rcx), %ymm13
4768; AVX512-NEXT:    vpshufb %ymm8, %ymm13, %ymm8
4769; AVX512-NEXT:    vmovdqa (%rdx), %ymm14
4770; AVX512-NEXT:    vpshufb %ymm3, %ymm14, %ymm3
4771; AVX512-NEXT:    vporq %ymm8, %ymm3, %ymm16
4772; AVX512-NEXT:    vpshufb %ymm0, %ymm13, %ymm0
4773; AVX512-NEXT:    vpshufb %ymm15, %ymm14, %ymm3
4774; AVX512-NEXT:    vporq %ymm0, %ymm3, %ymm17
4775; AVX512-NEXT:    vmovdqa (%rsi), %ymm3
4776; AVX512-NEXT:    vmovdqa64 %ymm18, %ymm0
4777; AVX512-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
4778; AVX512-NEXT:    vmovdqa (%rdi), %ymm8
4779; AVX512-NEXT:    vmovdqa64 %ymm19, %ymm15
4780; AVX512-NEXT:    vpshufb %ymm15, %ymm8, %ymm15
4781; AVX512-NEXT:    vporq %ymm0, %ymm15, %ymm18
4782; AVX512-NEXT:    vpshufb %ymm4, %ymm8, %ymm0
4783; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
4784; AVX512-NEXT:    vporq %ymm0, %ymm2, %ymm19
4785; AVX512-NEXT:    vpshufb %ymm11, %ymm14, %ymm0
4786; AVX512-NEXT:    vpshufb %ymm1, %ymm13, %ymm1
4787; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
4788; AVX512-NEXT:    vpshufb %ymm9, %ymm8, %ymm1
4789; AVX512-NEXT:    vpshufb %ymm5, %ymm3, %ymm2
4790; AVX512-NEXT:    vmovdqa (%rdi), %xmm5
4791; AVX512-NEXT:    vpor %ymm1, %ymm2, %ymm1
4792; AVX512-NEXT:    vmovdqa (%rsi), %xmm9
4793; AVX512-NEXT:    vmovdqa (%rcx), %xmm8
4794; AVX512-NEXT:    vmovdqa64 %xmm28, %xmm2
4795; AVX512-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
4796; AVX512-NEXT:    vmovdqa64 %xmm29, %xmm3
4797; AVX512-NEXT:    vpshufb %xmm3, %xmm9, %xmm3
4798; AVX512-NEXT:    vpor %xmm2, %xmm3, %xmm4
4799; AVX512-NEXT:    vmovdqa (%rdx), %xmm3
4800; AVX512-NEXT:    vmovdqa 32(%r8), %ymm11
4801; AVX512-NEXT:    vmovdqa64 %xmm30, %xmm2
4802; AVX512-NEXT:    vpshufb %xmm2, %xmm8, %xmm2
4803; AVX512-NEXT:    vmovdqa64 %xmm31, %xmm13
4804; AVX512-NEXT:    vpshufb %xmm13, %xmm3, %xmm13
4805; AVX512-NEXT:    vpor %xmm2, %xmm13, %xmm13
4806; AVX512-NEXT:    vmovdqa {{.*#+}} ymm14 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128]
4807; AVX512-NEXT:    vpshufb %ymm14, %ymm11, %ymm2
4808; AVX512-NEXT:    vpshufd {{.*#+}} xmm15 = mem[1,1,2,2]
4809; AVX512-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,1,1,1]
4810; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm28 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
4811; AVX512-NEXT:    vpandnq %ymm15, %ymm28, %ymm15
4812; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm15, %zmm2
4813; AVX512-NEXT:    vmovdqa (%r8), %ymm15
4814; AVX512-NEXT:    vpshufb %ymm14, %ymm15, %ymm14
4815; AVX512-NEXT:    vpshufd {{.*#+}} ymm15 = ymm15[0,2,1,1,4,6,5,5]
4816; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm29 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0]
4817; AVX512-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,3,3,2]
4818; AVX512-NEXT:    vpandnq %ymm15, %ymm29, %ymm15
4819; AVX512-NEXT:    vinserti64x4 $1, %ymm15, %zmm14, %zmm14
4820; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
4821; AVX512-NEXT:    vmovdqa {{.*#+}} xmm7 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
4822; AVX512-NEXT:    vpshufb %xmm7, %xmm6, %xmm6
4823; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1]
4824; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3],xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7]
4825; AVX512-NEXT:    vmovdqa {{.*#+}} xmm12 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
4826; AVX512-NEXT:    vpshufb %xmm12, %xmm10, %xmm10
4827; AVX512-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1]
4828; AVX512-NEXT:    vmovdqa64 (%r8), %zmm15
4829; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm30 = [4,0,5,5,5,5,0,6,6,6,6,0,7,7,7,7]
4830; AVX512-NEXT:    vpermd %zmm11, %zmm30, %zmm30
4831; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm31 = [6,6,6,0,7,7,7,7,0,16,16,16,16,0,17,17]
4832; AVX512-NEXT:    vpermi2d %zmm11, %zmm15, %zmm31
4833; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7]
4834; AVX512-NEXT:    vpshufb %xmm12, %xmm5, %xmm5
4835; AVX512-NEXT:    vinserti32x4 $2, %xmm4, %zmm5, %zmm4
4836; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3],xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7]
4837; AVX512-NEXT:    vpshufb %xmm7, %xmm3, %xmm3
4838; AVX512-NEXT:    vinserti32x4 $2, %xmm13, %zmm3, %zmm3
4839; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm21[0,0,1,1]
4840; AVX512-NEXT:    vinserti64x4 $1, %ymm20, %zmm5, %zmm5
4841; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm23[0,0,1,1]
4842; AVX512-NEXT:    vinserti64x4 $1, %ymm22, %zmm7, %zmm7
4843; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0]
4844; AVX512-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm5 ^ (zmm8 & (zmm7 ^ zmm5))
4845; AVX512-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm7 & zmm28)
4846; AVX512-NEXT:    vporq %zmm24, %zmm26, %zmm5
4847; AVX512-NEXT:    vpermq {{.*#+}} zmm5 = zmm5[2,2,3,3,6,6,7,7]
4848; AVX512-NEXT:    vporq %zmm25, %zmm27, %zmm7
4849; AVX512-NEXT:    vpermq {{.*#+}} zmm7 = zmm7[2,2,3,3,6,6,7,7]
4850; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255]
4851; AVX512-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm5 ^ (zmm9 & (zmm7 ^ zmm5))
4852; AVX512-NEXT:    vpternlogd {{.*#+}} zmm30 = zmm30 ^ (zmm29 & (zmm30 ^ zmm7))
4853; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm17[2,2,3,3]
4854; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm16, %zmm5
4855; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm19[2,2,3,3]
4856; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm18, %zmm7
4857; AVX512-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm5 ^ (zmm8 & (zmm7 ^ zmm5))
4858; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
4859; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm0
4860; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3]
4861; AVX512-NEXT:    vinserti64x4 $1, %ymm10, %zmm1, %zmm1
4862; AVX512-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm9 & (zmm1 ^ zmm0))
4863; AVX512-NEXT:    vpternlogq {{.*#+}} zmm14 = zmm14 | (zmm7 & mem)
4864; AVX512-NEXT:    vpternlogd {{.*#+}} zmm31 = zmm31 ^ (mem & (zmm31 ^ zmm1))
4865; AVX512-NEXT:    vpermq {{.*#+}} zmm0 = zmm4[0,0,1,1,4,4,5,5]
4866; AVX512-NEXT:    vpermq {{.*#+}} zmm1 = zmm3[0,0,1,1,4,4,5,5]
4867; AVX512-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm0))
4868; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0]
4869; AVX512-NEXT:    vpermd %zmm15, %zmm0, %zmm0
4870; AVX512-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
4871; AVX512-NEXT:    vmovdqa64 %zmm14, 64(%r9)
4872; AVX512-NEXT:    vmovdqa64 %zmm0, (%r9)
4873; AVX512-NEXT:    vmovdqa64 %zmm31, 128(%r9)
4874; AVX512-NEXT:    vmovdqa64 %zmm30, 256(%r9)
4875; AVX512-NEXT:    vmovdqa64 %zmm2, 192(%r9)
4876; AVX512-NEXT:    vzeroupper
4877; AVX512-NEXT:    retq
4878;
4879; AVX512-FCP-LABEL: store_i8_stride5_vf64:
4880; AVX512-FCP:       # %bb.0:
4881; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm0
4882; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128,128]
4883; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm0, %ymm1
4884; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
4885; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19]
4886; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm2, %ymm3
4887; AVX512-FCP-NEXT:    vporq %ymm1, %ymm3, %ymm17
4888; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm3
4889; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm3[8],zero,xmm3[u,7],zero,xmm3[9],zero,xmm3[u],zero,xmm3[u,10],zero,xmm3[12],zero,xmm3[u,11]
4890; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm5
4891; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128]
4892; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm5, %xmm4
4893; AVX512-FCP-NEXT:    vmovdqa64 %xmm6, %xmm28
4894; AVX512-FCP-NEXT:    vporq %xmm1, %xmm4, %xmm18
4895; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %ymm1
4896; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128]
4897; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm1, %ymm4
4898; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm11
4899; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [u,u,12,13,128,u,u,u,14,128,u,u,14,15,128,u,u,u,16,128,u,u,16,17,128,u,u,u,18,128,u,u]
4900; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm11, %ymm6
4901; AVX512-FCP-NEXT:    vporq %ymm4, %ymm6, %ymm19
4902; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %xmm6
4903; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,xmm6[6],zero,xmm6[8,u],zero,xmm6[7],zero,xmm6[9],zero,xmm6[11,u],zero,xmm6[10],zero,xmm6[12]
4904; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %xmm7
4905; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128]
4906; AVX512-FCP-NEXT:    vpshufb %xmm13, %xmm7, %xmm8
4907; AVX512-FCP-NEXT:    vmovdqa64 %xmm13, %xmm29
4908; AVX512-FCP-NEXT:    vporq %xmm4, %xmm8, %xmm20
4909; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128]
4910; AVX512-FCP-NEXT:    # ymm8 = mem[0,1,0,1]
4911; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm2, %ymm4
4912; AVX512-FCP-NEXT:    vmovdqa64 %ymm8, %ymm31
4913; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128]
4914; AVX512-FCP-NEXT:    # ymm13 = mem[0,1,0,1]
4915; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm2, %ymm2
4916; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm21
4917; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30,27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30]
4918; AVX512-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
4919; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm0, %ymm8
4920; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm30
4921; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0,19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0]
4922; AVX512-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
4923; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
4924; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm22
4925; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128]
4926; AVX512-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
4927; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm11, %ymm8
4928; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128]
4929; AVX512-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
4930; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm1, %ymm15
4931; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm15, %zmm24
4932; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0,25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0]
4933; AVX512-FCP-NEXT:    # ymm8 = mem[0,1,0,1]
4934; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm1, %ymm1
4935; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25]
4936; AVX512-FCP-NEXT:    # ymm15 = mem[0,1,0,1]
4937; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm11, %ymm11
4938; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm11, %zmm26
4939; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm1
4940; AVX512-FCP-NEXT:    vpshufb %ymm10, %ymm1, %ymm10
4941; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm11
4942; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm11, %ymm12
4943; AVX512-FCP-NEXT:    vporq %ymm10, %ymm12, %ymm23
4944; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
4945; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm11, %ymm10
4946; AVX512-FCP-NEXT:    vporq %ymm0, %ymm10, %ymm25
4947; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm10
4948; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm10, %ymm0
4949; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm12
4950; AVX512-FCP-NEXT:    vpshufb %ymm9, %ymm12, %ymm9
4951; AVX512-FCP-NEXT:    vporq %ymm0, %ymm9, %ymm27
4952; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm12, %ymm0
4953; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm10, %ymm2
4954; AVX512-FCP-NEXT:    vporq %ymm0, %ymm2, %ymm16
4955; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm9
4956; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm9[8],zero,xmm9[u,7],zero,xmm9[9],zero,xmm9[u],zero,xmm9[u,10],zero,xmm9[12],zero,xmm9[u,11]
4957; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm14
4958; AVX512-FCP-NEXT:    vmovdqa64 %xmm28, %xmm2
4959; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm14, %xmm2
4960; AVX512-FCP-NEXT:    vporq %xmm0, %xmm2, %xmm28
4961; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm13
4962; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = zero,xmm13[6],zero,xmm13[8,u],zero,xmm13[7],zero,xmm13[9],zero,xmm13[11,u],zero,xmm13[10],zero,xmm13[12]
4963; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm15
4964; AVX512-FCP-NEXT:    vmovdqa64 %xmm29, %xmm2
4965; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm15, %xmm2
4966; AVX512-FCP-NEXT:    vporq %xmm0, %xmm2, %xmm29
4967; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm11, %ymm0
4968; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm1, %ymm1
4969; AVX512-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
4970; AVX512-FCP-NEXT:    vmovdqa64 %ymm31, %ymm1
4971; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm12, %ymm1
4972; AVX512-FCP-NEXT:    vmovdqa64 %ymm30, %ymm2
4973; AVX512-FCP-NEXT:    vpshufb %ymm2, %ymm10, %ymm2
4974; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %ymm4
4975; AVX512-FCP-NEXT:    vpor %ymm1, %ymm2, %ymm1
4976; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128]
4977; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm4, %ymm2
4978; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [1,1,2,2,2,2,2,2]
4979; AVX512-FCP-NEXT:    vpermd %ymm4, %ymm10, %ymm10
4980; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
4981; AVX512-FCP-NEXT:    vpandn %ymm10, %ymm11, %ymm10
4982; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm10, %zmm2
4983; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm10
4984; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm12 = [4,6,5,5,5,5,4,6,30,30,30,30,31,31,31,31]
4985; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm30 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0]
4986; AVX512-FCP-NEXT:    vpermd %ymm10, %ymm12, %ymm31
4987; AVX512-FCP-NEXT:    vpandnq %ymm31, %ymm30, %ymm31
4988; AVX512-FCP-NEXT:    vpshufb %ymm8, %ymm10, %ymm8
4989; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm31, %zmm8, %zmm8
4990; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3],xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7]
4991; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm14 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
4992; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm9, %xmm9
4993; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm28, %zmm9, %zmm9
4994; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3],xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7]
4995; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
4996; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm13, %xmm13
4997; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm29, %zmm13, %zmm13
4998; AVX512-FCP-NEXT:    vmovdqa64 (%r8), %zmm28
4999; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm28, %zmm10
5000; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
5001; AVX512-FCP-NEXT:    vpshufb %xmm15, %xmm6, %xmm6
5002; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm7 = [2,2,3,3,8,8,9,9]
5003; AVX512-FCP-NEXT:    vpermt2q %zmm6, %zmm7, %zmm0
5004; AVX512-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
5005; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm3, %xmm3
5006; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm7, %zmm1
5007; AVX512-FCP-NEXT:    vporq %zmm21, %zmm22, %zmm3
5008; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm3 = zmm3[2,2,3,3,6,6,7,7]
5009; AVX512-FCP-NEXT:    vporq %zmm24, %zmm26, %zmm5
5010; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm5 = zmm5[2,2,3,3,6,6,7,7]
5011; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255]
5012; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm3 ^ (zmm6 & (zmm5 ^ zmm3))
5013; AVX512-FCP-NEXT:    vpermt2d %zmm28, %zmm12, %zmm4
5014; AVX512-FCP-NEXT:    vpternlogd {{.*#+}} zmm4 = zmm4 ^ (zmm30 & (zmm4 ^ zmm5))
5015; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, 256(%r9)
5016; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm18[0,0,1,1]
5017; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm17, %zmm3, %zmm3
5018; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm20[0,0,1,1]
5019; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm19, %zmm4, %zmm4
5020; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0]
5021; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm3 ^ (zmm5 & (zmm4 ^ zmm3))
5022; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm4 & zmm11)
5023; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm25[2,2,3,3]
5024; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm23, %zmm3
5025; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm16[2,2,3,3]
5026; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm27, %zmm4
5027; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm3 ^ (zmm5 & (zmm4 ^ zmm3))
5028; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm6 & (zmm1 ^ zmm0))
5029; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm4 & mem)
5030; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm9[0,0,1,1,4,4,5,5]
5031; AVX512-FCP-NEXT:    vpermq {{.*#+}} zmm3 = zmm13[0,0,1,1,4,4,5,5]
5032; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm0))
5033; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0]
5034; AVX512-FCP-NEXT:    vpermd %zmm10, %zmm0, %zmm0
5035; AVX512-FCP-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm3))
5036; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [6,6,6,0,7,7,7,7,0,8,8,8,8,0,9,9]
5037; AVX512-FCP-NEXT:    vpermd %zmm28, %zmm3, %zmm3
5038; AVX512-FCP-NEXT:    vpternlogd {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm1))
5039; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, 128(%r9)
5040; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, 64(%r9)
5041; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, (%r9)
5042; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, 192(%r9)
5043; AVX512-FCP-NEXT:    vzeroupper
5044; AVX512-FCP-NEXT:    retq
5045;
5046; AVX512DQ-LABEL: store_i8_stride5_vf64:
5047; AVX512DQ:       # %bb.0:
5048; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %ymm11
5049; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128,128]
5050; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm11, %ymm0
5051; AVX512DQ-NEXT:    vmovdqa64 %ymm1, %ymm18
5052; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm5
5053; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19]
5054; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm5, %ymm1
5055; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm19
5056; AVX512DQ-NEXT:    vporq %ymm0, %ymm1, %ymm20
5057; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm12
5058; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11]
5059; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm12, %xmm0
5060; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm28
5061; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %xmm10
5062; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm2 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128]
5063; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm10, %xmm1
5064; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm29
5065; AVX512DQ-NEXT:    vporq %xmm0, %xmm1, %xmm21
5066; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %ymm15
5067; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm8 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128]
5068; AVX512DQ-NEXT:    vpshufb %ymm8, %ymm15, %ymm0
5069; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %ymm13
5070; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [u,u,12,13,128,u,u,u,14,128,u,u,14,15,128,u,u,u,16,128,u,u,16,17,128,u,u,u,18,128,u,u]
5071; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm13, %ymm1
5072; AVX512DQ-NEXT:    vporq %ymm0, %ymm1, %ymm22
5073; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %xmm6
5074; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12]
5075; AVX512DQ-NEXT:    vpshufb %xmm1, %xmm6, %xmm0
5076; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm30
5077; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %xmm7
5078; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128]
5079; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm7, %xmm1
5080; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm31
5081; AVX512DQ-NEXT:    vporq %xmm0, %xmm1, %xmm23
5082; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128]
5083; AVX512DQ-NEXT:    # ymm9 = mem[0,1,0,1]
5084; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm5, %ymm0
5085; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128]
5086; AVX512DQ-NEXT:    # ymm4 = mem[0,1,0,1]
5087; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm5, %ymm1
5088; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm24
5089; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30,27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30]
5090; AVX512DQ-NEXT:    # ymm5 = mem[0,1,0,1]
5091; AVX512DQ-NEXT:    vpshufb %ymm5, %ymm11, %ymm1
5092; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0,19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0]
5093; AVX512DQ-NEXT:    # ymm2 = mem[0,1,0,1]
5094; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm11, %ymm11
5095; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm11, %zmm26
5096; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm11 = [128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128]
5097; AVX512DQ-NEXT:    # ymm11 = mem[0,1,0,1]
5098; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm13, %ymm1
5099; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128]
5100; AVX512DQ-NEXT:    # ymm0 = mem[0,1,0,1]
5101; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm15, %ymm14
5102; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm14, %zmm25
5103; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0,25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0]
5104; AVX512DQ-NEXT:    # ymm1 = mem[0,1,0,1]
5105; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm15, %ymm14
5106; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25]
5107; AVX512DQ-NEXT:    # ymm15 = mem[0,1,0,1]
5108; AVX512DQ-NEXT:    vpshufb %ymm15, %ymm13, %ymm13
5109; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm14, %zmm13, %zmm27
5110; AVX512DQ-NEXT:    vmovdqa (%rcx), %ymm13
5111; AVX512DQ-NEXT:    vpshufb %ymm8, %ymm13, %ymm8
5112; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm14
5113; AVX512DQ-NEXT:    vpshufb %ymm3, %ymm14, %ymm3
5114; AVX512DQ-NEXT:    vporq %ymm8, %ymm3, %ymm16
5115; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm13, %ymm0
5116; AVX512DQ-NEXT:    vpshufb %ymm15, %ymm14, %ymm3
5117; AVX512DQ-NEXT:    vporq %ymm0, %ymm3, %ymm17
5118; AVX512DQ-NEXT:    vmovdqa (%rsi), %ymm3
5119; AVX512DQ-NEXT:    vmovdqa64 %ymm18, %ymm0
5120; AVX512DQ-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
5121; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm8
5122; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm15
5123; AVX512DQ-NEXT:    vpshufb %ymm15, %ymm8, %ymm15
5124; AVX512DQ-NEXT:    vporq %ymm0, %ymm15, %ymm18
5125; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm8, %ymm0
5126; AVX512DQ-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
5127; AVX512DQ-NEXT:    vporq %ymm0, %ymm2, %ymm19
5128; AVX512DQ-NEXT:    vpshufb %ymm11, %ymm14, %ymm0
5129; AVX512DQ-NEXT:    vpshufb %ymm1, %ymm13, %ymm1
5130; AVX512DQ-NEXT:    vpor %ymm0, %ymm1, %ymm0
5131; AVX512DQ-NEXT:    vpshufb %ymm9, %ymm8, %ymm1
5132; AVX512DQ-NEXT:    vpshufb %ymm5, %ymm3, %ymm2
5133; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm5
5134; AVX512DQ-NEXT:    vpor %ymm1, %ymm2, %ymm1
5135; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm9
5136; AVX512DQ-NEXT:    vmovdqa (%rcx), %xmm8
5137; AVX512DQ-NEXT:    vmovdqa64 %xmm28, %xmm2
5138; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
5139; AVX512DQ-NEXT:    vmovdqa64 %xmm29, %xmm3
5140; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm9, %xmm3
5141; AVX512DQ-NEXT:    vpor %xmm2, %xmm3, %xmm4
5142; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm3
5143; AVX512DQ-NEXT:    vmovdqa 32(%r8), %ymm11
5144; AVX512DQ-NEXT:    vmovdqa64 %xmm30, %xmm2
5145; AVX512DQ-NEXT:    vpshufb %xmm2, %xmm8, %xmm2
5146; AVX512DQ-NEXT:    vmovdqa64 %xmm31, %xmm13
5147; AVX512DQ-NEXT:    vpshufb %xmm13, %xmm3, %xmm13
5148; AVX512DQ-NEXT:    vpor %xmm2, %xmm13, %xmm13
5149; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm14 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128]
5150; AVX512DQ-NEXT:    vpshufb %ymm14, %ymm11, %ymm2
5151; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm15 = mem[1,1,2,2]
5152; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,1,1,1]
5153; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm28 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
5154; AVX512DQ-NEXT:    vpandnq %ymm15, %ymm28, %ymm15
5155; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm15, %zmm2
5156; AVX512DQ-NEXT:    vmovdqa (%r8), %ymm15
5157; AVX512DQ-NEXT:    vpshufb %ymm14, %ymm15, %ymm14
5158; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm15 = ymm15[0,2,1,1,4,6,5,5]
5159; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm29 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0]
5160; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,3,3,2]
5161; AVX512DQ-NEXT:    vpandnq %ymm15, %ymm29, %ymm15
5162; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm15, %zmm14, %zmm14
5163; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
5164; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm7 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
5165; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm6, %xmm6
5166; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1]
5167; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3],xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7]
5168; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm12 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
5169; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm10, %xmm10
5170; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1]
5171; AVX512DQ-NEXT:    vmovdqa64 (%r8), %zmm15
5172; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm30 = [4,0,5,5,5,5,0,6,6,6,6,0,7,7,7,7]
5173; AVX512DQ-NEXT:    vpermd %zmm11, %zmm30, %zmm30
5174; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm31 = [6,6,6,0,7,7,7,7,0,16,16,16,16,0,17,17]
5175; AVX512DQ-NEXT:    vpermi2d %zmm11, %zmm15, %zmm31
5176; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7]
5177; AVX512DQ-NEXT:    vpshufb %xmm12, %xmm5, %xmm5
5178; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm4, %zmm5, %zmm4
5179; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3],xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7]
5180; AVX512DQ-NEXT:    vpshufb %xmm7, %xmm3, %xmm3
5181; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm13, %zmm3, %zmm3
5182; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm5 = ymm21[0,0,1,1]
5183; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm20, %zmm5, %zmm5
5184; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm23[0,0,1,1]
5185; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm22, %zmm7, %zmm7
5186; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0]
5187; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm5 ^ (zmm8 & (zmm7 ^ zmm5))
5188; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm7 & zmm28)
5189; AVX512DQ-NEXT:    vporq %zmm24, %zmm26, %zmm5
5190; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm5 = zmm5[2,2,3,3,6,6,7,7]
5191; AVX512DQ-NEXT:    vporq %zmm25, %zmm27, %zmm7
5192; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm7 = zmm7[2,2,3,3,6,6,7,7]
5193; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255]
5194; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm5 ^ (zmm9 & (zmm7 ^ zmm5))
5195; AVX512DQ-NEXT:    vpternlogd {{.*#+}} zmm30 = zmm30 ^ (zmm29 & (zmm30 ^ zmm7))
5196; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm5 = ymm17[2,2,3,3]
5197; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm5, %zmm16, %zmm5
5198; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm19[2,2,3,3]
5199; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm7, %zmm18, %zmm7
5200; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm7 = zmm5 ^ (zmm8 & (zmm7 ^ zmm5))
5201; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
5202; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm6, %zmm0, %zmm0
5203; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3]
5204; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm10, %zmm1, %zmm1
5205; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm9 & (zmm1 ^ zmm0))
5206; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm14 = zmm14 | (zmm7 & mem)
5207; AVX512DQ-NEXT:    vpternlogd {{.*#+}} zmm31 = zmm31 ^ (mem & (zmm31 ^ zmm1))
5208; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm0 = zmm4[0,0,1,1,4,4,5,5]
5209; AVX512DQ-NEXT:    vpermq {{.*#+}} zmm1 = zmm3[0,0,1,1,4,4,5,5]
5210; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm0))
5211; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0]
5212; AVX512DQ-NEXT:    vpermd %zmm15, %zmm0, %zmm0
5213; AVX512DQ-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
5214; AVX512DQ-NEXT:    vmovdqa64 %zmm14, 64(%r9)
5215; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%r9)
5216; AVX512DQ-NEXT:    vmovdqa64 %zmm31, 128(%r9)
5217; AVX512DQ-NEXT:    vmovdqa64 %zmm30, 256(%r9)
5218; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 192(%r9)
5219; AVX512DQ-NEXT:    vzeroupper
5220; AVX512DQ-NEXT:    retq
5221;
5222; AVX512DQ-FCP-LABEL: store_i8_stride5_vf64:
5223; AVX512DQ-FCP:       # %bb.0:
5224; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %ymm0
5225; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128,128]
5226; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm0, %ymm1
5227; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
5228; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19]
5229; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm2, %ymm3
5230; AVX512DQ-FCP-NEXT:    vporq %ymm1, %ymm3, %ymm17
5231; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm3
5232; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm3[8],zero,xmm3[u,7],zero,xmm3[9],zero,xmm3[u],zero,xmm3[u,10],zero,xmm3[12],zero,xmm3[u,11]
5233; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm5
5234; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128]
5235; AVX512DQ-FCP-NEXT:    vpshufb %xmm6, %xmm5, %xmm4
5236; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm6, %xmm28
5237; AVX512DQ-FCP-NEXT:    vporq %xmm1, %xmm4, %xmm18
5238; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %ymm1
5239; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128]
5240; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm1, %ymm4
5241; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %ymm11
5242; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [u,u,12,13,128,u,u,u,14,128,u,u,14,15,128,u,u,u,16,128,u,u,16,17,128,u,u,u,18,128,u,u]
5243; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm11, %ymm6
5244; AVX512DQ-FCP-NEXT:    vporq %ymm4, %ymm6, %ymm19
5245; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %xmm6
5246; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = zero,xmm6[6],zero,xmm6[8,u],zero,xmm6[7],zero,xmm6[9],zero,xmm6[11,u],zero,xmm6[10],zero,xmm6[12]
5247; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %xmm7
5248; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128]
5249; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm7, %xmm8
5250; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm13, %xmm29
5251; AVX512DQ-FCP-NEXT:    vporq %xmm4, %xmm8, %xmm20
5252; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128]
5253; AVX512DQ-FCP-NEXT:    # ymm8 = mem[0,1,0,1]
5254; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm2, %ymm4
5255; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm8, %ymm31
5256; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128]
5257; AVX512DQ-FCP-NEXT:    # ymm13 = mem[0,1,0,1]
5258; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm2, %ymm2
5259; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm21
5260; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30,27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30]
5261; AVX512DQ-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
5262; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm0, %ymm8
5263; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm30
5264; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0,19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0]
5265; AVX512DQ-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
5266; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
5267; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm22
5268; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128]
5269; AVX512DQ-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
5270; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm11, %ymm8
5271; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128]
5272; AVX512DQ-FCP-NEXT:    # ymm0 = mem[0,1,0,1]
5273; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm1, %ymm15
5274; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm15, %zmm24
5275; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0,25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0]
5276; AVX512DQ-FCP-NEXT:    # ymm8 = mem[0,1,0,1]
5277; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm1, %ymm1
5278; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25]
5279; AVX512DQ-FCP-NEXT:    # ymm15 = mem[0,1,0,1]
5280; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm11, %ymm11
5281; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm11, %zmm26
5282; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm1
5283; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm1, %ymm10
5284; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm11
5285; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm11, %ymm12
5286; AVX512DQ-FCP-NEXT:    vporq %ymm10, %ymm12, %ymm23
5287; AVX512DQ-FCP-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
5288; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm11, %ymm10
5289; AVX512DQ-FCP-NEXT:    vporq %ymm0, %ymm10, %ymm25
5290; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm10
5291; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm10, %ymm0
5292; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm12
5293; AVX512DQ-FCP-NEXT:    vpshufb %ymm9, %ymm12, %ymm9
5294; AVX512DQ-FCP-NEXT:    vporq %ymm0, %ymm9, %ymm27
5295; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm12, %ymm0
5296; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm10, %ymm2
5297; AVX512DQ-FCP-NEXT:    vporq %ymm0, %ymm2, %ymm16
5298; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm9
5299; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm9[8],zero,xmm9[u,7],zero,xmm9[9],zero,xmm9[u],zero,xmm9[u,10],zero,xmm9[12],zero,xmm9[u,11]
5300; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm14
5301; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm28, %xmm2
5302; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm14, %xmm2
5303; AVX512DQ-FCP-NEXT:    vporq %xmm0, %xmm2, %xmm28
5304; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm13
5305; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = zero,xmm13[6],zero,xmm13[8,u],zero,xmm13[7],zero,xmm13[9],zero,xmm13[11,u],zero,xmm13[10],zero,xmm13[12]
5306; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm15
5307; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm29, %xmm2
5308; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm15, %xmm2
5309; AVX512DQ-FCP-NEXT:    vporq %xmm0, %xmm2, %xmm29
5310; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm11, %ymm0
5311; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm1, %ymm1
5312; AVX512DQ-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
5313; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm31, %ymm1
5314; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm12, %ymm1
5315; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm2
5316; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm10, %ymm2
5317; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %ymm4
5318; AVX512DQ-FCP-NEXT:    vpor %ymm1, %ymm2, %ymm1
5319; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128]
5320; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm4, %ymm2
5321; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [1,1,2,2,2,2,2,2]
5322; AVX512DQ-FCP-NEXT:    vpermd %ymm4, %ymm10, %ymm10
5323; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
5324; AVX512DQ-FCP-NEXT:    vpandn %ymm10, %ymm11, %ymm10
5325; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm10, %zmm2
5326; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm10
5327; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm12 = [4,6,5,5,5,5,4,6,30,30,30,30,31,31,31,31]
5328; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm30 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0]
5329; AVX512DQ-FCP-NEXT:    vpermd %ymm10, %ymm12, %ymm31
5330; AVX512DQ-FCP-NEXT:    vpandnq %ymm31, %ymm30, %ymm31
5331; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm10, %ymm8
5332; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm31, %zmm8, %zmm8
5333; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3],xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7]
5334; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm14 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
5335; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm9, %xmm9
5336; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm28, %zmm9, %zmm9
5337; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3],xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7]
5338; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm15 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
5339; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm13, %xmm13
5340; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm29, %zmm13, %zmm13
5341; AVX512DQ-FCP-NEXT:    vmovdqa64 (%r8), %zmm28
5342; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm28, %zmm10
5343; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
5344; AVX512DQ-FCP-NEXT:    vpshufb %xmm15, %xmm6, %xmm6
5345; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm7 = [2,2,3,3,8,8,9,9]
5346; AVX512DQ-FCP-NEXT:    vpermt2q %zmm6, %zmm7, %zmm0
5347; AVX512DQ-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
5348; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm3, %xmm3
5349; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm7, %zmm1
5350; AVX512DQ-FCP-NEXT:    vporq %zmm21, %zmm22, %zmm3
5351; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm3 = zmm3[2,2,3,3,6,6,7,7]
5352; AVX512DQ-FCP-NEXT:    vporq %zmm24, %zmm26, %zmm5
5353; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm5 = zmm5[2,2,3,3,6,6,7,7]
5354; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255]
5355; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm3 ^ (zmm6 & (zmm5 ^ zmm3))
5356; AVX512DQ-FCP-NEXT:    vpermt2d %zmm28, %zmm12, %zmm4
5357; AVX512DQ-FCP-NEXT:    vpternlogd {{.*#+}} zmm4 = zmm4 ^ (zmm30 & (zmm4 ^ zmm5))
5358; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, 256(%r9)
5359; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm18[0,0,1,1]
5360; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm17, %zmm3, %zmm3
5361; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm20[0,0,1,1]
5362; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm19, %zmm4, %zmm4
5363; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0]
5364; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm3 ^ (zmm5 & (zmm4 ^ zmm3))
5365; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm4 & zmm11)
5366; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm25[2,2,3,3]
5367; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm23, %zmm3
5368; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm16[2,2,3,3]
5369; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm27, %zmm4
5370; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm3 ^ (zmm5 & (zmm4 ^ zmm3))
5371; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm6 & (zmm1 ^ zmm0))
5372; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm4 & mem)
5373; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm9[0,0,1,1,4,4,5,5]
5374; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} zmm3 = zmm13[0,0,1,1,4,4,5,5]
5375; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm0))
5376; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0]
5377; AVX512DQ-FCP-NEXT:    vpermd %zmm10, %zmm0, %zmm0
5378; AVX512DQ-FCP-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm3))
5379; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [6,6,6,0,7,7,7,7,0,8,8,8,8,0,9,9]
5380; AVX512DQ-FCP-NEXT:    vpermd %zmm28, %zmm3, %zmm3
5381; AVX512DQ-FCP-NEXT:    vpternlogd {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm1))
5382; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, 128(%r9)
5383; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, 64(%r9)
5384; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, (%r9)
5385; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, 192(%r9)
5386; AVX512DQ-FCP-NEXT:    vzeroupper
5387; AVX512DQ-FCP-NEXT:    retq
5388;
5389; AVX512BW-LABEL: store_i8_stride5_vf64:
5390; AVX512BW:       # %bb.0:
5391; AVX512BW-NEXT:    vmovdqa64 (%r8), %zmm2
5392; AVX512BW-NEXT:    vmovdqa (%rcx), %ymm0
5393; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12]
5394; AVX512BW-NEXT:    vpshufb %ymm8, %ymm0, %ymm3
5395; AVX512BW-NEXT:    vmovdqa (%rdx), %ymm1
5396; AVX512BW-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm1[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14]
5397; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,3,3,6,6,7,7]
5398; AVX512BW-NEXT:    movl $693250386, %eax # imm = 0x29522952
5399; AVX512BW-NEXT:    kmovd %eax, %k1
5400; AVX512BW-NEXT:    vmovdqu8 %ymm4, %ymm3 {%k1}
5401; AVX512BW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3]
5402; AVX512BW-NEXT:    vmovdqa 32(%rdx), %xmm6
5403; AVX512BW-NEXT:    vmovdqa 32(%rcx), %xmm12
5404; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
5405; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm7 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
5406; AVX512BW-NEXT:    vpshufb %xmm7, %xmm4, %xmm4
5407; AVX512BW-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1]
5408; AVX512BW-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm10
5409; AVX512BW-NEXT:    vmovdqa (%rsi), %ymm4
5410; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm15 = [11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14]
5411; AVX512BW-NEXT:    vpshufb %ymm15, %ymm4, %ymm3
5412; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm5
5413; AVX512BW-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm5[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15]
5414; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,2,3,3,6,6,7,7]
5415; AVX512BW-NEXT:    movl $1251232404, %eax # imm = 0x4A944A94
5416; AVX512BW-NEXT:    kmovd %eax, %k5
5417; AVX512BW-NEXT:    vmovdqu8 %ymm9, %ymm3 {%k5}
5418; AVX512BW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3]
5419; AVX512BW-NEXT:    vmovdqa 32(%rsi), %xmm13
5420; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm14
5421; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
5422; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
5423; AVX512BW-NEXT:    vpshufb %xmm9, %xmm11, %xmm11
5424; AVX512BW-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,0,1,1]
5425; AVX512BW-NEXT:    vinserti64x4 $1, %ymm11, %zmm3, %zmm3
5426; AVX512BW-NEXT:    movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631
5427; AVX512BW-NEXT:    kmovq %rax, %k4
5428; AVX512BW-NEXT:    vmovdqu8 %zmm10, %zmm3 {%k4}
5429; AVX512BW-NEXT:    vmovdqa64 32(%r8), %ymm16
5430; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm10 = [6,6,6,6,7,7,7,7,16,16,16,16,16,16,17,17]
5431; AVX512BW-NEXT:    vpermi2d %zmm16, %zmm2, %zmm10
5432; AVX512BW-NEXT:    movabsq $2380225041768974402, %rax # imm = 0x2108421084210842
5433; AVX512BW-NEXT:    kmovq %rax, %k2
5434; AVX512BW-NEXT:    vmovdqu8 %zmm10, %zmm3 {%k2}
5435; AVX512BW-NEXT:    vmovdqa64 32(%rdx), %ymm23
5436; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm10 = [128,128,12,13,128,128,128,128,14,128,128,128,14,15,128,128,128,128,16,128,128,128,16,17,128,128,128,128,18,128,128,128]
5437; AVX512BW-NEXT:    vpshufb %ymm10, %ymm23, %ymm17
5438; AVX512BW-NEXT:    vmovdqa64 32(%rcx), %ymm24
5439; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm11 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128]
5440; AVX512BW-NEXT:    vpshufb %ymm11, %ymm24, %ymm18
5441; AVX512BW-NEXT:    vporq %ymm17, %ymm18, %ymm17
5442; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} xmm20 = [128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12]
5443; AVX512BW-NEXT:    vpshufb %xmm20, %xmm12, %xmm12
5444; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} xmm22 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128]
5445; AVX512BW-NEXT:    vpshufb %xmm22, %xmm6, %xmm6
5446; AVX512BW-NEXT:    vpor %xmm6, %xmm12, %xmm6
5447; AVX512BW-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1]
5448; AVX512BW-NEXT:    vinserti64x4 $1, %ymm17, %zmm6, %zmm6
5449; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} xmm19 = [8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11]
5450; AVX512BW-NEXT:    vpshufb %xmm19, %xmm14, %xmm12
5451; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} xmm21 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128]
5452; AVX512BW-NEXT:    vpshufb %xmm21, %xmm13, %xmm13
5453; AVX512BW-NEXT:    vpor %xmm12, %xmm13, %xmm12
5454; AVX512BW-NEXT:    vpermq {{.*#+}} ymm14 = ymm12[0,0,1,1]
5455; AVX512BW-NEXT:    vmovdqa64 32(%rdi), %ymm25
5456; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [3,3,3,0,4,4,4,4]
5457; AVX512BW-NEXT:    vpermd %ymm25, %ymm12, %ymm17
5458; AVX512BW-NEXT:    vmovdqa64 32(%rsi), %ymm26
5459; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm13 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14]
5460; AVX512BW-NEXT:    movl $138547332, %eax # imm = 0x8421084
5461; AVX512BW-NEXT:    kmovd %eax, %k3
5462; AVX512BW-NEXT:    vpshufb %ymm13, %ymm26, %ymm17 {%k3}
5463; AVX512BW-NEXT:    vinserti64x4 $1, %ymm17, %zmm14, %zmm14
5464; AVX512BW-NEXT:    movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318
5465; AVX512BW-NEXT:    kmovq %rax, %k2
5466; AVX512BW-NEXT:    vmovdqu8 %zmm14, %zmm6 {%k2}
5467; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [3,3,3,3,0,4,4,4]
5468; AVX512BW-NEXT:    vpermd %ymm16, %ymm14, %ymm17
5469; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm18 = mem[1,1,2,2]
5470; AVX512BW-NEXT:    vpermq {{.*#+}} ymm18 = ymm18[0,1,1,1]
5471; AVX512BW-NEXT:    vinserti64x4 $1, %ymm17, %zmm18, %zmm17
5472; AVX512BW-NEXT:    movabsq $4760450083537948804, %rax # imm = 0x4210842108421084
5473; AVX512BW-NEXT:    kmovq %rax, %k6
5474; AVX512BW-NEXT:    vmovdqu8 %zmm17, %zmm6 {%k6}
5475; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm17 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128]
5476; AVX512BW-NEXT:    # ymm17 = mem[0,1,2,3,0,1,2,3]
5477; AVX512BW-NEXT:    vpshufb %ymm17, %ymm26, %ymm27
5478; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} ymm18 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128]
5479; AVX512BW-NEXT:    vpshufb %ymm18, %ymm25, %ymm28
5480; AVX512BW-NEXT:    vporq %ymm27, %ymm28, %ymm27
5481; AVX512BW-NEXT:    vpshufb %ymm15, %ymm26, %ymm15
5482; AVX512BW-NEXT:    vpshufhw {{.*#+}} ymm25 = ymm25[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15]
5483; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm25 = ymm25[2,2,3,3,6,6,7,7]
5484; AVX512BW-NEXT:    vmovdqu8 %ymm25, %ymm15 {%k5}
5485; AVX512BW-NEXT:    vinserti64x4 $1, %ymm15, %zmm27, %zmm15
5486; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} ymm25 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25]
5487; AVX512BW-NEXT:    vpshufb %ymm25, %ymm23, %ymm26
5488; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm27 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128]
5489; AVX512BW-NEXT:    # ymm27 = mem[0,1,2,3,0,1,2,3]
5490; AVX512BW-NEXT:    vpshufb %ymm27, %ymm24, %ymm28
5491; AVX512BW-NEXT:    vporq %ymm26, %ymm28, %ymm26
5492; AVX512BW-NEXT:    vpshufb %ymm8, %ymm24, %ymm8
5493; AVX512BW-NEXT:    vpshufhw {{.*#+}} ymm23 = ymm23[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14]
5494; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm23 = ymm23[2,2,3,3,6,6,7,7]
5495; AVX512BW-NEXT:    vmovdqu8 %ymm23, %ymm8 {%k1}
5496; AVX512BW-NEXT:    vinserti64x4 $1, %ymm8, %zmm26, %zmm8
5497; AVX512BW-NEXT:    vpermq {{.*#+}} zmm15 = zmm15[2,2,3,3,6,6,7,7]
5498; AVX512BW-NEXT:    vpermq {{.*#+}} zmm8 = zmm8[2,2,3,3,6,6,7,7]
5499; AVX512BW-NEXT:    vmovdqu8 %zmm15, %zmm8 {%k4}
5500; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm15 = [4,6,5,5,5,5,4,6,6,6,6,6,7,7,7,7]
5501; AVX512BW-NEXT:    vpermd %zmm16, %zmm15, %zmm15
5502; AVX512BW-NEXT:    vmovdqa64 (%rdx), %xmm16
5503; AVX512BW-NEXT:    movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108
5504; AVX512BW-NEXT:    kmovq %rax, %k1
5505; AVX512BW-NEXT:    vmovdqu8 %zmm15, %zmm8 {%k1}
5506; AVX512BW-NEXT:    vmovdqa (%rcx), %xmm15
5507; AVX512BW-NEXT:    vpshufb %xmm20, %xmm15, %xmm20
5508; AVX512BW-NEXT:    vpshufb %xmm22, %xmm16, %xmm22
5509; AVX512BW-NEXT:    vporq %xmm20, %xmm22, %xmm20
5510; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm15[0],xmm16[0],xmm15[1],xmm16[1],xmm15[2],xmm16[2],xmm15[3],xmm16[3],xmm15[4],xmm16[4],xmm15[5],xmm16[5],xmm15[6],xmm16[6],xmm15[7],xmm16[7]
5511; AVX512BW-NEXT:    vpshufb %xmm7, %xmm15, %xmm7
5512; AVX512BW-NEXT:    vmovdqa (%rsi), %xmm15
5513; AVX512BW-NEXT:    vinserti32x4 $2, %xmm20, %zmm7, %zmm7
5514; AVX512BW-NEXT:    vmovdqa64 (%rdi), %xmm16
5515; AVX512BW-NEXT:    vpshufb %xmm19, %xmm16, %xmm19
5516; AVX512BW-NEXT:    vpshufb %xmm21, %xmm15, %xmm20
5517; AVX512BW-NEXT:    vporq %xmm19, %xmm20, %xmm19
5518; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7]
5519; AVX512BW-NEXT:    vpshufb %xmm9, %xmm15, %xmm9
5520; AVX512BW-NEXT:    vinserti32x4 $2, %xmm19, %zmm9, %zmm9
5521; AVX512BW-NEXT:    vpermq {{.*#+}} zmm7 = zmm7[0,0,1,1,4,4,5,5]
5522; AVX512BW-NEXT:    vpermq {{.*#+}} zmm9 = zmm9[0,0,1,1,4,4,5,5]
5523; AVX512BW-NEXT:    movabsq $-4165393823095705204, %rax # imm = 0xC6318C6318C6318C
5524; AVX512BW-NEXT:    kmovq %rax, %k1
5525; AVX512BW-NEXT:    vmovdqu8 %zmm7, %zmm9 {%k1}
5526; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2]
5527; AVX512BW-NEXT:    vpermd %zmm2, %zmm7, %zmm2
5528; AVX512BW-NEXT:    movabsq $595056260442243600, %rax # imm = 0x842108421084210
5529; AVX512BW-NEXT:    kmovq %rax, %k1
5530; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm9 {%k1}
5531; AVX512BW-NEXT:    vpshufb %ymm10, %ymm1, %ymm2
5532; AVX512BW-NEXT:    vpshufb %ymm11, %ymm0, %ymm7
5533; AVX512BW-NEXT:    vpor %ymm2, %ymm7, %ymm2
5534; AVX512BW-NEXT:    vpshufb %ymm25, %ymm1, %ymm1
5535; AVX512BW-NEXT:    vpshufb %ymm27, %ymm0, %ymm0
5536; AVX512BW-NEXT:    vpor %ymm1, %ymm0, %ymm0
5537; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
5538; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
5539; AVX512BW-NEXT:    vpermd %ymm5, %ymm12, %ymm1
5540; AVX512BW-NEXT:    vpshufb %ymm13, %ymm4, %ymm1 {%k3}
5541; AVX512BW-NEXT:    vpshufb %ymm17, %ymm4, %ymm2
5542; AVX512BW-NEXT:    vpshufb %ymm18, %ymm5, %ymm4
5543; AVX512BW-NEXT:    vpor %ymm2, %ymm4, %ymm2
5544; AVX512BW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3]
5545; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
5546; AVX512BW-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k2}
5547; AVX512BW-NEXT:    vmovdqa (%r8), %ymm0
5548; AVX512BW-NEXT:    vpermd %ymm0, %ymm14, %ymm2
5549; AVX512BW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,1,1,4,6,5,5]
5550; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,3,2]
5551; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
5552; AVX512BW-NEXT:    movabsq $1190112520884487201, %rax # imm = 0x1084210842108421
5553; AVX512BW-NEXT:    kmovq %rax, %k1
5554; AVX512BW-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
5555; AVX512BW-NEXT:    vmovdqa64 %zmm1, 64(%r9)
5556; AVX512BW-NEXT:    vmovdqa64 %zmm9, (%r9)
5557; AVX512BW-NEXT:    vmovdqa64 %zmm8, 256(%r9)
5558; AVX512BW-NEXT:    vmovdqa64 %zmm6, 192(%r9)
5559; AVX512BW-NEXT:    vmovdqa64 %zmm3, 128(%r9)
5560; AVX512BW-NEXT:    vzeroupper
5561; AVX512BW-NEXT:    retq
5562;
5563; AVX512BW-FCP-LABEL: store_i8_stride5_vf64:
5564; AVX512BW-FCP:       # %bb.0:
5565; AVX512BW-FCP-NEXT:    vmovdqa64 (%r8), %zmm0
5566; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdx), %ymm8
5567; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [128,128,12,13,128,128,128,128,14,128,128,128,14,15,128,128,128,128,16,128,128,128,16,17,128,128,128,128,18,128,128,128]
5568; AVX512BW-FCP-NEXT:    vpshufb %ymm6, %ymm8, %ymm1
5569; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rcx), %ymm21
5570; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128]
5571; AVX512BW-FCP-NEXT:    vpshufb %ymm7, %ymm21, %ymm2
5572; AVX512BW-FCP-NEXT:    vpor %ymm1, %ymm2, %ymm1
5573; AVX512BW-FCP-NEXT:    vmovdqa (%rcx), %xmm11
5574; AVX512BW-FCP-NEXT:    vmovdqa 32(%rcx), %xmm2
5575; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm14 = [128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12]
5576; AVX512BW-FCP-NEXT:    vpshufb %xmm14, %xmm2, %xmm3
5577; AVX512BW-FCP-NEXT:    vmovdqa (%rdx), %xmm12
5578; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdx), %xmm4
5579; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm17 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128]
5580; AVX512BW-FCP-NEXT:    vpshufb %xmm17, %xmm4, %xmm5
5581; AVX512BW-FCP-NEXT:    vpor %xmm3, %xmm5, %xmm3
5582; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1]
5583; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
5584; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %xmm13
5585; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdi), %xmm3
5586; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm19 = [8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11]
5587; AVX512BW-FCP-NEXT:    vpshufb %xmm19, %xmm3, %xmm9
5588; AVX512BW-FCP-NEXT:    vmovdqa64 (%rsi), %xmm18
5589; AVX512BW-FCP-NEXT:    vmovdqa 32(%rsi), %xmm5
5590; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm20 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128]
5591; AVX512BW-FCP-NEXT:    vpshufb %xmm20, %xmm5, %xmm10
5592; AVX512BW-FCP-NEXT:    vpor %xmm9, %xmm10, %xmm9
5593; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm9[0,0,1,1]
5594; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rdi), %ymm16
5595; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [3,3,3,0,4,4,4,4]
5596; AVX512BW-FCP-NEXT:    vpermd %ymm16, %ymm9, %ymm22
5597; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rsi), %ymm23
5598; AVX512BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm10 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14]
5599; AVX512BW-FCP-NEXT:    movl $138547332, %eax # imm = 0x8421084
5600; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
5601; AVX512BW-FCP-NEXT:    vpshufb %ymm10, %ymm23, %ymm22 {%k1}
5602; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm22, %zmm15, %zmm15
5603; AVX512BW-FCP-NEXT:    movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318
5604; AVX512BW-FCP-NEXT:    kmovq %rax, %k2
5605; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm15, %zmm1 {%k2}
5606; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm15 = [9,9,10,10,10,10,10,10,11,11,11,11,0,12,12,12]
5607; AVX512BW-FCP-NEXT:    vpermd %zmm0, %zmm15, %zmm15
5608; AVX512BW-FCP-NEXT:    movabsq $4760450083537948804, %rax # imm = 0x4210842108421084
5609; AVX512BW-FCP-NEXT:    kmovq %rax, %k3
5610; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm15, %zmm1 {%k3}
5611; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],mem[4,5,6,7]
5612; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm15 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30]
5613; AVX512BW-FCP-NEXT:    vpshufb %zmm15, %zmm22, %zmm22
5614; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm23 = zmm16[0,1,2,3],mem[4,5,6,7]
5615; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128]
5616; AVX512BW-FCP-NEXT:    vpshufb %zmm16, %zmm23, %zmm23
5617; AVX512BW-FCP-NEXT:    vporq %zmm22, %zmm23, %zmm22
5618; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm22 = zmm22[2,2,3,3,6,6,7,7]
5619; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],mem[4,5,6,7]
5620; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm23 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128]
5621; AVX512BW-FCP-NEXT:    vpshufb %zmm23, %zmm8, %zmm8
5622; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],mem[4,5,6,7]
5623; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm24 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128]
5624; AVX512BW-FCP-NEXT:    vpshufb %zmm24, %zmm21, %zmm21
5625; AVX512BW-FCP-NEXT:    vporq %zmm8, %zmm21, %zmm8
5626; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm8 = zmm8[2,2,3,3,6,6,7,7]
5627; AVX512BW-FCP-NEXT:    movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631
5628; AVX512BW-FCP-NEXT:    kmovq %rax, %k3
5629; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm22, %zmm8 {%k3}
5630; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm21 = [12,14,13,13,13,13,12,14,14,14,14,14,15,15,15,15]
5631; AVX512BW-FCP-NEXT:    vpermd %zmm0, %zmm21, %zmm21
5632; AVX512BW-FCP-NEXT:    movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108
5633; AVX512BW-FCP-NEXT:    kmovq %rax, %k4
5634; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm21, %zmm8 {%k4}
5635; AVX512BW-FCP-NEXT:    vpshufb %xmm14, %xmm11, %xmm14
5636; AVX512BW-FCP-NEXT:    vpshufb %xmm17, %xmm12, %xmm17
5637; AVX512BW-FCP-NEXT:    vporq %xmm14, %xmm17, %xmm14
5638; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
5639; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
5640; AVX512BW-FCP-NEXT:    vpshufb %xmm12, %xmm11, %xmm11
5641; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm14, %zmm11, %zmm11
5642; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm14 = zmm11[0,0,1,1,4,4,5,5]
5643; AVX512BW-FCP-NEXT:    vpshufb %xmm19, %xmm13, %xmm11
5644; AVX512BW-FCP-NEXT:    vpshufb %xmm20, %xmm18, %xmm17
5645; AVX512BW-FCP-NEXT:    vporq %xmm11, %xmm17, %xmm11
5646; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm18[0],xmm13[1],xmm18[1],xmm13[2],xmm18[2],xmm13[3],xmm18[3],xmm13[4],xmm18[4],xmm13[5],xmm18[5],xmm13[6],xmm18[6],xmm13[7],xmm18[7]
5647; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm17 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
5648; AVX512BW-FCP-NEXT:    vpshufb %xmm17, %xmm13, %xmm13
5649; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm11, %zmm13, %zmm11
5650; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm11 = zmm11[0,0,1,1,4,4,5,5]
5651; AVX512BW-FCP-NEXT:    movabsq $-4165393823095705204, %rax # imm = 0xC6318C6318C6318C
5652; AVX512BW-FCP-NEXT:    kmovq %rax, %k4
5653; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm11 {%k4}
5654; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = mem[0,1,2,3,0,1,2,3]
5655; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10]
5656; AVX512BW-FCP-NEXT:    vpermd %zmm13, %zmm14, %zmm14
5657; AVX512BW-FCP-NEXT:    movabsq $595056260442243600, %rax # imm = 0x842108421084210
5658; AVX512BW-FCP-NEXT:    kmovq %rax, %k4
5659; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm11 {%k4}
5660; AVX512BW-FCP-NEXT:    vmovdqa (%rdx), %ymm14
5661; AVX512BW-FCP-NEXT:    vpshufb %ymm6, %ymm14, %ymm6
5662; AVX512BW-FCP-NEXT:    vmovdqa64 (%rcx), %ymm18
5663; AVX512BW-FCP-NEXT:    vpshufb %ymm7, %ymm18, %ymm7
5664; AVX512BW-FCP-NEXT:    vpor %ymm6, %ymm7, %ymm6
5665; AVX512BW-FCP-NEXT:    vpshufb %ymm23, %ymm14, %ymm7
5666; AVX512BW-FCP-NEXT:    vpshufb %ymm24, %ymm18, %ymm19
5667; AVX512BW-FCP-NEXT:    vporq %ymm7, %ymm19, %ymm7
5668; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3]
5669; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm6
5670; AVX512BW-FCP-NEXT:    vmovdqa (%rsi), %ymm7
5671; AVX512BW-FCP-NEXT:    vpshufb %ymm15, %ymm7, %ymm15
5672; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %ymm19
5673; AVX512BW-FCP-NEXT:    vpshufb %ymm16, %ymm19, %ymm16
5674; AVX512BW-FCP-NEXT:    vporq %ymm15, %ymm16, %ymm15
5675; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3]
5676; AVX512BW-FCP-NEXT:    vpermd %ymm19, %ymm9, %ymm9
5677; AVX512BW-FCP-NEXT:    vpshufb %ymm10, %ymm7, %ymm9 {%k1}
5678; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm9, %zmm9
5679; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm6, %zmm9 {%k2}
5680; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm6 = [3,3,3,3,0,4,4,4,12,14,13,13,13,13,12,14]
5681; AVX512BW-FCP-NEXT:    vpermd %zmm13, %zmm6, %zmm6
5682; AVX512BW-FCP-NEXT:    movabsq $1190112520884487201, %rax # imm = 0x1084210842108421
5683; AVX512BW-FCP-NEXT:    kmovq %rax, %k1
5684; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm6, %zmm9 {%k1}
5685; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
5686; AVX512BW-FCP-NEXT:    vpshufb %xmm12, %xmm2, %xmm2
5687; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm18[27],zero,zero,ymm18[26],zero,ymm18[28],zero,ymm18[30],zero,zero,ymm18[29],zero,ymm18[31],zero
5688; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm14[27],zero,zero,ymm14[26],zero,ymm14[28],zero,ymm14[30],zero,zero,ymm14[29],zero,ymm14[31],zero,zero
5689; AVX512BW-FCP-NEXT:    vpor %ymm4, %ymm6, %ymm4
5690; AVX512BW-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm6 = [2,2,3,3,8,8,9,9]
5691; AVX512BW-FCP-NEXT:    vpermt2q %zmm2, %zmm6, %zmm4
5692; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
5693; AVX512BW-FCP-NEXT:    vpshufb %xmm17, %xmm2, %xmm2
5694; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm7[26],zero,ymm7[28],zero,zero,zero,zero,ymm7[29],zero,ymm7[31],zero,zero,ymm7[30]
5695; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm19[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm19[26],zero,ymm19[28],zero,zero,ymm19[27],zero,ymm19[29],zero,ymm19[31],zero,zero,ymm19[30],zero
5696; AVX512BW-FCP-NEXT:    vpor %ymm3, %ymm5, %ymm3
5697; AVX512BW-FCP-NEXT:    vpermt2q %zmm2, %zmm6, %zmm3
5698; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm4, %zmm3 {%k3}
5699; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [6,6,6,6,7,7,7,7,8,8,8,8,8,8,9,9]
5700; AVX512BW-FCP-NEXT:    vpermd %zmm0, %zmm2, %zmm0
5701; AVX512BW-FCP-NEXT:    movabsq $2380225041768974402, %rax # imm = 0x2108421084210842
5702; AVX512BW-FCP-NEXT:    kmovq %rax, %k1
5703; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm3 {%k1}
5704; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, 128(%r9)
5705; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, 64(%r9)
5706; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, (%r9)
5707; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, 256(%r9)
5708; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, 192(%r9)
5709; AVX512BW-FCP-NEXT:    vzeroupper
5710; AVX512BW-FCP-NEXT:    retq
5711;
5712; AVX512DQ-BW-LABEL: store_i8_stride5_vf64:
5713; AVX512DQ-BW:       # %bb.0:
5714; AVX512DQ-BW-NEXT:    vmovdqa64 (%r8), %zmm2
5715; AVX512DQ-BW-NEXT:    vmovdqa (%rcx), %ymm0
5716; AVX512DQ-BW-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12]
5717; AVX512DQ-BW-NEXT:    vpshufb %ymm8, %ymm0, %ymm3
5718; AVX512DQ-BW-NEXT:    vmovdqa (%rdx), %ymm1
5719; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} ymm4 = ymm1[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14]
5720; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,2,3,3,6,6,7,7]
5721; AVX512DQ-BW-NEXT:    movl $693250386, %eax # imm = 0x29522952
5722; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
5723; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm4, %ymm3 {%k1}
5724; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3]
5725; AVX512DQ-BW-NEXT:    vmovdqa 32(%rdx), %xmm6
5726; AVX512DQ-BW-NEXT:    vmovdqa 32(%rcx), %xmm12
5727; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
5728; AVX512DQ-BW-NEXT:    vmovdqa {{.*#+}} xmm7 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
5729; AVX512DQ-BW-NEXT:    vpshufb %xmm7, %xmm4, %xmm4
5730; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1]
5731; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm10
5732; AVX512DQ-BW-NEXT:    vmovdqa (%rsi), %ymm4
5733; AVX512DQ-BW-NEXT:    vpbroadcastq {{.*#+}} ymm15 = [11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14]
5734; AVX512DQ-BW-NEXT:    vpshufb %ymm15, %ymm4, %ymm3
5735; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %ymm5
5736; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm5[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15]
5737; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,2,3,3,6,6,7,7]
5738; AVX512DQ-BW-NEXT:    movl $1251232404, %eax # imm = 0x4A944A94
5739; AVX512DQ-BW-NEXT:    kmovd %eax, %k5
5740; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm9, %ymm3 {%k5}
5741; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3]
5742; AVX512DQ-BW-NEXT:    vmovdqa 32(%rsi), %xmm13
5743; AVX512DQ-BW-NEXT:    vmovdqa 32(%rdi), %xmm14
5744; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
5745; AVX512DQ-BW-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
5746; AVX512DQ-BW-NEXT:    vpshufb %xmm9, %xmm11, %xmm11
5747; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,0,1,1]
5748; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm11, %zmm3, %zmm3
5749; AVX512DQ-BW-NEXT:    movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631
5750; AVX512DQ-BW-NEXT:    kmovq %rax, %k4
5751; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm10, %zmm3 {%k4}
5752; AVX512DQ-BW-NEXT:    vmovdqa64 32(%r8), %ymm16
5753; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm10 = [6,6,6,6,7,7,7,7,16,16,16,16,16,16,17,17]
5754; AVX512DQ-BW-NEXT:    vpermi2d %zmm16, %zmm2, %zmm10
5755; AVX512DQ-BW-NEXT:    movabsq $2380225041768974402, %rax # imm = 0x2108421084210842
5756; AVX512DQ-BW-NEXT:    kmovq %rax, %k2
5757; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm10, %zmm3 {%k2}
5758; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rdx), %ymm23
5759; AVX512DQ-BW-NEXT:    vmovdqa {{.*#+}} ymm10 = [128,128,12,13,128,128,128,128,14,128,128,128,14,15,128,128,128,128,16,128,128,128,16,17,128,128,128,128,18,128,128,128]
5760; AVX512DQ-BW-NEXT:    vpshufb %ymm10, %ymm23, %ymm17
5761; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rcx), %ymm24
5762; AVX512DQ-BW-NEXT:    vmovdqa {{.*#+}} ymm11 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128]
5763; AVX512DQ-BW-NEXT:    vpshufb %ymm11, %ymm24, %ymm18
5764; AVX512DQ-BW-NEXT:    vporq %ymm17, %ymm18, %ymm17
5765; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} xmm20 = [128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12]
5766; AVX512DQ-BW-NEXT:    vpshufb %xmm20, %xmm12, %xmm12
5767; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} xmm22 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128]
5768; AVX512DQ-BW-NEXT:    vpshufb %xmm22, %xmm6, %xmm6
5769; AVX512DQ-BW-NEXT:    vpor %xmm6, %xmm12, %xmm6
5770; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1]
5771; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm17, %zmm6, %zmm6
5772; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} xmm19 = [8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11]
5773; AVX512DQ-BW-NEXT:    vpshufb %xmm19, %xmm14, %xmm12
5774; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} xmm21 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128]
5775; AVX512DQ-BW-NEXT:    vpshufb %xmm21, %xmm13, %xmm13
5776; AVX512DQ-BW-NEXT:    vpor %xmm12, %xmm13, %xmm12
5777; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm14 = ymm12[0,0,1,1]
5778; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rdi), %ymm25
5779; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [3,3,3,0,4,4,4,4]
5780; AVX512DQ-BW-NEXT:    vpermd %ymm25, %ymm12, %ymm17
5781; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rsi), %ymm26
5782; AVX512DQ-BW-NEXT:    vpbroadcastq {{.*#+}} ymm13 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14]
5783; AVX512DQ-BW-NEXT:    movl $138547332, %eax # imm = 0x8421084
5784; AVX512DQ-BW-NEXT:    kmovd %eax, %k3
5785; AVX512DQ-BW-NEXT:    vpshufb %ymm13, %ymm26, %ymm17 {%k3}
5786; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm17, %zmm14, %zmm14
5787; AVX512DQ-BW-NEXT:    movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318
5788; AVX512DQ-BW-NEXT:    kmovq %rax, %k2
5789; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm14, %zmm6 {%k2}
5790; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [3,3,3,3,0,4,4,4]
5791; AVX512DQ-BW-NEXT:    vpermd %ymm16, %ymm14, %ymm17
5792; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm18 = mem[1,1,2,2]
5793; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm18 = ymm18[0,1,1,1]
5794; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm17, %zmm18, %zmm17
5795; AVX512DQ-BW-NEXT:    movabsq $4760450083537948804, %rax # imm = 0x4210842108421084
5796; AVX512DQ-BW-NEXT:    kmovq %rax, %k6
5797; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm17, %zmm6 {%k6}
5798; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm17 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128]
5799; AVX512DQ-BW-NEXT:    # ymm17 = mem[0,1,2,3,0,1,2,3]
5800; AVX512DQ-BW-NEXT:    vpshufb %ymm17, %ymm26, %ymm27
5801; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} ymm18 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128]
5802; AVX512DQ-BW-NEXT:    vpshufb %ymm18, %ymm25, %ymm28
5803; AVX512DQ-BW-NEXT:    vporq %ymm27, %ymm28, %ymm27
5804; AVX512DQ-BW-NEXT:    vpshufb %ymm15, %ymm26, %ymm15
5805; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} ymm25 = ymm25[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15]
5806; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm25 = ymm25[2,2,3,3,6,6,7,7]
5807; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm25, %ymm15 {%k5}
5808; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm15, %zmm27, %zmm15
5809; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} ymm25 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25]
5810; AVX512DQ-BW-NEXT:    vpshufb %ymm25, %ymm23, %ymm26
5811; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm27 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128]
5812; AVX512DQ-BW-NEXT:    # ymm27 = mem[0,1,2,3,0,1,2,3]
5813; AVX512DQ-BW-NEXT:    vpshufb %ymm27, %ymm24, %ymm28
5814; AVX512DQ-BW-NEXT:    vporq %ymm26, %ymm28, %ymm26
5815; AVX512DQ-BW-NEXT:    vpshufb %ymm8, %ymm24, %ymm8
5816; AVX512DQ-BW-NEXT:    vpshufhw {{.*#+}} ymm23 = ymm23[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14]
5817; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm23 = ymm23[2,2,3,3,6,6,7,7]
5818; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm23, %ymm8 {%k1}
5819; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm8, %zmm26, %zmm8
5820; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm15 = zmm15[2,2,3,3,6,6,7,7]
5821; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm8 = zmm8[2,2,3,3,6,6,7,7]
5822; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm15, %zmm8 {%k4}
5823; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm15 = [4,6,5,5,5,5,4,6,6,6,6,6,7,7,7,7]
5824; AVX512DQ-BW-NEXT:    vpermd %zmm16, %zmm15, %zmm15
5825; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdx), %xmm16
5826; AVX512DQ-BW-NEXT:    movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108
5827; AVX512DQ-BW-NEXT:    kmovq %rax, %k1
5828; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm15, %zmm8 {%k1}
5829; AVX512DQ-BW-NEXT:    vmovdqa (%rcx), %xmm15
5830; AVX512DQ-BW-NEXT:    vpshufb %xmm20, %xmm15, %xmm20
5831; AVX512DQ-BW-NEXT:    vpshufb %xmm22, %xmm16, %xmm22
5832; AVX512DQ-BW-NEXT:    vporq %xmm20, %xmm22, %xmm20
5833; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm15[0],xmm16[0],xmm15[1],xmm16[1],xmm15[2],xmm16[2],xmm15[3],xmm16[3],xmm15[4],xmm16[4],xmm15[5],xmm16[5],xmm15[6],xmm16[6],xmm15[7],xmm16[7]
5834; AVX512DQ-BW-NEXT:    vpshufb %xmm7, %xmm15, %xmm7
5835; AVX512DQ-BW-NEXT:    vmovdqa (%rsi), %xmm15
5836; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm20, %zmm7, %zmm7
5837; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %xmm16
5838; AVX512DQ-BW-NEXT:    vpshufb %xmm19, %xmm16, %xmm19
5839; AVX512DQ-BW-NEXT:    vpshufb %xmm21, %xmm15, %xmm20
5840; AVX512DQ-BW-NEXT:    vporq %xmm19, %xmm20, %xmm19
5841; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7]
5842; AVX512DQ-BW-NEXT:    vpshufb %xmm9, %xmm15, %xmm9
5843; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm19, %zmm9, %zmm9
5844; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm7 = zmm7[0,0,1,1,4,4,5,5]
5845; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm9 = zmm9[0,0,1,1,4,4,5,5]
5846; AVX512DQ-BW-NEXT:    movabsq $-4165393823095705204, %rax # imm = 0xC6318C6318C6318C
5847; AVX512DQ-BW-NEXT:    kmovq %rax, %k1
5848; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm7, %zmm9 {%k1}
5849; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2]
5850; AVX512DQ-BW-NEXT:    vpermd %zmm2, %zmm7, %zmm2
5851; AVX512DQ-BW-NEXT:    movabsq $595056260442243600, %rax # imm = 0x842108421084210
5852; AVX512DQ-BW-NEXT:    kmovq %rax, %k1
5853; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm2, %zmm9 {%k1}
5854; AVX512DQ-BW-NEXT:    vpshufb %ymm10, %ymm1, %ymm2
5855; AVX512DQ-BW-NEXT:    vpshufb %ymm11, %ymm0, %ymm7
5856; AVX512DQ-BW-NEXT:    vpor %ymm2, %ymm7, %ymm2
5857; AVX512DQ-BW-NEXT:    vpshufb %ymm25, %ymm1, %ymm1
5858; AVX512DQ-BW-NEXT:    vpshufb %ymm27, %ymm0, %ymm0
5859; AVX512DQ-BW-NEXT:    vpor %ymm1, %ymm0, %ymm0
5860; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
5861; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
5862; AVX512DQ-BW-NEXT:    vpermd %ymm5, %ymm12, %ymm1
5863; AVX512DQ-BW-NEXT:    vpshufb %ymm13, %ymm4, %ymm1 {%k3}
5864; AVX512DQ-BW-NEXT:    vpshufb %ymm17, %ymm4, %ymm2
5865; AVX512DQ-BW-NEXT:    vpshufb %ymm18, %ymm5, %ymm4
5866; AVX512DQ-BW-NEXT:    vpor %ymm2, %ymm4, %ymm2
5867; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3]
5868; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
5869; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k2}
5870; AVX512DQ-BW-NEXT:    vmovdqa (%r8), %ymm0
5871; AVX512DQ-BW-NEXT:    vpermd %ymm0, %ymm14, %ymm2
5872; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,1,1,4,6,5,5]
5873; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,3,2]
5874; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
5875; AVX512DQ-BW-NEXT:    movabsq $1190112520884487201, %rax # imm = 0x1084210842108421
5876; AVX512DQ-BW-NEXT:    kmovq %rax, %k1
5877; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
5878; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, 64(%r9)
5879; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, (%r9)
5880; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm8, 256(%r9)
5881; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, 192(%r9)
5882; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, 128(%r9)
5883; AVX512DQ-BW-NEXT:    vzeroupper
5884; AVX512DQ-BW-NEXT:    retq
5885;
5886; AVX512DQ-BW-FCP-LABEL: store_i8_stride5_vf64:
5887; AVX512DQ-BW-FCP:       # %bb.0:
5888; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%r8), %zmm0
5889; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdx), %ymm8
5890; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [128,128,12,13,128,128,128,128,14,128,128,128,14,15,128,128,128,128,16,128,128,128,16,17,128,128,128,128,18,128,128,128]
5891; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm6, %ymm8, %ymm1
5892; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rcx), %ymm21
5893; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128]
5894; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm7, %ymm21, %ymm2
5895; AVX512DQ-BW-FCP-NEXT:    vpor %ymm1, %ymm2, %ymm1
5896; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rcx), %xmm11
5897; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rcx), %xmm2
5898; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm14 = [128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12]
5899; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm14, %xmm2, %xmm3
5900; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdx), %xmm12
5901; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdx), %xmm4
5902; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm17 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128]
5903; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm17, %xmm4, %xmm5
5904; AVX512DQ-BW-FCP-NEXT:    vpor %xmm3, %xmm5, %xmm3
5905; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1]
5906; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
5907; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %xmm13
5908; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdi), %xmm3
5909; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm19 = [8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11]
5910; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm19, %xmm3, %xmm9
5911; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rsi), %xmm18
5912; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rsi), %xmm5
5913; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm20 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128]
5914; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm20, %xmm5, %xmm10
5915; AVX512DQ-BW-FCP-NEXT:    vpor %xmm9, %xmm10, %xmm9
5916; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm9[0,0,1,1]
5917; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rdi), %ymm16
5918; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [3,3,3,0,4,4,4,4]
5919; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm16, %ymm9, %ymm22
5920; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rsi), %ymm23
5921; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm10 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14]
5922; AVX512DQ-BW-FCP-NEXT:    movl $138547332, %eax # imm = 0x8421084
5923; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
5924; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm10, %ymm23, %ymm22 {%k1}
5925; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm22, %zmm15, %zmm15
5926; AVX512DQ-BW-FCP-NEXT:    movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318
5927; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k2
5928; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm15, %zmm1 {%k2}
5929; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm15 = [9,9,10,10,10,10,10,10,11,11,11,11,0,12,12,12]
5930; AVX512DQ-BW-FCP-NEXT:    vpermd %zmm0, %zmm15, %zmm15
5931; AVX512DQ-BW-FCP-NEXT:    movabsq $4760450083537948804, %rax # imm = 0x4210842108421084
5932; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k3
5933; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm15, %zmm1 {%k3}
5934; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],mem[4,5,6,7]
5935; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm15 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30]
5936; AVX512DQ-BW-FCP-NEXT:    vpshufb %zmm15, %zmm22, %zmm22
5937; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm23 = zmm16[0,1,2,3],mem[4,5,6,7]
5938; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128]
5939; AVX512DQ-BW-FCP-NEXT:    vpshufb %zmm16, %zmm23, %zmm23
5940; AVX512DQ-BW-FCP-NEXT:    vporq %zmm22, %zmm23, %zmm22
5941; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm22 = zmm22[2,2,3,3,6,6,7,7]
5942; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],mem[4,5,6,7]
5943; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm23 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128]
5944; AVX512DQ-BW-FCP-NEXT:    vpshufb %zmm23, %zmm8, %zmm8
5945; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],mem[4,5,6,7]
5946; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm24 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128]
5947; AVX512DQ-BW-FCP-NEXT:    vpshufb %zmm24, %zmm21, %zmm21
5948; AVX512DQ-BW-FCP-NEXT:    vporq %zmm8, %zmm21, %zmm8
5949; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm8 = zmm8[2,2,3,3,6,6,7,7]
5950; AVX512DQ-BW-FCP-NEXT:    movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631
5951; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k3
5952; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm22, %zmm8 {%k3}
5953; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm21 = [12,14,13,13,13,13,12,14,14,14,14,14,15,15,15,15]
5954; AVX512DQ-BW-FCP-NEXT:    vpermd %zmm0, %zmm21, %zmm21
5955; AVX512DQ-BW-FCP-NEXT:    movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108
5956; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k4
5957; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm21, %zmm8 {%k4}
5958; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm14, %xmm11, %xmm14
5959; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm17, %xmm12, %xmm17
5960; AVX512DQ-BW-FCP-NEXT:    vporq %xmm14, %xmm17, %xmm14
5961; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
5962; AVX512DQ-BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
5963; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm12, %xmm11, %xmm11
5964; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm14, %zmm11, %zmm11
5965; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm14 = zmm11[0,0,1,1,4,4,5,5]
5966; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm19, %xmm13, %xmm11
5967; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm20, %xmm18, %xmm17
5968; AVX512DQ-BW-FCP-NEXT:    vporq %xmm11, %xmm17, %xmm11
5969; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm18[0],xmm13[1],xmm18[1],xmm13[2],xmm18[2],xmm13[3],xmm18[3],xmm13[4],xmm18[4],xmm13[5],xmm18[5],xmm13[6],xmm18[6],xmm13[7],xmm18[7]
5970; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm17 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
5971; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm17, %xmm13, %xmm13
5972; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm11, %zmm13, %zmm11
5973; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm11 = zmm11[0,0,1,1,4,4,5,5]
5974; AVX512DQ-BW-FCP-NEXT:    movabsq $-4165393823095705204, %rax # imm = 0xC6318C6318C6318C
5975; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k4
5976; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm11 {%k4}
5977; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = mem[0,1,2,3,0,1,2,3]
5978; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10]
5979; AVX512DQ-BW-FCP-NEXT:    vpermd %zmm13, %zmm14, %zmm14
5980; AVX512DQ-BW-FCP-NEXT:    movabsq $595056260442243600, %rax # imm = 0x842108421084210
5981; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k4
5982; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm11 {%k4}
5983; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdx), %ymm14
5984; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm6, %ymm14, %ymm6
5985; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rcx), %ymm18
5986; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm7, %ymm18, %ymm7
5987; AVX512DQ-BW-FCP-NEXT:    vpor %ymm6, %ymm7, %ymm6
5988; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm23, %ymm14, %ymm7
5989; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm24, %ymm18, %ymm19
5990; AVX512DQ-BW-FCP-NEXT:    vporq %ymm7, %ymm19, %ymm7
5991; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3]
5992; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm6
5993; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rsi), %ymm7
5994; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm15, %ymm7, %ymm15
5995; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %ymm19
5996; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm16, %ymm19, %ymm16
5997; AVX512DQ-BW-FCP-NEXT:    vporq %ymm15, %ymm16, %ymm15
5998; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3]
5999; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm19, %ymm9, %ymm9
6000; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm10, %ymm7, %ymm9 {%k1}
6001; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm9, %zmm9
6002; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm6, %zmm9 {%k2}
6003; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm6 = [3,3,3,3,0,4,4,4,12,14,13,13,13,13,12,14]
6004; AVX512DQ-BW-FCP-NEXT:    vpermd %zmm13, %zmm6, %zmm6
6005; AVX512DQ-BW-FCP-NEXT:    movabsq $1190112520884487201, %rax # imm = 0x1084210842108421
6006; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k1
6007; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm6, %zmm9 {%k1}
6008; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
6009; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm12, %xmm2, %xmm2
6010; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm18[27],zero,zero,ymm18[26],zero,ymm18[28],zero,ymm18[30],zero,zero,ymm18[29],zero,ymm18[31],zero
6011; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm14[27],zero,zero,ymm14[26],zero,ymm14[28],zero,ymm14[30],zero,zero,ymm14[29],zero,ymm14[31],zero,zero
6012; AVX512DQ-BW-FCP-NEXT:    vpor %ymm4, %ymm6, %ymm4
6013; AVX512DQ-BW-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm6 = [2,2,3,3,8,8,9,9]
6014; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm2, %zmm6, %zmm4
6015; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
6016; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm17, %xmm2, %xmm2
6017; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm7[26],zero,ymm7[28],zero,zero,zero,zero,ymm7[29],zero,ymm7[31],zero,zero,ymm7[30]
6018; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm19[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm19[26],zero,ymm19[28],zero,zero,ymm19[27],zero,ymm19[29],zero,ymm19[31],zero,zero,ymm19[30],zero
6019; AVX512DQ-BW-FCP-NEXT:    vpor %ymm3, %ymm5, %ymm3
6020; AVX512DQ-BW-FCP-NEXT:    vpermt2q %zmm2, %zmm6, %zmm3
6021; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm4, %zmm3 {%k3}
6022; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [6,6,6,6,7,7,7,7,8,8,8,8,8,8,9,9]
6023; AVX512DQ-BW-FCP-NEXT:    vpermd %zmm0, %zmm2, %zmm0
6024; AVX512DQ-BW-FCP-NEXT:    movabsq $2380225041768974402, %rax # imm = 0x2108421084210842
6025; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k1
6026; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm3 {%k1}
6027; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, 128(%r9)
6028; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, 64(%r9)
6029; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, (%r9)
6030; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, 256(%r9)
6031; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, 192(%r9)
6032; AVX512DQ-BW-FCP-NEXT:    vzeroupper
6033; AVX512DQ-BW-FCP-NEXT:    retq
6034  %in.vec0 = load <64 x i8>, ptr %in.vecptr0, align 64
6035  %in.vec1 = load <64 x i8>, ptr %in.vecptr1, align 64
6036  %in.vec2 = load <64 x i8>, ptr %in.vecptr2, align 64
6037  %in.vec3 = load <64 x i8>, ptr %in.vecptr3, align 64
6038  %in.vec4 = load <64 x i8>, ptr %in.vecptr4, align 64
6039  %1 = shufflevector <64 x i8> %in.vec0, <64 x i8> %in.vec1, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
6040  %2 = shufflevector <64 x i8> %in.vec2, <64 x i8> %in.vec3, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
6041  %3 = shufflevector <128 x i8> %1, <128 x i8> %2, <256 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255>
6042  %4 = shufflevector <64 x i8> %in.vec4, <64 x i8> poison, <256 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
6043  %5 = shufflevector <256 x i8> %3, <256 x i8> %4, <320 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255, i32 256, i32 257, i32 258, i32 259, i32 260, i32 261, i32 262, i32 263, i32 264, i32 265, i32 266, i32 267, i32 268, i32 269, i32 270, i32 271, i32 272, i32 273, i32 274, i32 275, i32 276, i32 277, i32 278, i32 279, i32 280, i32 281, i32 282, i32 283, i32 284, i32 285, i32 286, i32 287, i32 288, i32 289, i32 290, i32 291, i32 292, i32 293, i32 294, i32 295, i32 296, i32 297, i32 298, i32 299, i32 300, i32 301, i32 302, i32 303, i32 304, i32 305, i32 306, i32 307, i32 308, i32 309, i32 310, i32 311, i32 312, i32 313, i32 314, i32 315, i32 316, i32 317, i32 318, i32 319>
6044  %interleaved.vec = shufflevector <320 x i8> %5, <320 x i8> poison, <320 x i32> <i32 0, i32 64, i32 128, i32 192, i32 256, i32 1, i32 65, i32 129, i32 193, i32 257, i32 2, i32 66, i32 130, i32 194, i32 258, i32 3, i32 67, i32 131, i32 195, i32 259, i32 4, i32 68, i32 132, i32 196, i32 260, i32 5, i32 69, i32 133, i32 197, i32 261, i32 6, i32 70, i32 134, i32 198, i32 262, i32 7, i32 71, i32 135, i32 199, i32 263, i32 8, i32 72, i32 136, i32 200, i32 264, i32 9, i32 73, i32 137, i32 201, i32 265, i32 10, i32 74, i32 138, i32 202, i32 266, i32 11, i32 75, i32 139, i32 203, i32 267, i32 12, i32 76, i32 140, i32 204, i32 268, i32 13, i32 77, i32 141, i32 205, i32 269, i32 14, i32 78, i32 142, i32 206, i32 270, i32 15, i32 79, i32 143, i32 207, i32 271, i32 16, i32 80, i32 144, i32 208, i32 272, i32 17, i32 81, i32 145, i32 209, i32 273, i32 18, i32 82, i32 146, i32 210, i32 274, i32 19, i32 83, i32 147, i32 211, i32 275, i32 20, i32 84, i32 148, i32 212, i32 276, i32 21, i32 85, i32 149, i32 213, i32 277, i32 22, i32 86, i32 150, i32 214, i32 278, i32 23, i32 87, i32 151, i32 215, i32 279, i32 24, i32 88, i32 152, i32 216, i32 280, i32 25, i32 89, i32 153, i32 217, i32 281, i32 26, i32 90, i32 154, i32 218, i32 282, i32 27, i32 91, i32 155, i32 219, i32 283, i32 28, i32 92, i32 156, i32 220, i32 284, i32 29, i32 93, i32 157, i32 221, i32 285, i32 30, i32 94, i32 158, i32 222, i32 286, i32 31, i32 95, i32 159, i32 223, i32 287, i32 32, i32 96, i32 160, i32 224, i32 288, i32 33, i32 97, i32 161, i32 225, i32 289, i32 34, i32 98, i32 162, i32 226, i32 290, i32 35, i32 99, i32 163, i32 227, i32 291, i32 36, i32 100, i32 164, i32 228, i32 292, i32 37, i32 101, i32 165, i32 229, i32 293, i32 38, i32 102, i32 166, i32 230, i32 294, i32 39, i32 103, i32 167, i32 231, i32 295, i32 40, i32 104, i32 168, i32 232, i32 296, i32 41, i32 105, i32 169, i32 233, i32 297, i32 42, i32 106, i32 170, i32 234, i32 298, i32 43, i32 107, i32 171, i32 235, i32 299, i32 44, i32 108, i32 172, i32 236, i32 300, i32 45, i32 109, i32 173, i32 237, i32 301, i32 46, i32 110, i32 174, i32 238, i32 302, i32 47, i32 111, i32 175, i32 239, i32 303, i32 48, i32 112, i32 176, i32 240, i32 304, i32 49, i32 113, i32 177, i32 241, i32 305, i32 50, i32 114, i32 178, i32 242, i32 306, i32 51, i32 115, i32 179, i32 243, i32 307, i32 52, i32 116, i32 180, i32 244, i32 308, i32 53, i32 117, i32 181, i32 245, i32 309, i32 54, i32 118, i32 182, i32 246, i32 310, i32 55, i32 119, i32 183, i32 247, i32 311, i32 56, i32 120, i32 184, i32 248, i32 312, i32 57, i32 121, i32 185, i32 249, i32 313, i32 58, i32 122, i32 186, i32 250, i32 314, i32 59, i32 123, i32 187, i32 251, i32 315, i32 60, i32 124, i32 188, i32 252, i32 316, i32 61, i32 125, i32 189, i32 253, i32 317, i32 62, i32 126, i32 190, i32 254, i32 318, i32 63, i32 127, i32 191, i32 255, i32 319>
6045  store <320 x i8> %interleaved.vec, ptr %out.vec, align 64
6046  ret void
6047}
6048