; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx  | FileCheck %s --check-prefixes=AVX
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP

; These patterns are produced by LoopVectorizer for interleaved stores.
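;
; As an illustrative sketch only (the array names are hypothetical and this is
; not part of the autogenerated checks), a scalar loop of roughly this shape,
; storing six i16 streams into one output array, is what the LoopVectorizer
; turns into the interleaved-store shuffles exercised below:
;
;   for (int i = 0; i != n; ++i) {
;     out[6*i+0] = in0[i];
;     out[6*i+1] = in1[i];
;     out[6*i+2] = in2[i];
;     out[6*i+3] = in3[i];
;     out[6*i+4] = in4[i];
;     out[6*i+5] = in5[i];
;   }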

define void @store_i16_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind {
; SSE-LABEL: store_i16_stride6_vf2:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    movdqa (%rdx), %xmm1
; SSE-NEXT:    movdqa (%r8), %xmm2
; SSE-NEXT:    movdqa (%r9), %xmm3
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,5,4,6]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[3,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0,2]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[2,0,3,3,4,5,6,7]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0]
; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,2,0,3,4,5,6,7]
; SSE-NEXT:    movaps %xmm0, (%rax)
; SSE-NEXT:    movq %xmm1, 16(%rax)
; SSE-NEXT:    retq
;
; AVX-LABEL: store_i16_stride6_vf2:
; AVX:       # %bb.0:
; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa (%rdx), %xmm1
; AVX-NEXT:    vmovdqa (%r8), %xmm2
; AVX-NEXT:    vmovdqa (%r9), %xmm3
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,5,7,6,7]
; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,2]
; AVX-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,2,3,6,7]
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5],xmm0[6,7]
; AVX-NEXT:    vmovdqa %xmm0, (%rax)
; AVX-NEXT:    vmovq %xmm1, 16(%rax)
; AVX-NEXT:    retq
;
; AVX2-LABEL: store_i16_stride6_vf2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa (%rdx), %xmm1
; AVX2-NEXT:    vmovdqa (%r8), %xmm2
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1]
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,2,3,6,7,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
; AVX2-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7]
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovq %xmm1, 16(%rax)
; AVX2-NEXT:    vmovdqa %xmm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX2-FP-LABEL: store_i16_stride6_vf2:
; AVX2-FP:       # %bb.0:
; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FP-NEXT:    vmovdqa (%rdx), %xmm1
; AVX2-FP-NEXT:    vmovdqa (%r8), %xmm2
; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1]
; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,4,5,8,9,12,13,8,9,12,13,2,3,6,7,18,19,22,23,2,3,6,7,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT:    vpshufb %ymm1, %ymm0, %ymm2
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-FP-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3],ymm0[4],ymm2[5,6,7]
; AVX2-FP-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-FP-NEXT:    vmovq %xmm1, 16(%rax)
; AVX2-FP-NEXT:    vmovdqa %xmm0, (%rax)
; AVX2-FP-NEXT:    vzeroupper
; AVX2-FP-NEXT:    retq
;
; AVX2-FCP-LABEL: store_i16_stride6_vf2:
; AVX2-FCP:       # %bb.0:
; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FCP-NEXT:    vmovdqa (%rdx), %xmm1
; AVX2-FCP-NEXT:    vmovdqa (%r8), %xmm2
; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1]
; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,4,5,8,9,12,13,8,9,12,13,2,3,6,7,18,19,22,23,2,3,6,7,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT:    vpshufb %ymm1, %ymm0, %ymm2
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-FCP-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3],ymm0[4],ymm2[5,6,7]
; AVX2-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-FCP-NEXT:    vmovq %xmm1, 16(%rax)
; AVX2-FCP-NEXT:    vmovdqa %xmm0, (%rax)
; AVX2-FCP-NEXT:    vzeroupper
; AVX2-FCP-NEXT:    retq
;
; AVX512-LABEL: store_i16_stride6_vf2:
; AVX512:       # %bb.0:
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa (%rdx), %xmm1
; AVX512-NEXT:    vmovdqa (%r8), %xmm2
; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1]
; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,2,3,6,7,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u]
; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7]
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmovq %xmm1, 16(%rax)
; AVX512-NEXT:    vmovdqa %xmm0, (%rax)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512-FCP-LABEL: store_i16_stride6_vf2:
; AVX512-FCP:       # %bb.0:
; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm1
; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm2
; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX512-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1]
; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,4,5,8,9,12,13,8,9,12,13,2,3,6,7,18,19,22,23,2,3,6,7,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm0, %ymm2
; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512-FCP-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3],ymm0[4],ymm2[5,6,7]
; AVX512-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-FCP-NEXT:    vmovq %xmm1, 16(%rax)
; AVX512-FCP-NEXT:    vmovdqa %xmm0, (%rax)
; AVX512-FCP-NEXT:    vzeroupper
; AVX512-FCP-NEXT:    retq
;
; AVX512DQ-LABEL: store_i16_stride6_vf2:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm1
; AVX512DQ-NEXT:    vmovdqa (%r8), %xmm2
; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512DQ-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1]
; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,2,3,6,7,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7]
; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT:    vmovq %xmm1, 16(%rax)
; AVX512DQ-NEXT:    vmovdqa %xmm0, (%rax)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512DQ-FCP-LABEL: store_i16_stride6_vf2:
; AVX512DQ-FCP:       # %bb.0:
; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm1
; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm2
; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512DQ-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1]
; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,4,5,8,9,12,13,8,9,12,13,2,3,6,7,18,19,22,23,2,3,6,7,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm0, %ymm2
; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3],ymm0[4],ymm2[5,6,7]
; AVX512DQ-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-FCP-NEXT:    vmovq %xmm1, 16(%rax)
; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, (%rax)
; AVX512DQ-FCP-NEXT:    vzeroupper
; AVX512DQ-FCP-NEXT:    retq
;
; AVX512BW-LABEL: store_i16_stride6_vf2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vmovdqa (%rdx), %xmm1
; AVX512BW-NEXT:    vmovdqa (%r8), %xmm2
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1]
; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0]
; AVX512BW-NEXT:    vpermw %ymm0, %ymm1, %ymm0
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT:    vmovq %xmm1, 16(%rax)
; AVX512BW-NEXT:    vmovdqa %xmm0, (%rax)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BW-FCP-LABEL: store_i16_stride6_vf2:
; AVX512BW-FCP:       # %bb.0:
; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-FCP-NEXT:    vmovdqa (%rdx), %xmm1
; AVX512BW-FCP-NEXT:    vmovdqa (%r8), %xmm2
; AVX512BW-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512BW-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX512BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BW-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1]
; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0]
; AVX512BW-FCP-NEXT:    vpermw %ymm0, %ymm1, %ymm0
; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BW-FCP-NEXT:    vmovq %xmm1, 16(%rax)
; AVX512BW-FCP-NEXT:    vmovdqa %xmm0, (%rax)
; AVX512BW-FCP-NEXT:    vzeroupper
; AVX512BW-FCP-NEXT:    retq
;
; AVX512DQ-BW-LABEL: store_i16_stride6_vf2:
; AVX512DQ-BW:       # %bb.0:
; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-NEXT:    vmovdqa (%rdx), %xmm1
; AVX512DQ-BW-NEXT:    vmovdqa (%r8), %xmm2
; AVX512DQ-BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512DQ-BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX512DQ-BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512DQ-BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1]
; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0]
; AVX512DQ-BW-NEXT:    vpermw %ymm0, %ymm1, %ymm0
; AVX512DQ-BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-BW-NEXT:    vmovq %xmm1, 16(%rax)
; AVX512DQ-BW-NEXT:    vmovdqa %xmm0, (%rax)
; AVX512DQ-BW-NEXT:    vzeroupper
; AVX512DQ-BW-NEXT:    retq
;
; AVX512DQ-BW-FCP-LABEL: store_i16_stride6_vf2:
; AVX512DQ-BW-FCP:       # %bb.0:
; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdx), %xmm1
; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%r8), %xmm2
; AVX512DQ-BW-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512DQ-BW-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX512DQ-BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512DQ-BW-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1]
; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT:    vpermw %ymm0, %ymm1, %ymm0
; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-BW-FCP-NEXT:    vmovq %xmm1, 16(%rax)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm0, (%rax)
; AVX512DQ-BW-FCP-NEXT:    vzeroupper
; AVX512DQ-BW-FCP-NEXT:    retq
  %in.vec0 = load <2 x i16>, ptr %in.vecptr0, align 64
  %in.vec1 = load <2 x i16>, ptr %in.vecptr1, align 64
  %in.vec2 = load <2 x i16>, ptr %in.vecptr2, align 64
  %in.vec3 = load <2 x i16>, ptr %in.vecptr3, align 64
  %in.vec4 = load <2 x i16>, ptr %in.vecptr4, align 64
  %in.vec5 = load <2 x i16>, ptr %in.vecptr5, align 64
  %1 = shufflevector <2 x i16> %in.vec0, <2 x i16> %in.vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = shufflevector <2 x i16> %in.vec2, <2 x i16> %in.vec3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = shufflevector <2 x i16> %in.vec4, <2 x i16> %in.vec5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = shufflevector <4 x i16> %1, <4 x i16> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %5 = shufflevector <4 x i16> %3, <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %6 = shufflevector <8 x i16> %4, <8 x i16> %5, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %interleaved.vec = shufflevector <12 x i16> %6, <12 x i16> poison, <12 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11>
  store <12 x i16> %interleaved.vec, ptr %out.vec, align 64
  ret void
}

define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind {
; SSE-LABEL: store_i16_stride6_vf4:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
; SSE-NEXT:    movq {{.*#+}} xmm3 = mem[0],zero
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
; SSE-NEXT:    movdqa %xmm1, %xmm5
; SSE-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0]
; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,1,2,0]
; SSE-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,4,6,7]
; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[2,1],xmm0[1,3]
; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,0],xmm6[0,2]
; SSE-NEXT:    movdqa %xmm0, %xmm6
; SSE-NEXT:    punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm1[1]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1]
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm6[0,2]
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[3,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[3,1,1,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7]
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3]
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
; SSE-NEXT:    movaps %xmm3, 16(%rax)
; SSE-NEXT:    movaps %xmm5, (%rax)
; SSE-NEXT:    movaps %xmm0, 32(%rax)
; SSE-NEXT:    retq
;
; AVX-LABEL: store_i16_stride6_vf4:
; AVX:       # %bb.0:
; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm2[0],xmm1[0]
; AVX-NEXT:    vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX-NEXT:    vmovq {{.*#+}} xmm5 = mem[0],zero
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm6 = xmm5[0],xmm4[0]
; AVX-NEXT:    vpshufb {{.*#+}} xmm7 = xmm3[2,3,10,11,u,u,u,u,u,u,u,u,4,5,12,13]
; AVX-NEXT:    vpshufd {{.*#+}} xmm8 = xmm0[0,1,1,3]
; AVX-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7]
; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm8[4,5],xmm7[6,7]
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3],xmm7[4,5,6,7]
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11]
; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7]
; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm6[0,1,2,0]
; AVX-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7]
; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7]
; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm3[3,1,2,3]
; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7]
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm6[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15]
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3,4,5],xmm2[6,7]
; AVX-NEXT:    vmovdqa %xmm0, 32(%rax)
; AVX-NEXT:    vmovaps %ymm1, (%rax)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX2-LABEL: store_i16_stride6_vf4:
; AVX2:       # %bb.0:
; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX2-NEXT:    vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm4[0],xmm3[0]
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,1,8,9,0,1,8,9,u,u,u,u,2,3,10,11,2,3,10,11,u,u,u,u,20,21,28,29,4,5,12,13]
; AVX2-NEXT:    vpshufb %ymm6, %ymm2, %ymm7
; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
; AVX2-NEXT:    vpshufb %ymm6, %ymm2, %ymm2
; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0],ymm2[1],ymm7[2,3,4,5],ymm2[6],ymm7[7]
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-NEXT:    vpbroadcastq %xmm3, %ymm3
; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7]
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm5[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
; AVX2-NEXT:    vmovdqa %xmm0, 32(%rax)
; AVX2-NEXT:    vmovdqa %ymm2, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX2-FP-LABEL: store_i16_stride6_vf4:
; AVX2-FP:       # %bb.0:
; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-FP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-FP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-FP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-FP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm2
; AVX2-FP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX2-FP-NEXT:    vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm4[0],xmm3[0]
; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,1,8,9,0,1,8,9,u,u,u,u,2,3,10,11,2,3,10,11,u,u,u,u,20,21,28,29,4,5,12,13]
; AVX2-FP-NEXT:    vpshufb %ymm6, %ymm2, %ymm7
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
; AVX2-FP-NEXT:    vpshufb %ymm6, %ymm2, %ymm2
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0],ymm2[1],ymm7[2,3,4,5],ymm2[6],ymm7[7]
; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-FP-NEXT:    vpbroadcastq %xmm3, %ymm3
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,4,5,6,7,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,6,7,14,15,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm5[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
; AVX2-FP-NEXT:    vmovdqa %xmm0, 32(%rax)
; AVX2-FP-NEXT:    vmovdqa %ymm2, (%rax)
; AVX2-FP-NEXT:    vzeroupper
; AVX2-FP-NEXT:    retq
;
; AVX2-FCP-LABEL: store_i16_stride6_vf4:
; AVX2-FCP:       # %bb.0:
; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-FCP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm2
; AVX2-FCP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX2-FCP-NEXT:    vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm4[0],xmm3[0]
; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,1,8,9,0,1,4,5,u,u,u,u,2,3,10,11,2,3,10,11,u,u,u,u,24,25,28,29,4,5,12,13]
; AVX2-FCP-NEXT:    vpshufb %ymm6, %ymm2, %ymm7
; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [4,6,1,3,4,6,1,3]
; AVX2-FCP-NEXT:    # ymm8 = mem[0,1,0,1]
; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm8, %ymm2
; AVX2-FCP-NEXT:    vpshufb %ymm6, %ymm2, %ymm2
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0],ymm2[1],ymm7[2,3,4,5],ymm2[6],ymm7[7]
; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-FCP-NEXT:    vpbroadcastq %xmm3, %ymm3
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,4,5,6,7,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,6,7,14,15,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm5[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
; AVX2-FCP-NEXT:    vmovdqa %xmm0, 32(%rax)
; AVX2-FCP-NEXT:    vmovdqa %ymm2, (%rax)
; AVX2-FCP-NEXT:    vzeroupper
; AVX2-FCP-NEXT:    retq
;
; AVX512-LABEL: store_i16_stride6_vf4:
; AVX512:       # %bb.0:
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm3
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,8,9,0,1,8,9,u,u,u,u,2,3,10,11,2,3,10,11,u,u,u,u,20,21,28,29,4,5,12,13]
; AVX512-NEXT:    vpshufb %ymm4, %ymm3, %ymm5
; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
; AVX512-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4,5],ymm3[6],ymm5[7]
; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[0,2,2,3]
; AVX512-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,2,1,3,4,5,6,7]
; AVX512-NEXT:    vpbroadcastq %xmm4, %ymm4
; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7]
; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm2[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15]
; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
; AVX512-NEXT:    vinserti32x4 $2, %xmm0, %zmm3, %zmm1
; AVX512-NEXT:    vmovdqa %xmm0, 32(%rax)
; AVX512-NEXT:    vmovdqa %ymm1, (%rax)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512-FCP-LABEL: store_i16_stride6_vf4:
; AVX512-FCP:       # %bb.0:
; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512-FCP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512-FCP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512-FCP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX512-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm3[0],xmm2[0]
; AVX512-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm5
; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,1,8,9,0,1,4,5,u,u,u,u,2,3,10,11,2,3,10,11,u,u,u,u,24,25,28,29,4,5,12,13]
; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm5, %ymm7
; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [4,6,1,3,4,6,1,3]
; AVX512-FCP-NEXT:    # ymm8 = mem[0,1,0,1]
; AVX512-FCP-NEXT:    vpermd %ymm5, %ymm8, %ymm5
; AVX512-FCP-NEXT:    vpshufb %ymm6, %ymm5, %ymm5
; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3,4,5],ymm5[6],ymm7[7]
; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,1,8,3,4,9,6,7]
; AVX512-FCP-NEXT:    vpermi2d %ymm2, %ymm5, %ymm3
; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,4,5,6,7,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,6,7,14,15,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15]
; AVX512-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
; AVX512-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm3, %zmm1
; AVX512-FCP-NEXT:    vmovdqa %xmm0, 32(%rax)
; AVX512-FCP-NEXT:    vmovdqa %ymm1, (%rax)
; AVX512-FCP-NEXT:    vzeroupper
; AVX512-FCP-NEXT:    retq
;
; AVX512DQ-LABEL: store_i16_stride6_vf4:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm3
; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,8,9,0,1,8,9,u,u,u,u,2,3,10,11,2,3,10,11,u,u,u,u,20,21,28,29,4,5,12,13]
; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm3, %ymm5
; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4,5],ymm3[6],ymm5[7]
; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[0,2,2,3]
; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[0,2,1,3,4,5,6,7]
; AVX512DQ-NEXT:    vpbroadcastq %xmm4, %ymm4
; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7]
; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm1 = xmm2[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15]
; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
; AVX512DQ-NEXT:    vinserti32x4 $2, %xmm0, %zmm3, %zmm1
; AVX512DQ-NEXT:    vmovdqa %xmm0, 32(%rax)
; AVX512DQ-NEXT:    vmovdqa %ymm1, (%rax)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512DQ-FCP-LABEL: store_i16_stride6_vf4:
; AVX512DQ-FCP:       # %bb.0:
; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512DQ-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512DQ-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-FCP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512DQ-FCP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-FCP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm3[0],xmm2[0]
; AVX512DQ-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm5
; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,1,8,9,0,1,4,5,u,u,u,u,2,3,10,11,2,3,10,11,u,u,u,u,24,25,28,29,4,5,12,13]
; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm5, %ymm7
; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [4,6,1,3,4,6,1,3]
; AVX512DQ-FCP-NEXT:    # ymm8 = mem[0,1,0,1]
; AVX512DQ-FCP-NEXT:    vpermd %ymm5, %ymm8, %ymm5
; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm5, %ymm5
; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3,4,5],ymm5[6],ymm7[7]
; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,1,8,3,4,9,6,7]
; AVX512DQ-FCP-NEXT:    vpermi2d %ymm2, %ymm5, %ymm3
; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,4,5,6,7,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,6,7,14,15,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15]
; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
; AVX512DQ-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm3, %zmm1
; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, 32(%rax)
; AVX512DQ-FCP-NEXT:    vmovdqa %ymm1, (%rax)
; AVX512DQ-FCP-NEXT:    vzeroupper
; AVX512DQ-FCP-NEXT:    retq
;
; AVX512BW-LABEL: store_i16_stride6_vf4:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512BW-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512BW-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512BW-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512BW-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512BW-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BW-NEXT:    vinserti32x4 $2, %xmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14,18,22,3,7,11,15,19,23,0,0,0,0,0,0,0,0]
; AVX512BW-NEXT:    vpermw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vextracti32x4 $2, %zmm0, 32(%rax)
; AVX512BW-NEXT:    vmovdqa %ymm0, (%rax)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BW-FCP-LABEL: store_i16_stride6_vf4:
; AVX512BW-FCP:       # %bb.0:
; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512BW-FCP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX512BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm2, %zmm0, %zmm0
; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14,18,22,3,7,11,15,19,23,0,0,0,0,0,0,0,0]
; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm0
; AVX512BW-FCP-NEXT:    vextracti32x4 $2, %zmm0, 32(%rax)
; AVX512BW-FCP-NEXT:    vmovdqa %ymm0, (%rax)
; AVX512BW-FCP-NEXT:    vzeroupper
; AVX512BW-FCP-NEXT:    retq
;
; AVX512DQ-BW-LABEL: store_i16_stride6_vf4:
; AVX512DQ-BW:       # %bb.0:
; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512DQ-BW-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512DQ-BW-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-BW-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-BW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512DQ-BW-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-BW-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX512DQ-BW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512DQ-BW-NEXT:    vinserti32x4 $2, %xmm2, %zmm0, %zmm0
; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14,18,22,3,7,11,15,19,23,0,0,0,0,0,0,0,0]
; AVX512DQ-BW-NEXT:    vpermw %zmm0, %zmm1, %zmm0
; AVX512DQ-BW-NEXT:    vextracti32x4 $2, %zmm0, 32(%rax)
; AVX512DQ-BW-NEXT:    vmovdqa %ymm0, (%rax)
; AVX512DQ-BW-NEXT:    vzeroupper
; AVX512DQ-BW-NEXT:    retq
;
; AVX512DQ-BW-FCP-LABEL: store_i16_stride6_vf4:
; AVX512DQ-BW-FCP:       # %bb.0:
; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm2, %zmm0, %zmm0
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14,18,22,3,7,11,15,19,23,0,0,0,0,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm0
; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $2, %zmm0, 32(%rax)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm0, (%rax)
; AVX512DQ-BW-FCP-NEXT:    vzeroupper
; AVX512DQ-BW-FCP-NEXT:    retq
  %in.vec0 = load <4 x i16>, ptr %in.vecptr0, align 64
  %in.vec1 = load <4 x i16>, ptr %in.vecptr1, align 64
  %in.vec2 = load <4 x i16>, ptr %in.vecptr2, align 64
  %in.vec3 = load <4 x i16>, ptr %in.vecptr3, align 64
  %in.vec4 = load <4 x i16>, ptr %in.vecptr4, align 64
  %in.vec5 = load <4 x i16>, ptr %in.vecptr5, align 64
  %1 = shufflevector <4 x i16> %in.vec0, <4 x i16> %in.vec1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %2 = shufflevector <4 x i16> %in.vec2, <4 x i16> %in.vec3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %3 = shufflevector <4 x i16> %in.vec4, <4 x i16> %in.vec5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %4 = shufflevector <8 x i16> %1, <8 x i16> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %5 = shufflevector <8 x i16> %3, <8 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %6 = shufflevector <16 x i16> %4, <16 x i16> %5, <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  %interleaved.vec = shufflevector <24 x i16> %6, <24 x i16> poison, <24 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 3, i32 7, i32 11, i32 15, i32 19, i32 23>
  store <24 x i16> %interleaved.vec, ptr %out.vec, align 64
  ret void
}

define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind {
; SSE-LABEL: store_i16_stride6_vf8:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    movdqa (%rsi), %xmm8
; SSE-NEXT:    movdqa (%rdx), %xmm1
; SSE-NEXT:    movdqa (%rcx), %xmm9
; SSE-NEXT:    movdqa (%r8), %xmm6
; SSE-NEXT:    movdqa (%r9), %xmm5
; SSE-NEXT:    movdqa %xmm1, %xmm4
; SSE-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
; SSE-NEXT:    movdqa %xmm0, %xmm7
; SSE-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
; SSE-NEXT:    movdqa %xmm7, %xmm10
; SSE-NEXT:    shufps {{.*#+}} xmm10 = xmm10[2,3],xmm4[3,3]
; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,6,5,7,7]
; SSE-NEXT:    shufps {{.*#+}} xmm10 = xmm10[1,2],xmm2[2,3]
; SSE-NEXT:    shufps {{.*#+}} xmm10 = xmm10[2,0,1,3]
; SSE-NEXT:    movaps {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,0]
; SSE-NEXT:    andps %xmm2, %xmm10
; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,6,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm11 = xmm3[2,1,2,3]
; SSE-NEXT:    movaps %xmm2, %xmm3
; SSE-NEXT:    andnps %xmm11, %xmm3
; SSE-NEXT:    orps %xmm10, %xmm3
; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3]
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
; SSE-NEXT:    movdqa %xmm0, %xmm8
; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[2,3],xmm1[3,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm9 = xmm6[2,1,3,3,4,5,6,7]
; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[1,2],xmm9[0,1]
; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[2,0,1,3]
; SSE-NEXT:    andps %xmm2, %xmm8
; SSE-NEXT:    pshuflw {{.*#+}} xmm9 = xmm5[0,2,2,3,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,1,2,1]
; SSE-NEXT:    andnps %xmm9, %xmm2
; SSE-NEXT:    orps %xmm8, %xmm2
; SSE-NEXT:    movdqa %xmm1, %xmm10
; SSE-NEXT:    punpcklqdq {{.*#+}} xmm10 = xmm10[0],xmm0[0]
; SSE-NEXT:    movdqa %xmm6, %xmm8
; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[1,3]
; SSE-NEXT:    shufps {{.*#+}} xmm10 = xmm10[2,0],xmm8[0,2]
; SSE-NEXT:    movaps {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,0,65535,65535]
; SSE-NEXT:    andps %xmm8, %xmm10
; SSE-NEXT:    movdqa %xmm5, %xmm11
; SSE-NEXT:    pslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm11[0,1,2,3,4,5]
; SSE-NEXT:    movaps %xmm8, %xmm9
; SSE-NEXT:    andnps %xmm11, %xmm9
; SSE-NEXT:    orps %xmm10, %xmm9
; SSE-NEXT:    movdqa %xmm7, %xmm10
; SSE-NEXT:    punpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm4[1]
; SSE-NEXT:    movdqa %xmm6, %xmm12
; SSE-NEXT:    psrldq {{.*#+}} xmm12 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; SSE-NEXT:    shufps {{.*#+}} xmm12 = xmm12[1,1],xmm4[1,1]
; SSE-NEXT:    shufps {{.*#+}} xmm12 = xmm12[2,0],xmm10[0,2]
; SSE-NEXT:    movaps {{.*#+}} xmm10 = [65535,65535,65535,0,65535,65535,65535,65535]
; SSE-NEXT:    andps %xmm10, %xmm12
; SSE-NEXT:    pshufd {{.*#+}} xmm13 = xmm5[2,2,3,3]
; SSE-NEXT:    movaps %xmm10, %xmm11
; SSE-NEXT:    andnps %xmm13, %xmm11
; SSE-NEXT:    orps %xmm12, %xmm11
; SSE-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0]
; SSE-NEXT:    pshuflw {{.*#+}} xmm12 = xmm6[1,1,1,1,4,5,6,7]
; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[2,1],xmm7[1,3]
; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[2,0],xmm6[0,2]
; SSE-NEXT:    andps %xmm8, %xmm4
; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,1,1]
; SSE-NEXT:    pslld $16, %xmm5
; SSE-NEXT:    andnps %xmm5, %xmm8
; SSE-NEXT:    orps %xmm4, %xmm8
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT:    shufps {{.*#+}} xmm12 = xmm12[1,1],xmm1[1,1]
; SSE-NEXT:    shufps {{.*#+}} xmm12 = xmm12[2,0],xmm0[0,2]
; SSE-NEXT:    andps %xmm10, %xmm12
; SSE-NEXT:    andnps %xmm6, %xmm10
; SSE-NEXT:    orps %xmm12, %xmm10
; SSE-NEXT:    movaps %xmm10, 16(%rax)
; SSE-NEXT:    movaps %xmm8, 48(%rax)
; SSE-NEXT:    movaps %xmm11, 64(%rax)
; SSE-NEXT:    movaps %xmm9, (%rax)
; SSE-NEXT:    movaps %xmm2, 32(%rax)
; SSE-NEXT:    movaps %xmm3, 80(%rax)
; SSE-NEXT:    retq
;
; AVX-LABEL: store_i16_stride6_vf8:
; AVX:       # %bb.0:
; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa (%rsi), %xmm1
; AVX-NEXT:    vmovdqa (%rdx), %xmm2
; AVX-NEXT:    vmovdqa (%rcx), %xmm3
; AVX-NEXT:    vmovdqa (%r8), %xmm4
; AVX-NEXT:    vmovdqa (%r9), %xmm5
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX-NEXT:    vpshufd {{.*#+}} xmm8 = xmm7[1,1,2,2]
; AVX-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm6[4,5],xmm8[6,7]
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; AVX-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6,7]
; AVX-NEXT:    vpshufd {{.*#+}} xmm10 = xmm7[0,0,1,1]
; AVX-NEXT:    vpshufd {{.*#+}} xmm11 = xmm6[0,1,0,1]
; AVX-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3],xmm11[4,5,6,7]
; AVX-NEXT:    vpshufd {{.*#+}} xmm11 = xmm9[0,1,0,1]
; AVX-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm11[4,5],xmm10[6,7]
; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm10, %ymm8
; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7]
; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[0,1,0,1]
; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5],xmm1[6,7]
; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm7[2,2,3,3]
; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm6[1],xmm4[1]
; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm9[2,3,2,3]
; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3,4,5],xmm5[6,7]
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm4, %ymm1
; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[2,2,3,3]
; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm4 = xmm0[1],xmm4[1]
; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm3[2,3,2,3]
; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3,4,5],xmm5[6,7]
; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,2]
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7]
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5,6,7]
; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX-NEXT:    vmovaps %ymm0, 64(%rax)
; AVX-NEXT:    vmovaps %ymm1, 32(%rax)
; AVX-NEXT:    vmovaps %ymm8, (%rax)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX2-LABEL: store_i16_stride6_vf8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa (%rdx), %xmm1
; AVX2-NEXT:    vmovdqa (%r8), %xmm2
; AVX2-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX2-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX2-NEXT:    vinserti128 $1, (%r9), %ymm2, %ymm2
; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm1[0,2,0,2]
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,8,9,0,1,8,9,4,5,12,13,2,3,10,11,18,19,26,27,24,25,30,31,20,21,28,29,20,21,28,29]
; AVX2-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm0[0,2,0,2]
; AVX2-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm2[0,2,0,2]
; AVX2-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u,u,u]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm1[0,2,1,3]
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,1,8,9,6,7,14,15,6,7,14,15,2,3,10,11,16,17,24,25,16,17,24,25,24,25,26,27,18,19,26,27]
; AVX2-NEXT:    vpshufb %ymm5, %ymm4, %ymm4
; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm0[0,2,1,3]
; AVX2-NEXT:    vpshufb %ymm5, %ymm6, %ymm5
; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm2[0,2,1,3]
; AVX2-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3]
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [2,3,10,11,8,9,10,11,4,5,12,13,4,5,12,13,18,19,26,27,22,23,30,31,22,23,30,31,20,21,28,29]
; AVX2-NEXT:    vpshufb %ymm5, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3]
; AVX2-NEXT:    vpshufb %ymm5, %ymm1, %ymm1
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3]
; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u,u,u,u,u,22,23,30,31]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
; AVX2-NEXT:    vmovdqa %ymm0, 64(%rax)
; AVX2-NEXT:    vmovdqa %ymm4, 32(%rax)
; AVX2-NEXT:    vmovdqa %ymm3, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX2-FP-LABEL: store_i16_stride6_vf8:
; AVX2-FP:       # %bb.0:
; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FP-NEXT:    vmovdqa (%rdx), %xmm1
; AVX2-FP-NEXT:    vmovdqa (%r8), %xmm2
; AVX2-FP-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX2-FP-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX2-FP-NEXT:    vinserti128 $1, (%r9), %ymm2, %ymm2
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm1[0,2,0,2]
; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,8,9,0,1,8,9,4,5,12,13,2,3,10,11,18,19,26,27,24,25,30,31,20,21,28,29,20,21,28,29]
; AVX2-FP-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm5 = ymm0[0,2,0,2]
; AVX2-FP-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm2[0,2,0,2]
; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm1[0,2,1,3]
; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,1,8,9,6,7,14,15,6,7,14,15,2,3,10,11,16,17,24,25,16,17,24,25,24,25,26,27,18,19,26,27]
; AVX2-FP-NEXT:    vpshufb %ymm5, %ymm4, %ymm4
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm0[0,2,1,3]
; AVX2-FP-NEXT:    vpshufb %ymm5, %ymm6, %ymm5
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm5 = ymm2[0,2,1,3]
; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3]
; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm5 = [2,3,10,11,8,9,10,11,4,5,12,13,4,5,12,13,18,19,26,27,22,23,30,31,22,23,30,31,20,21,28,29]
; AVX2-FP-NEXT:    vpshufb %ymm5, %ymm0, %ymm0
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3]
; AVX2-FP-NEXT:    vpshufb %ymm5, %ymm1, %ymm1
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3]
; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u,u,u,u,u,22,23,30,31]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
; AVX2-FP-NEXT:    vmovdqa %ymm0, 64(%rax)
; AVX2-FP-NEXT:    vmovdqa %ymm4, 32(%rax)
; AVX2-FP-NEXT:    vmovdqa %ymm3, (%rax)
; AVX2-FP-NEXT:    vzeroupper
; AVX2-FP-NEXT:    retq
;
; AVX2-FCP-LABEL: store_i16_stride6_vf8:
; AVX2-FCP:       # %bb.0:
; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FCP-NEXT:    vmovdqa (%rdx), %xmm1
; AVX2-FCP-NEXT:    vmovdqa (%r8), %xmm2
; AVX2-FCP-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX2-FCP-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX2-FCP-NEXT:    vinserti128 $1, (%r9), %ymm2, %ymm2
; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,4,1,5,0,4,1,5]
; AVX2-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm3, %ymm3
; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,8,9,0,1,4,5,4,5,12,13,2,3,10,11,18,19,22,23,24,25,30,31,20,21,28,29,24,25,28,29]
; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm0[0,2,0,2]
; AVX2-FCP-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,4,4,0,0,4,4,0]
; AVX2-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm4, %ymm4
; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,12,13,8,9,u,u,u,u,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [2,6,1,5,2,6,1,5]
; AVX2-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm4, %ymm4
; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,1,8,9,6,7,14,15,10,11,14,15,2,3,10,11,16,17,24,25,16,17,20,21,24,25,26,27,18,19,26,27]
; AVX2-FCP-NEXT:    vpshufb %ymm5, %ymm4, %ymm4
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm0[0,2,1,3]
; AVX2-FCP-NEXT:    vpshufb %ymm5, %ymm6, %ymm5
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm2[0,2,1,3]
; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [7,3,3,7,7,3,3,7]
; AVX2-FCP-NEXT:    # ymm5 = mem[0,1,0,1]
; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm5, %ymm0
; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [2,3,10,11,8,9,10,11,8,9,12,13,4,5,12,13,18,19,26,27,22,23,18,19,22,23,30,31,20,21,28,29]
; AVX2-FCP-NEXT:    vpshufb %ymm5, %ymm0, %ymm0
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3]
; AVX2-FCP-NEXT:    vpshufb %ymm5, %ymm1, %ymm1
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3]
; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u,u,u,u,u,22,23,30,31]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
; AVX2-FCP-NEXT:    vmovdqa %ymm0, 64(%rax)
; AVX2-FCP-NEXT:    vmovdqa %ymm4, 32(%rax)
; AVX2-FCP-NEXT:    vmovdqa %ymm3, (%rax)
; AVX2-FCP-NEXT:    vzeroupper
; AVX2-FCP-NEXT:    retq
;
; AVX512-LABEL: store_i16_stride6_vf8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
995; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
996; AVX512-NEXT:    vmovdqa (%rdx), %xmm1
997; AVX512-NEXT:    vmovdqa (%r8), %xmm2
998; AVX512-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
999; AVX512-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
1000; AVX512-NEXT:    vinserti32x4 $1, (%r9), %zmm2, %zmm2
1001; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm1[0,2,1,3]
1002; AVX512-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,8,9,6,7,14,15,6,7,14,15,2,3,10,11,16,17,24,25,16,17,24,25,24,25,26,27,18,19,26,27]
1003; AVX512-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
1004; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm0[0,2,1,3]
1005; AVX512-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
1006; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
1007; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm2[0,2,1,3]
1008; AVX512-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u]
1009; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7]
1010; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm1[0,2,0,2]
1011; AVX512-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,1,8,9,0,1,8,9,4,5,12,13,2,3,10,11,18,19,26,27,24,25,30,31,20,21,28,29,20,21,28,29]
1012; AVX512-NEXT:    vpshufb %ymm5, %ymm4, %ymm4
1013; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm0[0,2,0,2]
1014; AVX512-NEXT:    vpshufb %ymm5, %ymm6, %ymm5
1015; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7]
1016; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm2[0,2,0,2]
1017; AVX512-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u,u,u]
1018; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
1019; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
1020; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3]
1021; AVX512-NEXT:    vmovdqa {{.*#+}} ymm4 = [2,3,10,11,8,9,10,11,4,5,12,13,4,5,12,13,18,19,26,27,22,23,30,31,22,23,30,31,20,21,28,29]
1022; AVX512-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
1023; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3]
1024; AVX512-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
1025; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
1026; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3]
1027; AVX512-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u,u,u,u,u,22,23,30,31]
1028; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
1029; AVX512-NEXT:    vmovdqa %ymm0, 64(%rax)
1030; AVX512-NEXT:    vmovdqa64 %zmm3, (%rax)
1031; AVX512-NEXT:    vzeroupper
1032; AVX512-NEXT:    retq
1033;
1034; AVX512-FCP-LABEL: store_i16_stride6_vf8:
1035; AVX512-FCP:       # %bb.0:
1036; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1037; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm0
1038; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm1
1039; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm2
1040; AVX512-FCP-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
1041; AVX512-FCP-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
1042; AVX512-FCP-NEXT:    vinserti32x4 $1, (%r9), %zmm2, %zmm2
1043; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [2,6,1,5,2,6,1,5]
1044; AVX512-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
1045; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm3, %ymm3
1046; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,8,9,6,7,14,15,10,11,14,15,2,3,10,11,16,17,24,25,16,17,20,21,24,25,26,27,18,19,26,27]
1047; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
1048; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm0[0,2,1,3]
1049; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
1050; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
1051; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm2[0,2,1,3]
1052; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u]
1053; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7]
1054; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,4,1,5,0,4,1,5]
1055; AVX512-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
1056; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm4, %ymm4
1057; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,1,8,9,0,1,4,5,4,5,12,13,2,3,10,11,18,19,22,23,24,25,30,31,20,21,28,29,24,25,28,29]
1058; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm4, %ymm4
1059; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm0[0,2,0,2]
1060; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm6, %ymm5
1061; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7]
1062; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [0,4,4,0,0,4,4,0]
1063; AVX512-FCP-NEXT:    # ymm5 = mem[0,1,0,1]
1064; AVX512-FCP-NEXT:    vpermd %ymm2, %ymm5, %ymm5
1065; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,12,13,8,9,u,u,u,u,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u]
1066; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
1067; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
1068; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [7,3,3,7,7,3,3,7]
1069; AVX512-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
1070; AVX512-FCP-NEXT:    vpermd %ymm0, %ymm4, %ymm0
1071; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [2,3,10,11,8,9,10,11,8,9,12,13,4,5,12,13,18,19,26,27,22,23,18,19,22,23,30,31,20,21,28,29]
1072; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
1073; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3]
1074; AVX512-FCP-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
1075; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
1076; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3]
1077; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u,u,u,u,u,22,23,30,31]
1078; AVX512-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
1079; AVX512-FCP-NEXT:    vmovdqa %ymm0, 64(%rax)
1080; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, (%rax)
1081; AVX512-FCP-NEXT:    vzeroupper
1082; AVX512-FCP-NEXT:    retq
1083;
1084; AVX512DQ-LABEL: store_i16_stride6_vf8:
1085; AVX512DQ:       # %bb.0:
1086; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1087; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
1088; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm1
1089; AVX512DQ-NEXT:    vmovdqa (%r8), %xmm2
1090; AVX512DQ-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
1091; AVX512DQ-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
1092; AVX512DQ-NEXT:    vinserti32x4 $1, (%r9), %zmm2, %zmm2
1093; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm1[0,2,1,3]
1094; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,8,9,6,7,14,15,6,7,14,15,2,3,10,11,16,17,24,25,16,17,24,25,24,25,26,27,18,19,26,27]
1095; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
1096; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm5 = ymm0[0,2,1,3]
1097; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
1098; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
1099; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm2[0,2,1,3]
1100; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u]
1101; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7]
1102; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm1[0,2,0,2]
1103; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,1,8,9,0,1,8,9,4,5,12,13,2,3,10,11,18,19,26,27,24,25,30,31,20,21,28,29,20,21,28,29]
1104; AVX512DQ-NEXT:    vpshufb %ymm5, %ymm4, %ymm4
1105; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm6 = ymm0[0,2,0,2]
1106; AVX512DQ-NEXT:    vpshufb %ymm5, %ymm6, %ymm5
1107; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7]
1108; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm5 = ymm2[0,2,0,2]
1109; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u,u,u]
1110; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
1111; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
1112; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3]
1113; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [2,3,10,11,8,9,10,11,4,5,12,13,4,5,12,13,18,19,26,27,22,23,30,31,22,23,30,31,20,21,28,29]
1114; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
1115; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3]
1116; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
1117; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
1118; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3]
1119; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u,u,u,u,u,22,23,30,31]
1120; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
1121; AVX512DQ-NEXT:    vmovdqa %ymm0, 64(%rax)
1122; AVX512DQ-NEXT:    vmovdqa64 %zmm3, (%rax)
1123; AVX512DQ-NEXT:    vzeroupper
1124; AVX512DQ-NEXT:    retq
1125;
1126; AVX512DQ-FCP-LABEL: store_i16_stride6_vf8:
1127; AVX512DQ-FCP:       # %bb.0:
1128; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1129; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm0
1130; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm1
1131; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm2
1132; AVX512DQ-FCP-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
1133; AVX512DQ-FCP-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
1134; AVX512DQ-FCP-NEXT:    vinserti32x4 $1, (%r9), %zmm2, %zmm2
1135; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [2,6,1,5,2,6,1,5]
1136; AVX512DQ-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
1137; AVX512DQ-FCP-NEXT:    vpermd %ymm1, %ymm3, %ymm3
1138; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,8,9,6,7,14,15,10,11,14,15,2,3,10,11,16,17,24,25,16,17,20,21,24,25,26,27,18,19,26,27]
1139; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
1140; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm0[0,2,1,3]
1141; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
1142; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
1143; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm2[0,2,1,3]
1144; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u]
1145; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7]
1146; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,4,1,5,0,4,1,5]
1147; AVX512DQ-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
1148; AVX512DQ-FCP-NEXT:    vpermd %ymm1, %ymm4, %ymm4
1149; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,1,8,9,0,1,4,5,4,5,12,13,2,3,10,11,18,19,22,23,24,25,30,31,20,21,28,29,24,25,28,29]
1150; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm4, %ymm4
1151; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm0[0,2,0,2]
1152; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm6, %ymm5
1153; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7]
1154; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [0,4,4,0,0,4,4,0]
1155; AVX512DQ-FCP-NEXT:    # ymm5 = mem[0,1,0,1]
1156; AVX512DQ-FCP-NEXT:    vpermd %ymm2, %ymm5, %ymm5
1157; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,12,13,8,9,u,u,u,u,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u]
1158; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
1159; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
1160; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [7,3,3,7,7,3,3,7]
1161; AVX512DQ-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
1162; AVX512DQ-FCP-NEXT:    vpermd %ymm0, %ymm4, %ymm0
1163; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm4 = [2,3,10,11,8,9,10,11,8,9,12,13,4,5,12,13,18,19,26,27,22,23,18,19,22,23,30,31,20,21,28,29]
1164; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
1165; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3]
1166; AVX512DQ-FCP-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
1167; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
1168; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3]
1169; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u,u,u,u,u,22,23,30,31]
1170; AVX512DQ-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
1171; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, 64(%rax)
1172; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, (%rax)
1173; AVX512DQ-FCP-NEXT:    vzeroupper
1174; AVX512DQ-FCP-NEXT:    retq
1175;
1176; AVX512BW-LABEL: store_i16_stride6_vf8:
1177; AVX512BW:       # %bb.0:
1178; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1179; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
1180; AVX512BW-NEXT:    vmovdqa (%rdx), %xmm1
1181; AVX512BW-NEXT:    vmovdqa (%r8), %xmm2
1182; AVX512BW-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
1183; AVX512BW-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
1184; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
1185; AVX512BW-NEXT:    vinserti32x4 $1, (%r9), %zmm2, %zmm1
1186; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [21,29,37,45,6,14,22,30,38,46,7,15,23,31,39,47]
1187; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2
1188; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm3 = [0,8,16,24,32,40,1,9,17,25,33,41,2,10,18,26,34,42,3,11,19,27,35,43,4,12,20,28,36,44,5,13]
1189; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm3
1190; AVX512BW-NEXT:    vmovdqa64 %zmm3, (%rax)
1191; AVX512BW-NEXT:    vmovdqa %ymm2, 64(%rax)
1192; AVX512BW-NEXT:    vzeroupper
1193; AVX512BW-NEXT:    retq
1194;
1195; AVX512BW-FCP-LABEL: store_i16_stride6_vf8:
1196; AVX512BW-FCP:       # %bb.0:
1197; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1198; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
1199; AVX512BW-FCP-NEXT:    vmovdqa (%rdx), %xmm1
1200; AVX512BW-FCP-NEXT:    vmovdqa (%r8), %xmm2
1201; AVX512BW-FCP-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
1202; AVX512BW-FCP-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
1203; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
1204; AVX512BW-FCP-NEXT:    vinserti32x4 $1, (%r9), %zmm2, %zmm1
1205; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [21,29,37,45,6,14,22,30,38,46,7,15,23,31,39,47]
1206; AVX512BW-FCP-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2
1207; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm3 = [0,8,16,24,32,40,1,9,17,25,33,41,2,10,18,26,34,42,3,11,19,27,35,43,4,12,20,28,36,44,5,13]
1208; AVX512BW-FCP-NEXT:    vpermi2w %zmm1, %zmm0, %zmm3
1209; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, (%rax)
1210; AVX512BW-FCP-NEXT:    vmovdqa %ymm2, 64(%rax)
1211; AVX512BW-FCP-NEXT:    vzeroupper
1212; AVX512BW-FCP-NEXT:    retq
1213;
1214; AVX512DQ-BW-LABEL: store_i16_stride6_vf8:
1215; AVX512DQ-BW:       # %bb.0:
1216; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1217; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %xmm0
1218; AVX512DQ-BW-NEXT:    vmovdqa (%rdx), %xmm1
1219; AVX512DQ-BW-NEXT:    vmovdqa (%r8), %xmm2
1220; AVX512DQ-BW-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
1221; AVX512DQ-BW-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
1222; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
1223; AVX512DQ-BW-NEXT:    vinserti32x4 $1, (%r9), %zmm2, %zmm1
1224; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [21,29,37,45,6,14,22,30,38,46,7,15,23,31,39,47]
1225; AVX512DQ-BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2
1226; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm3 = [0,8,16,24,32,40,1,9,17,25,33,41,2,10,18,26,34,42,3,11,19,27,35,43,4,12,20,28,36,44,5,13]
1227; AVX512DQ-BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm3
1228; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, (%rax)
1229; AVX512DQ-BW-NEXT:    vmovdqa %ymm2, 64(%rax)
1230; AVX512DQ-BW-NEXT:    vzeroupper
1231; AVX512DQ-BW-NEXT:    retq
1232;
1233; AVX512DQ-BW-FCP-LABEL: store_i16_stride6_vf8:
1234; AVX512DQ-BW-FCP:       # %bb.0:
1235; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1236; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
1237; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdx), %xmm1
1238; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%r8), %xmm2
1239; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
1240; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
1241; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
1242; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, (%r9), %zmm2, %zmm1
1243; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [21,29,37,45,6,14,22,30,38,46,7,15,23,31,39,47]
1244; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2
1245; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm3 = [0,8,16,24,32,40,1,9,17,25,33,41,2,10,18,26,34,42,3,11,19,27,35,43,4,12,20,28,36,44,5,13]
1246; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm1, %zmm0, %zmm3
1247; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, (%rax)
1248; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm2, 64(%rax)
1249; AVX512DQ-BW-FCP-NEXT:    vzeroupper
1250; AVX512DQ-BW-FCP-NEXT:    retq
1251  %in.vec0 = load <8 x i16>, ptr %in.vecptr0, align 64
1252  %in.vec1 = load <8 x i16>, ptr %in.vecptr1, align 64
1253  %in.vec2 = load <8 x i16>, ptr %in.vecptr2, align 64
1254  %in.vec3 = load <8 x i16>, ptr %in.vecptr3, align 64
1255  %in.vec4 = load <8 x i16>, ptr %in.vecptr4, align 64
1256  %in.vec5 = load <8 x i16>, ptr %in.vecptr5, align 64
1257  %1 = shufflevector <8 x i16> %in.vec0, <8 x i16> %in.vec1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1258  %2 = shufflevector <8 x i16> %in.vec2, <8 x i16> %in.vec3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1259  %3 = shufflevector <8 x i16> %in.vec4, <8 x i16> %in.vec5, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1260  %4 = shufflevector <16 x i16> %1, <16 x i16> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1261  %5 = shufflevector <16 x i16> %3, <16 x i16> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1262  %6 = shufflevector <32 x i16> %4, <32 x i16> %5, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
1263  %interleaved.vec = shufflevector <48 x i16> %6, <48 x i16> poison, <48 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 7, i32 15, i32 23, i32 31, i32 39, i32 47>
1264  store <48 x i16> %interleaved.vec, ptr %out.vec, align 64
1265  ret void
1266}
1267
1268define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind {
1269; SSE-LABEL: store_i16_stride6_vf16:
1270; SSE:       # %bb.0:
1271; SSE-NEXT:    subq $24, %rsp
1272; SSE-NEXT:    movdqa (%rdi), %xmm15
1273; SSE-NEXT:    movdqa 16(%rdi), %xmm11
1274; SSE-NEXT:    movdqa (%rsi), %xmm10
1275; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1276; SSE-NEXT:    movdqa 16(%rsi), %xmm4
1277; SSE-NEXT:    movdqa (%rdx), %xmm14
1278; SSE-NEXT:    movdqa 16(%rdx), %xmm12
1279; SSE-NEXT:    movdqa (%rcx), %xmm3
1280; SSE-NEXT:    movdqa 16(%rcx), %xmm2
1281; SSE-NEXT:    movdqa 16(%r8), %xmm0
1282; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1283; SSE-NEXT:    movdqa 16(%r9), %xmm8
1284; SSE-NEXT:    movdqa %xmm12, %xmm6
1285; SSE-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
1286; SSE-NEXT:    movdqa %xmm11, %xmm5
1287; SSE-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
1288; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1289; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,3],xmm6[3,3]
1290; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,3,4,5,6,7]
1291; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[1,2],xmm0[0,1]
1292; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,0,1,3]
1293; SSE-NEXT:    movaps {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,65535,0]
1294; SSE-NEXT:    andps %xmm7, %xmm5
1295; SSE-NEXT:    pshuflw {{.*#+}} xmm9 = xmm8[0,2,2,3,4,5,6,7]
1296; SSE-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,1,2,1]
1297; SSE-NEXT:    movaps %xmm7, %xmm0
1298; SSE-NEXT:    andnps %xmm9, %xmm0
1299; SSE-NEXT:    orps %xmm5, %xmm0
1300; SSE-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
1301; SSE-NEXT:    movdqa %xmm14, %xmm5
1302; SSE-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
1303; SSE-NEXT:    movdqa %xmm15, %xmm13
1304; SSE-NEXT:    punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7]
1305; SSE-NEXT:    movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1306; SSE-NEXT:    shufps {{.*#+}} xmm13 = xmm13[2,3],xmm5[3,3]
1307; SSE-NEXT:    movdqa (%r8), %xmm10
1308; SSE-NEXT:    pshufhw {{.*#+}} xmm9 = xmm10[0,1,2,3,6,5,7,7]
1309; SSE-NEXT:    shufps {{.*#+}} xmm13 = xmm13[1,2],xmm9[2,3]
1310; SSE-NEXT:    movdqa (%r9), %xmm9
1311; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,4,6,6,7]
1312; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
1313; SSE-NEXT:    movaps %xmm7, %xmm0
1314; SSE-NEXT:    andnps %xmm1, %xmm0
1315; SSE-NEXT:    shufps {{.*#+}} xmm13 = xmm13[2,0,1,3]
1316; SSE-NEXT:    andps %xmm7, %xmm13
1317; SSE-NEXT:    orps %xmm13, %xmm0
1318; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1319; SSE-NEXT:    punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm2[4],xmm12[5],xmm2[5],xmm12[6],xmm2[6],xmm12[7],xmm2[7]
1320; SSE-NEXT:    punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
1321; SSE-NEXT:    movdqa %xmm11, %xmm1
1322; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,3],xmm12[3,3]
1323; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
1324; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm13[0,1,2,3,6,5,7,7]
1325; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,2],xmm4[2,3]
1326; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1327; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,4,6,6,7]
1328; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
1329; SSE-NEXT:    movaps %xmm7, %xmm0
1330; SSE-NEXT:    andnps %xmm4, %xmm0
1331; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0,1,3]
1332; SSE-NEXT:    andps %xmm7, %xmm1
1333; SSE-NEXT:    orps %xmm1, %xmm0
1334; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1335; SSE-NEXT:    punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1],xmm14[2],xmm3[2],xmm14[3],xmm3[3]
1336; SSE-NEXT:    punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
1337; SSE-NEXT:    # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1],xmm15[2],mem[2],xmm15[3],mem[3]
1338; SSE-NEXT:    movdqa %xmm15, %xmm1
1339; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,3],xmm14[3,3]
1340; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm10[2,1,3,3,4,5,6,7]
1341; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,2],xmm3[0,1]
1342; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0,1,3]
1343; SSE-NEXT:    andps %xmm7, %xmm1
1344; SSE-NEXT:    movdqa %xmm9, %xmm2
1345; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1346; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm9[0,2,2,3,4,5,6,7]
1347; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
1348; SSE-NEXT:    andnps %xmm3, %xmm7
1349; SSE-NEXT:    orps %xmm1, %xmm7
1350; SSE-NEXT:    movdqa %xmm12, %xmm3
1351; SSE-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm11[0]
1352; SSE-NEXT:    movdqa %xmm13, %xmm1
1353; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,1],xmm11[1,3]
1354; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[0,2]
1355; SSE-NEXT:    pslld $16, %xmm8
1356; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,65535,65535]
1357; SSE-NEXT:    movdqa %xmm1, %xmm13
1358; SSE-NEXT:    pandn %xmm8, %xmm13
1359; SSE-NEXT:    andps %xmm1, %xmm3
1360; SSE-NEXT:    por %xmm3, %xmm13
1361; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
1362; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1]
1363; SSE-NEXT:    movdqa %xmm10, %xmm4
1364; SSE-NEXT:    psrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
1365; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm5[1,1]
1366; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[0,2]
1367; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,65535,65535,65535,65535]
1368; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3]
1369; SSE-NEXT:    movdqa %xmm3, %xmm9
1370; SSE-NEXT:    pandn %xmm0, %xmm9
1371; SSE-NEXT:    andps %xmm3, %xmm4
1372; SSE-NEXT:    por %xmm4, %xmm9
1373; SSE-NEXT:    movdqa %xmm15, %xmm0
1374; SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm14[1]
1375; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm10[1,1,1,1,4,5,6,7]
1376; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm14[1,1]
1377; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[0,2]
1378; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,0,1,1]
1379; SSE-NEXT:    movdqa %xmm3, %xmm8
1380; SSE-NEXT:    pandn %xmm0, %xmm8
1381; SSE-NEXT:    andps %xmm3, %xmm4
1382; SSE-NEXT:    por %xmm4, %xmm8
1383; SSE-NEXT:    punpcklqdq {{.*#+}} xmm14 = xmm14[0],xmm15[0]
1384; SSE-NEXT:    movdqa %xmm10, %xmm0
1385; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm15[1,3]
1386; SSE-NEXT:    shufps {{.*#+}} xmm14 = xmm14[2,0],xmm0[0,2]
1387; SSE-NEXT:    movdqa %xmm2, %xmm0
1388; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
1389; SSE-NEXT:    movdqa %xmm1, %xmm15
1390; SSE-NEXT:    pandn %xmm0, %xmm15
1391; SSE-NEXT:    andps %xmm1, %xmm14
1392; SSE-NEXT:    por %xmm14, %xmm15
1393; SSE-NEXT:    unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm12[1]
1394; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
1395; SSE-NEXT:    movdqa %xmm14, %xmm0
1396; SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
1397; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm12[1,1]
1398; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[0,2]
1399; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
1400; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm12[2,2,3,3]
1401; SSE-NEXT:    movdqa %xmm3, %xmm11
1402; SSE-NEXT:    pandn %xmm4, %xmm11
1403; SSE-NEXT:    andps %xmm3, %xmm0
1404; SSE-NEXT:    por %xmm0, %xmm11
1405; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1406; SSE-NEXT:    movaps %xmm2, %xmm0
1407; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1]
1408; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm14[1,1,1,1,4,5,6,7]
1409; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm6[1,1]
1410; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[0,2]
1411; SSE-NEXT:    andps %xmm3, %xmm4
1412; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm12[0,0,1,1]
1413; SSE-NEXT:    pandn %xmm0, %xmm3
1414; SSE-NEXT:    por %xmm4, %xmm3
1415; SSE-NEXT:    movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0]
1416; SSE-NEXT:    movdqa %xmm14, %xmm0
1417; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[1,3]
1418; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[2,0],xmm0[0,2]
1419; SSE-NEXT:    movdqa %xmm12, %xmm4
1420; SSE-NEXT:    pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5]
1421; SSE-NEXT:    movdqa %xmm1, %xmm0
1422; SSE-NEXT:    pandn %xmm4, %xmm0
1423; SSE-NEXT:    andps %xmm1, %xmm6
1424; SSE-NEXT:    por %xmm6, %xmm0
1425; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1426; SSE-NEXT:    movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0]
1427; SSE-NEXT:    shufps {{.*#+}} xmm10 = xmm10[2,1],xmm2[1,3]
1428; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,0],xmm10[0,2]
1429; SSE-NEXT:    andps %xmm1, %xmm5
1430; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1431; SSE-NEXT:    pslld $16, %xmm2
1432; SSE-NEXT:    pandn %xmm2, %xmm1
1433; SSE-NEXT:    por %xmm5, %xmm1
1434; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1435; SSE-NEXT:    movdqa %xmm1, 48(%rax)
1436; SSE-NEXT:    movdqa %xmm0, 96(%rax)
1437; SSE-NEXT:    movdqa %xmm3, 112(%rax)
1438; SSE-NEXT:    movdqa %xmm11, 160(%rax)
1439; SSE-NEXT:    movdqa %xmm15, (%rax)
1440; SSE-NEXT:    movdqa %xmm8, 16(%rax)
1441; SSE-NEXT:    movdqa %xmm9, 64(%rax)
1442; SSE-NEXT:    movdqa %xmm13, 144(%rax)
1443; SSE-NEXT:    movaps %xmm7, 32(%rax)
1444; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1445; SSE-NEXT:    movaps %xmm0, 176(%rax)
1446; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1447; SSE-NEXT:    movaps %xmm0, 80(%rax)
1448; SSE-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
1449; SSE-NEXT:    movaps %xmm0, 128(%rax)
1450; SSE-NEXT:    addq $24, %rsp
1451; SSE-NEXT:    retq
1452;
1453; AVX-LABEL: store_i16_stride6_vf16:
1454; AVX:       # %bb.0:
1455; AVX-NEXT:    vmovdqa (%rcx), %xmm3
1456; AVX-NEXT:    vmovdqa 16(%rcx), %xmm0
1457; AVX-NEXT:    vmovdqa (%rdx), %xmm4
1458; AVX-NEXT:    vmovdqa 16(%rdx), %xmm1
1459; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1460; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm8[2,2,3,3]
1461; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1462; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
1463; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
1464; AVX-NEXT:    vmovdqa (%rsi), %xmm5
1465; AVX-NEXT:    vmovdqa 16(%rsi), %xmm2
1466; AVX-NEXT:    vmovdqa (%rdi), %xmm6
1467; AVX-NEXT:    vmovdqa 16(%rdi), %xmm7
1468; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3]
1469; AVX-NEXT:    vpshufd {{.*#+}} xmm10 = xmm9[2,3,2,3]
1470; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7]
1471; AVX-NEXT:    vpshufd {{.*#+}} xmm7 = xmm2[0,1,0,1]
1472; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm10, %ymm7
1473; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm7[0,1],ymm1[2],ymm7[3,4],ymm1[5],ymm7[6,7]
1474; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm7
1475; AVX-NEXT:    vmovdqa 16(%r8), %xmm10
1476; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm10[4,5],xmm7[6,7]
1477; AVX-NEXT:    vmovdqa 16(%r9), %xmm11
1478; AVX-NEXT:    vpslld $16, %xmm11, %xmm12
1479; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm12[5],xmm7[6,7]
1480; AVX-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1481; AVX-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm10[2,1,3,3,4,5,6,7]
1482; AVX-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1]
1483; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm7[0],xmm1[1,2],xmm7[3]
1484; AVX-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm11[0,2,2,3,4,5,6,7]
1485; AVX-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1]
1486; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm7[1],xmm1[2,3,4,5,6],xmm7[7]
1487; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1488; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
1489; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
1490; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,2]
1491; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
1492; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
1493; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
1494; AVX-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
1495; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3]
1496; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm11[2,2,3,3]
1497; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5,6,7]
1498; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1499; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
1500; AVX-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm10[0,1,2,3,6,5,7,7]
1501; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
1502; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
1503; AVX-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,6,6,7]
1504; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
1505; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6],xmm1[7]
1506; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm13 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
1507; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
1508; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[1,1,2,2]
1509; AVX-NEXT:    vpshufd {{.*#+}} xmm12 = xmm1[2,2,3,3]
1510; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm0
1511; AVX-NEXT:    vpshufd {{.*#+}} xmm12 = xmm13[2,3,2,3]
1512; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm13, %ymm12
1513; AVX-NEXT:    vblendps {{.*#+}} ymm15 = ymm0[0,1],ymm12[2],ymm0[3,4],ymm12[5],ymm0[6,7]
1514; AVX-NEXT:    vmovdqa (%r8), %xmm14
1515; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,6,5,7,7]
1516; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
1517; AVX-NEXT:    vextractf128 $1, %ymm15, %xmm12
1518; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm0[0,1],xmm12[2,3,4,5],xmm0[6,7]
1519; AVX-NEXT:    vmovdqa (%r9), %xmm0
1520; AVX-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7]
1521; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
1522; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0],xmm2[1],xmm12[2,3,4,5,6],xmm2[7]
1523; AVX-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
1524; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm15[0],xmm2[1],xmm15[2,3]
1525; AVX-NEXT:    vpshufd {{.*#+}} xmm15 = xmm0[2,2,3,3]
1526; AVX-NEXT:    vpblendw {{.*#+}} xmm15 = xmm2[0,1,2],xmm15[3],xmm2[4,5,6,7]
1527; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm8[0,0,1,1]
1528; AVX-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2]
1529; AVX-NEXT:    vinsertf128 $1, %xmm8, %ymm2, %ymm2
1530; AVX-NEXT:    vpshufd {{.*#+}} xmm8 = xmm9[0,1,0,1]
1531; AVX-NEXT:    vinsertf128 $1, %xmm9, %ymm8, %ymm8
1532; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm8[0],ymm2[1],ymm8[2,3],ymm2[4],ymm8[5,6],ymm2[7]
1533; AVX-NEXT:    vinsertps {{.*#+}} xmm8 = xmm2[0,1],xmm10[0],xmm2[3]
1534; AVX-NEXT:    vpslldq {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm11[0,1,2,3,4,5]
1535; AVX-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm9[5],xmm8[6,7]
1536; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm2
1537; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm9 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero
1538; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm9[2,3],xmm2[4,5,6,7]
1539; AVX-NEXT:    vpshufd {{.*#+}} xmm9 = xmm11[0,0,1,1]
1540; AVX-NEXT:    vpblendw {{.*#+}} xmm9 = xmm2[0,1,2],xmm9[3],xmm2[4,5,6,7]
1541; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
1542; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
1543; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[1,1,2,2]
1544; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
1545; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
1546; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[0,1,0,1]
1547; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm5, %ymm5
1548; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7]
1549; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm5
1550; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm6 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero
1551; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5,6,7]
1552; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[0,0,1,1]
1553; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3],xmm5[4,5,6,7]
1554; AVX-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm14[0],xmm3[3]
1555; AVX-NEXT:    vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
1556; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm6[5],xmm3[6,7]
1557; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
1558; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
1559; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
1560; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm4[2,3,2,3]
1561; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm13[0,1,0,1]
1562; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
1563; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
1564; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm14[2,1,3,3,4,5,6,7]
1565; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
1566; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm1[1,2],xmm2[3]
1567; AVX-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
1568; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
1569; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3,4,5,6],xmm4[7]
1570; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
1571; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm14[4,5],xmm1[6,7]
1572; AVX-NEXT:    vpslld $16, %xmm0, %xmm0
1573; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7]
1574; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1575; AVX-NEXT:    vmovdqa %xmm0, 48(%rax)
1576; AVX-NEXT:    vmovdqa %xmm2, 32(%rax)
1577; AVX-NEXT:    vmovdqa %xmm3, (%rax)
1578; AVX-NEXT:    vmovdqa %xmm5, 16(%rax)
1579; AVX-NEXT:    vmovdqa %xmm9, 112(%rax)
1580; AVX-NEXT:    vmovdqa %xmm8, 96(%rax)
1581; AVX-NEXT:    vmovdqa %xmm15, 64(%rax)
1582; AVX-NEXT:    vmovdqa %xmm12, 80(%rax)
1583; AVX-NEXT:    vmovdqa %xmm7, 176(%rax)
1584; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1585; AVX-NEXT:    vmovaps %xmm0, 160(%rax)
1586; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1587; AVX-NEXT:    vmovaps %xmm0, 128(%rax)
1588; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1589; AVX-NEXT:    vmovaps %xmm0, 144(%rax)
1590; AVX-NEXT:    vzeroupper
1591; AVX-NEXT:    retq
1592;
1593; AVX2-LABEL: store_i16_stride6_vf16:
1594; AVX2:       # %bb.0:
1595; AVX2-NEXT:    vmovdqa (%rdi), %ymm1
1596; AVX2-NEXT:    vmovdqa (%rsi), %ymm3
1597; AVX2-NEXT:    vmovdqa (%rdx), %ymm2
1598; AVX2-NEXT:    vmovdqa (%rcx), %ymm4
1599; AVX2-NEXT:    vmovdqa (%r8), %ymm13
1600; AVX2-NEXT:    vmovdqa (%rcx), %xmm6
1601; AVX2-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
1602; AVX2-NEXT:    vmovdqa (%rdx), %xmm7
1603; AVX2-NEXT:    vpsrldq {{.*#+}} xmm8 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
1604; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
1605; AVX2-NEXT:    vpbroadcastq %xmm5, %ymm5
1606; AVX2-NEXT:    vmovdqa (%rsi), %xmm8
1607; AVX2-NEXT:    vpshufd {{.*#+}} xmm9 = xmm8[0,1,2,1]
1608; AVX2-NEXT:    vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,4,7,6,5]
1609; AVX2-NEXT:    vmovdqa (%rdi), %xmm9
1610; AVX2-NEXT:    vpshufd {{.*#+}} xmm11 = xmm9[0,1,2,1]
1611; AVX2-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,5]
1612; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
1613; AVX2-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1]
1614; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm10[0,1],ymm5[2],ymm10[3,4],ymm5[5],ymm10[6,7]
1615; AVX2-NEXT:    vmovdqa (%r8), %xmm10
1616; AVX2-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm10[2,1,3,3,4,5,6,7]
1617; AVX2-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1]
1618; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm11[0],ymm5[1,2],ymm11[3],ymm5[4,5],ymm11[6],ymm5[7]
1619; AVX2-NEXT:    vmovdqa (%r9), %xmm11
1620; AVX2-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm11[0,2,2,3,4,5,6,7]
1621; AVX2-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4]
1622; AVX2-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1]
1623; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
1624; AVX2-NEXT:    vpblendvb %ymm0, %ymm5, %ymm12, %ymm5
1625; AVX2-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1626; AVX2-NEXT:    vpsrldq {{.*#+}} ymm12 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
1627; AVX2-NEXT:    vpsrldq {{.*#+}} ymm14 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
1628; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm14 = ymm14[0],ymm12[0],ymm14[1],ymm12[1],ymm14[2],ymm12[2],ymm14[3],ymm12[3],ymm14[8],ymm12[8],ymm14[9],ymm12[9],ymm14[10],ymm12[10],ymm14[11],ymm12[11]
1629; AVX2-NEXT:    vpshufd {{.*#+}} ymm12 = ymm3[2,1,2,3,6,5,6,7]
1630; AVX2-NEXT:    vpshuflw {{.*#+}} ymm12 = ymm12[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
1631; AVX2-NEXT:    vpshufd {{.*#+}} ymm15 = ymm1[2,1,2,3,6,5,6,7]
1632; AVX2-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm15[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
1633; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm15 = ymm15[0],ymm12[0],ymm15[1],ymm12[1],ymm15[2],ymm12[2],ymm15[3],ymm12[3],ymm15[8],ymm12[8],ymm15[9],ymm12[9],ymm15[10],ymm12[10],ymm15[11],ymm12[11]
1634; AVX2-NEXT:    vmovdqa (%r9), %ymm12
1635; AVX2-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2]
1636; AVX2-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3]
1637; AVX2-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7]
1638; AVX2-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm13[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
1639; AVX2-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3]
1640; AVX2-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7]
1641; AVX2-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm12[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
1642; AVX2-NEXT:    vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
1643; AVX2-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3]
1644; AVX2-NEXT:    vpblendvb %ymm0, %ymm14, %ymm15, %ymm0
1645; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1646; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm14 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
1647; AVX2-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[1,1,1,1]
1648; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
1649; AVX2-NEXT:    vpshufd {{.*#+}} xmm15 = xmm15[1,2,3,3]
1650; AVX2-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,0,2,1]
1651; AVX2-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7]
1652; AVX2-NEXT:    vpshufb {{.*#+}} xmm15 = xmm10[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
1653; AVX2-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1]
1654; AVX2-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6],ymm15[7]
1655; AVX2-NEXT:    vpshufd {{.*#+}} xmm15 = xmm11[2,3,2,3]
1656; AVX2-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm15[0,2,2,1,4,5,6,7]
1657; AVX2-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1]
1658; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
1659; AVX2-NEXT:    vpblendvb %ymm0, %ymm14, %ymm15, %ymm14
1660; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm15 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15]
1661; AVX2-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[3,3,3,3]
1662; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15]
1663; AVX2-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[1,2,3,3,5,6,7,7]
1664; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
1665; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm15[2],ymm5[3,4],ymm15[5],ymm5[6,7]
1666; AVX2-NEXT:    vpshufb {{.*#+}} ymm15 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
1667; AVX2-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3]
1668; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2,3],ymm15[4],ymm5[5,6],ymm15[7]
1669; AVX2-NEXT:    vpshufd {{.*#+}} ymm15 = ymm12[2,3,2,3,6,7,6,7]
1670; AVX2-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm15[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
1671; AVX2-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3]
1672; AVX2-NEXT:    vpblendvb %ymm0, %ymm5, %ymm15, %ymm0
1673; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
1674; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
1675; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1]
1676; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[1,0,2,2]
1677; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1]
1678; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7]
1679; AVX2-NEXT:    vpmovzxwd {{.*#+}} xmm6 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero
1680; AVX2-NEXT:    vpbroadcastq %xmm6, %ymm6
1681; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7]
1682; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm11[0,0,2,1,4,5,6,7]
1683; AVX2-NEXT:    vpbroadcastq %xmm6, %ymm6
1684; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
1685; AVX2-NEXT:    vpblendvb %ymm7, %ymm5, %ymm6, %ymm5
1686; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11]
1687; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11]
1688; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
1689; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[1,0,2,2,5,4,6,6]
1690; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3]
1691; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
1692; AVX2-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm13[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
1693; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
1694; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
1695; AVX2-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm12[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
1696; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
1697; AVX2-NEXT:    vpblendvb %ymm7, %ymm1, %ymm2, %ymm1
1698; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1699; AVX2-NEXT:    vmovdqa %ymm1, 96(%rax)
1700; AVX2-NEXT:    vmovdqa %ymm0, 160(%rax)
1701; AVX2-NEXT:    vmovdqa %ymm14, 64(%rax)
1702; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1703; AVX2-NEXT:    vmovaps %ymm0, 128(%rax)
1704; AVX2-NEXT:    vmovdqa %ymm5, (%rax)
1705; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1706; AVX2-NEXT:    vmovaps %ymm0, 32(%rax)
1707; AVX2-NEXT:    vzeroupper
1708; AVX2-NEXT:    retq
1709;
1710; AVX2-FP-LABEL: store_i16_stride6_vf16:
1711; AVX2-FP:       # %bb.0:
1712; AVX2-FP-NEXT:    subq $24, %rsp
1713; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm0
1714; AVX2-FP-NEXT:    vmovdqa (%rsi), %ymm10
1715; AVX2-FP-NEXT:    vmovdqa (%rdx), %ymm1
1716; AVX2-FP-NEXT:    vmovdqa (%rcx), %ymm2
1717; AVX2-FP-NEXT:    vmovdqa (%r8), %ymm8
1718; AVX2-FP-NEXT:    vmovaps (%r9), %ymm3
1719; AVX2-FP-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1720; AVX2-FP-NEXT:    vmovdqa (%rsi), %xmm7
1721; AVX2-FP-NEXT:    vmovdqa (%rdi), %xmm6
1722; AVX2-FP-NEXT:    vmovdqa (%rcx), %xmm9
1723; AVX2-FP-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1724; AVX2-FP-NEXT:    vmovdqa (%rdx), %xmm4
1725; AVX2-FP-NEXT:    vmovdqa (%r8), %xmm5
1726; AVX2-FP-NEXT:    vmovdqa (%r9), %xmm3
1727; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
1728; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[1,1,1,1]
1729; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm13 = xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
1730; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[1,2,3,3]
1731; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1]
1732; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7]
1733; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm5[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
1734; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1]
1735; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6],ymm13[7]
1736; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm3[8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
1737; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1]
1738; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
1739; AVX2-FP-NEXT:    vpblendvb %ymm14, %ymm12, %ymm13, %ymm9
1740; AVX2-FP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1741; AVX2-FP-NEXT:    vmovdqa %ymm0, %ymm9
1742; AVX2-FP-NEXT:    vmovdqa %ymm10, %ymm11
1743; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm13 = ymm0[4],ymm10[4],ymm0[5],ymm10[5],ymm0[6],ymm10[6],ymm0[7],ymm10[7],ymm0[12],ymm10[12],ymm0[13],ymm10[13],ymm0[14],ymm10[14],ymm0[15],ymm10[15]
1744; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[3,3,3,3]
1745; AVX2-FP-NEXT:    vmovdqa %ymm1, %ymm0
1746; AVX2-FP-NEXT:    vmovdqa %ymm2, %ymm10
1747; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm15 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
1748; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm15[1,2,3,3,5,6,7,7]
1749; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3]
1750; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3,4],ymm13[5],ymm15[6,7]
1751; AVX2-FP-NEXT:    vmovdqa %ymm8, %ymm1
1752; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
1753; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3]
1754; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2,3],ymm15[4],ymm13[5,6],ymm15[7]
1755; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
1756; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm8[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31]
1757; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3]
1758; AVX2-FP-NEXT:    vpblendvb %ymm14, %ymm13, %ymm15, %ymm2
1759; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1760; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
1761; AVX2-FP-NEXT:    vmovdqa %xmm6, %xmm13
1762; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1]
1763; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1764; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
1765; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm15 = xmm15[1,0,2,2]
1766; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1]
1767; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6],ymm15[7]
1768; AVX2-FP-NEXT:    vpmovzxwd {{.*#+}} xmm15 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
1769; AVX2-FP-NEXT:    vpbroadcastq %xmm15, %ymm15
1770; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7]
1771; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm3[0,0,2,1,4,5,6,7]
1772; AVX2-FP-NEXT:    vpbroadcastq %xmm15, %ymm15
1773; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
1774; AVX2-FP-NEXT:    vpblendvb %ymm6, %ymm14, %ymm15, %ymm12
1775; AVX2-FP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1776; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm15 = ymm9[0],ymm11[0],ymm9[1],ymm11[1],ymm9[2],ymm11[2],ymm9[3],ymm11[3],ymm9[8],ymm11[8],ymm9[9],ymm11[9],ymm9[10],ymm11[10],ymm9[11],ymm11[11]
1777; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3]
1778; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm12 = ymm0[0],ymm10[0],ymm0[1],ymm10[1],ymm0[2],ymm10[2],ymm0[3],ymm10[3],ymm0[8],ymm10[8],ymm0[9],ymm10[9],ymm0[10],ymm10[10],ymm0[11],ymm10[11]
1779; AVX2-FP-NEXT:    vmovdqa %ymm0, %ymm14
1780; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm12[1,0,2,2,5,4,6,6]
1781; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3]
1782; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm15[0],ymm12[1],ymm15[2,3],ymm12[4],ymm15[5,6],ymm12[7]
1783; AVX2-FP-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
1784; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2]
1785; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7]
1786; AVX2-FP-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm8[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
1787; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2]
1788; AVX2-FP-NEXT:    vpblendvb %ymm6, %ymm12, %ymm15, %ymm15
1789; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
1790; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm7, %xmm6
1791; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm13, %xmm0
1792; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
1793; AVX2-FP-NEXT:    vpsrldq {{.*#+}} xmm6 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
1794; AVX2-FP-NEXT:    vpsrldq {{.*#+}} xmm7 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
1795; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
1796; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
1797; AVX2-FP-NEXT:    vpbroadcastq %xmm6, %ymm6
1798; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7]
1799; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm5[2,1,3,3,4,5,6,7]
1800; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1]
1801; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0],ymm0[1,2],ymm6[3],ymm0[4,5],ymm6[6],ymm0[7]
1802; AVX2-FP-NEXT:    vpshufb {{.*#+}} xmm6 = xmm3[0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
1803; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1]
1804; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
1805; AVX2-FP-NEXT:    vpblendvb %ymm7, %ymm0, %ymm6, %ymm0
1806; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm6 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u]
1807; AVX2-FP-NEXT:    vpshufb %ymm6, %ymm11, %ymm5
1808; AVX2-FP-NEXT:    vpshufb %ymm6, %ymm9, %ymm3
1809; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[8],ymm5[8],ymm3[9],ymm5[9],ymm3[10],ymm5[10],ymm3[11],ymm5[11]
1810; AVX2-FP-NEXT:    vpsrldq {{.*#+}} ymm4 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
1811; AVX2-FP-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm14[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
1812; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11]
1813; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3]
1814; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
1815; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
1816; AVX2-FP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
1817; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
1818; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7]
1819; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
1820; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
1821; AVX2-FP-NEXT:    vpblendvb %ymm7, %ymm1, %ymm2, %ymm1
1822; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1823; AVX2-FP-NEXT:    vmovdqa %ymm1, 128(%rax)
1824; AVX2-FP-NEXT:    vmovdqa %ymm15, 96(%rax)
1825; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
1826; AVX2-FP-NEXT:    vmovaps %ymm1, 160(%rax)
1827; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
1828; AVX2-FP-NEXT:    vmovaps %ymm1, 64(%rax)
1829; AVX2-FP-NEXT:    vmovdqa %ymm0, 32(%rax)
1830; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1831; AVX2-FP-NEXT:    vmovaps %ymm0, (%rax)
1832; AVX2-FP-NEXT:    addq $24, %rsp
1833; AVX2-FP-NEXT:    vzeroupper
1834; AVX2-FP-NEXT:    retq
1835;
1836; AVX2-FCP-LABEL: store_i16_stride6_vf16:
1837; AVX2-FCP:       # %bb.0:
1838; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm13
1839; AVX2-FCP-NEXT:    vmovdqa (%rsi), %ymm1
1840; AVX2-FCP-NEXT:    vmovdqa (%rdx), %ymm3
1841; AVX2-FCP-NEXT:    vmovdqa (%rcx), %ymm4
1842; AVX2-FCP-NEXT:    vmovdqa (%r8), %ymm2
1843; AVX2-FCP-NEXT:    vmovdqa (%rsi), %xmm6
1844; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
1845; AVX2-FCP-NEXT:    vpshufb %xmm5, %xmm6, %xmm8
1846; AVX2-FCP-NEXT:    vmovdqa (%rdi), %xmm7
1847; AVX2-FCP-NEXT:    vpshufb %xmm5, %xmm7, %xmm5
1848; AVX2-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
1849; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1]
1850; AVX2-FCP-NEXT:    vmovdqa (%rcx), %xmm8
1851; AVX2-FCP-NEXT:    vpsrldq {{.*#+}} xmm9 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
1852; AVX2-FCP-NEXT:    vmovdqa (%rdx), %xmm10
1853; AVX2-FCP-NEXT:    vpsrldq {{.*#+}} xmm11 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
1854; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
1855; AVX2-FCP-NEXT:    vpbroadcastq %xmm9, %ymm9
1856; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm9[2],ymm5[3,4],ymm9[5],ymm5[6,7]
1857; AVX2-FCP-NEXT:    vmovdqa (%r8), %xmm9
1858; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm9[2,1,3,3,4,5,6,7]
1859; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1]
1860; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm11[0],ymm5[1,2],ymm11[3],ymm5[4,5],ymm11[6],ymm5[7]
1861; AVX2-FCP-NEXT:    vmovdqa (%r9), %xmm11
1862; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm11[0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
1863; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1]
1864; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
1865; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm5, %ymm12, %ymm5
1866; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1867; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u]
1868; AVX2-FCP-NEXT:    vpshufb %ymm12, %ymm1, %ymm14
1869; AVX2-FCP-NEXT:    vpshufb %ymm12, %ymm13, %ymm12
1870; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm14 = ymm12[0],ymm14[0],ymm12[1],ymm14[1],ymm12[2],ymm14[2],ymm12[3],ymm14[3],ymm12[8],ymm14[8],ymm12[9],ymm14[9],ymm12[10],ymm14[10],ymm12[11],ymm14[11]
1871; AVX2-FCP-NEXT:    vpsrldq {{.*#+}} ymm12 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
1872; AVX2-FCP-NEXT:    vpsrldq {{.*#+}} ymm15 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
1873; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm15 = ymm15[0],ymm12[0],ymm15[1],ymm12[1],ymm15[2],ymm12[2],ymm15[3],ymm12[3],ymm15[8],ymm12[8],ymm15[9],ymm12[9],ymm15[10],ymm12[10],ymm15[11],ymm12[11]
1874; AVX2-FCP-NEXT:    vmovdqa (%r9), %ymm12
1875; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
1876; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2]
1877; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7]
1878; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
1879; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3]
1880; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7]
1881; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
1882; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3]
1883; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm14, %ymm15, %ymm0
1884; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1885; AVX2-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm14 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
1886; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm15 = [1,0,0,2,0,0,3,0]
1887; AVX2-FCP-NEXT:    vpermd %ymm14, %ymm15, %ymm14
1888; AVX2-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
1889; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[1,1,1,1]
1890; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7]
1891; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm9[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
1892; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1]
1893; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6],ymm15[7]
1894; AVX2-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm11[8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
1895; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1]
1896; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
1897; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm14, %ymm15, %ymm14
1898; AVX2-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm15 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15]
1899; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [5,0,0,6,0,0,7,0]
1900; AVX2-FCP-NEXT:    vpermd %ymm15, %ymm5, %ymm5
1901; AVX2-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm15 = ymm13[4],ymm1[4],ymm13[5],ymm1[5],ymm13[6],ymm1[6],ymm13[7],ymm1[7],ymm13[12],ymm1[12],ymm13[13],ymm1[13],ymm13[14],ymm1[14],ymm13[15],ymm1[15]
1902; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[3,3,3,3]
1903; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm15[2],ymm5[3,4],ymm15[5],ymm5[6,7]
1904; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
1905; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3]
1906; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2,3],ymm15[4],ymm5[5,6],ymm15[7]
1907; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm12[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31]
1908; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3]
1909; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm5, %ymm15, %ymm0
1910; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3]
1911; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [1,0,2,2,1,0,2,2]
1912; AVX2-FCP-NEXT:    # ymm8 = mem[0,1,0,1]
1913; AVX2-FCP-NEXT:    vpermd %ymm5, %ymm8, %ymm5
1914; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
1915; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1]
1916; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6],ymm5[7]
1917; AVX2-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm6 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero
1918; AVX2-FCP-NEXT:    vpbroadcastq %xmm6, %ymm6
1919; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7]
1920; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm11[0,0,2,1,4,5,6,7]
1921; AVX2-FCP-NEXT:    vpbroadcastq %xmm6, %ymm6
1922; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
1923; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm5, %ymm6, %ymm5
1924; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11]
1925; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [0,4,0,0,5,0,0,6]
1926; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm4, %ymm3
1927; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm13[0],ymm1[0],ymm13[1],ymm1[1],ymm13[2],ymm1[2],ymm13[3],ymm1[3],ymm13[8],ymm1[8],ymm13[9],ymm1[9],ymm13[10],ymm1[10],ymm13[11],ymm1[11]
1928; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
1929; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7]
1930; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
1931; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
1932; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
1933; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm12[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
1934; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
1935; AVX2-FCP-NEXT:    vpblendvb %ymm7, %ymm1, %ymm2, %ymm1
1936; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1937; AVX2-FCP-NEXT:    vmovdqa %ymm1, 96(%rax)
1938; AVX2-FCP-NEXT:    vmovdqa %ymm0, 160(%rax)
1939; AVX2-FCP-NEXT:    vmovdqa %ymm14, 64(%rax)
1940; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1941; AVX2-FCP-NEXT:    vmovaps %ymm0, 128(%rax)
1942; AVX2-FCP-NEXT:    vmovdqa %ymm5, (%rax)
1943; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
1944; AVX2-FCP-NEXT:    vmovaps %ymm0, 32(%rax)
1945; AVX2-FCP-NEXT:    vzeroupper
1946; AVX2-FCP-NEXT:    retq
1947;
1948; AVX512-LABEL: store_i16_stride6_vf16:
1949; AVX512:       # %bb.0:
1950; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1951; AVX512-NEXT:    vmovdqa (%rdi), %ymm0
1952; AVX512-NEXT:    vmovdqa (%rsi), %ymm2
1953; AVX512-NEXT:    vmovdqa (%rdx), %ymm4
1954; AVX512-NEXT:    vmovdqa (%rcx), %ymm5
1955; AVX512-NEXT:    vmovdqa (%r8), %ymm1
1956; AVX512-NEXT:    vmovdqa (%r9), %ymm3
1957; AVX512-NEXT:    vpsrldq {{.*#+}} ymm6 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
1958; AVX512-NEXT:    vpsrldq {{.*#+}} ymm7 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
1959; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11]
1960; AVX512-NEXT:    vpshufd {{.*#+}} ymm7 = ymm2[2,1,2,3,6,5,6,7]
1961; AVX512-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm7[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
1962; AVX512-NEXT:    vpshufd {{.*#+}} ymm8 = ymm0[2,1,2,3,6,5,6,7]
1963; AVX512-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm8[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
1964; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11]
1965; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,5,12,0,4,13,0,7]
1966; AVX512-NEXT:    vpermi2d %ymm6, %ymm7, %ymm8
1967; AVX512-NEXT:    vpshufd {{.*#+}} ymm6 = ymm3[1,2,2,3,5,6,6,7]
1968; AVX512-NEXT:    vpshufd {{.*#+}} ymm7 = ymm1[1,2,2,3,5,6,6,7]
1969; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11]
1970; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
1971; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0],ymm8[1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7]
1972; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm7 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
1973; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm8 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15]
1974; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [5,0,14,6,0,15,7,0]
1975; AVX512-NEXT:    vpermi2d %ymm7, %ymm8, %ymm9
1976; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm7
1977; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [8,21,10,11,22,13,14,23]
1978; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm9 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15]
1979; AVX512-NEXT:    vpermi2d %zmm9, %zmm7, %zmm8
1980; AVX512-NEXT:    vinserti64x4 $1, %ymm8, %zmm6, %zmm16
1981; AVX512-NEXT:    vmovdqa (%rcx), %xmm6
1982; AVX512-NEXT:    vmovdqa (%rdx), %xmm7
1983; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
1984; AVX512-NEXT:    vmovdqa (%rsi), %xmm8
1985; AVX512-NEXT:    vmovdqa (%rdi), %xmm10
1986; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3]
1987; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [0,8,0,1,9,0,2,10]
1988; AVX512-NEXT:    vpermi2d %ymm9, %ymm11, %ymm13
1989; AVX512-NEXT:    vmovdqa (%r9), %xmm9
1990; AVX512-NEXT:    vmovdqa (%r8), %xmm11
1991; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
1992; AVX512-NEXT:    vpbroadcastq %xmm14, %ymm14
1993; AVX512-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7]
1994; AVX512-NEXT:    vpsrldq {{.*#+}} xmm14 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
1995; AVX512-NEXT:    vpsrldq {{.*#+}} xmm15 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
1996; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
1997; AVX512-NEXT:    vpshufd {{.*#+}} xmm15 = xmm8[0,1,2,1]
1998; AVX512-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,5]
1999; AVX512-NEXT:    vpshufd {{.*#+}} xmm12 = xmm10[0,1,2,1]
2000; AVX512-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,7,6,5]
2001; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7]
2002; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm15 = [0,1,8,0,0,9,0,3]
2003; AVX512-NEXT:    vpermi2d %ymm14, %ymm12, %ymm15
2004; AVX512-NEXT:    vpshufd {{.*#+}} xmm12 = xmm9[1,2,2,3]
2005; AVX512-NEXT:    vpshufd {{.*#+}} xmm14 = xmm11[1,2,2,3]
2006; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3]
2007; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1]
2008; AVX512-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0],ymm15[1,2],ymm12[3],ymm15[4,5],ymm12[6],ymm15[7]
2009; AVX512-NEXT:    vinserti64x4 $1, %ymm12, %zmm13, %zmm12
2010; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11]
2011; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
2012; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [4,12,0,5,13,0,6,14]
2013; AVX512-NEXT:    vpermi2d %ymm4, %ymm0, %ymm2
2014; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11]
2015; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
2016; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
2017; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
2018; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
2019; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,0,10,2,0,11,3,0]
2020; AVX512-NEXT:    vpermi2d %ymm1, %ymm2, %ymm3
2021; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7]
2022; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,9,2,3,10,5,6,11]
2023; AVX512-NEXT:    vpermi2d %ymm1, %ymm3, %ymm2
2024; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
2025; AVX512-NEXT:    vmovdqa64 %zmm0, 64(%rax)
2026; AVX512-NEXT:    vmovdqa64 %zmm12, (%rax)
2027; AVX512-NEXT:    vmovdqa64 %zmm16, 128(%rax)
2028; AVX512-NEXT:    vzeroupper
2029; AVX512-NEXT:    retq
2030;
2031; AVX512-FCP-LABEL: store_i16_stride6_vf16:
2032; AVX512-FCP:       # %bb.0:
2033; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2034; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm2
2035; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm3
2036; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm4
2037; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm5
2038; AVX512-FCP-NEXT:    vmovdqa64 (%r8), %ymm16
2039; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm1
2040; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm6
2041; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm7
2042; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
2043; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm8
2044; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm10
2045; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3]
2046; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [0,8,0,1,9,0,2,10]
2047; AVX512-FCP-NEXT:    vpermi2d %ymm9, %ymm11, %ymm12
2048; AVX512-FCP-NEXT:    vmovdqa (%r9), %xmm9
2049; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm11
2050; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
2051; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [0,1,8,3,4,9,6,7]
2052; AVX512-FCP-NEXT:    vpermi2d %ymm13, %ymm12, %ymm14
2053; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
2054; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm8, %xmm13
2055; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm10, %xmm12
2056; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
2057; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm13 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
2058; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm15 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
2059; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3]
2060; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm15 = [0,1,8,0,0,9,0,3]
2061; AVX512-FCP-NEXT:    vpermi2d %ymm13, %ymm12, %ymm15
2062; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm0, %zmm12
2063; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [16,9,10,17,12,13,18,15]
2064; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm15 = xmm9[1,2,2,3]
2065; AVX512-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm11[1,2,2,3]
2066; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
2067; AVX512-FCP-NEXT:    vpermi2d %zmm0, %zmm12, %zmm13
2068; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm14, %zmm0
2069; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
2070; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
2071; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [1,0,10,2,0,11,3,0]
2072; AVX512-FCP-NEXT:    vpermi2d %ymm8, %ymm6, %ymm7
2073; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7]
2074; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,9,2,3,10,5,6,11]
2075; AVX512-FCP-NEXT:    vpermi2d %ymm6, %ymm7, %ymm8
2076; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11]
2077; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11]
2078; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [4,12,0,5,13,0,6,14]
2079; AVX512-FCP-NEXT:    vpermi2d %ymm6, %ymm7, %ymm9
2080; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm6
2081; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [8,9,20,11,12,21,14,15]
2082; AVX512-FCP-NEXT:    vmovdqa64 %ymm16, %ymm10
2083; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm10[0],ymm1[0],ymm10[1],ymm1[1],ymm10[2],ymm1[2],ymm10[3],ymm1[3],ymm10[8],ymm1[8],ymm10[9],ymm1[9],ymm10[10],ymm1[10],ymm10[11],ymm1[11]
2084; AVX512-FCP-NEXT:    vpermi2d %zmm9, %zmm6, %zmm7
2085; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm8, %zmm6
2086; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u]
2087; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm3, %ymm8
2088; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm2, %ymm7
2089; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11]
2090; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm8 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
2091; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm9 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
2092; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11]
2093; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [0,5,12,0,4,13,0,7]
2094; AVX512-FCP-NEXT:    vpermi2d %ymm8, %ymm7, %ymm9
2095; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm1[1,2,2,3,5,6,6,7]
2096; AVX512-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm16[1,2,2,3,5,6,6,7]
2097; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11]
2098; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [12,1,2,13,4,5,14,7]
2099; AVX512-FCP-NEXT:    vpermi2d %ymm7, %ymm9, %ymm8
2100; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15]
2101; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15]
2102; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [5,0,14,6,0,15,7,0]
2103; AVX512-FCP-NEXT:    vpermi2d %ymm2, %ymm3, %ymm4
2104; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm2
2105; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [8,21,10,11,22,13,14,23]
2106; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm10[4],ymm1[4],ymm10[5],ymm1[5],ymm10[6],ymm1[6],ymm10[7],ymm1[7],ymm10[12],ymm1[12],ymm10[13],ymm1[13],ymm10[14],ymm1[14],ymm10[15],ymm1[15]
2107; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
2108; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm8, %zmm1
2109; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, 128(%rax)
2110; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, 64(%rax)
2111; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
2112; AVX512-FCP-NEXT:    vzeroupper
2113; AVX512-FCP-NEXT:    retq
2114;
2115; AVX512DQ-LABEL: store_i16_stride6_vf16:
2116; AVX512DQ:       # %bb.0:
2117; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2118; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
2119; AVX512DQ-NEXT:    vmovdqa (%rsi), %ymm2
2120; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm4
2121; AVX512DQ-NEXT:    vmovdqa (%rcx), %ymm5
2122; AVX512DQ-NEXT:    vmovdqa (%r8), %ymm1
2123; AVX512DQ-NEXT:    vmovdqa (%r9), %ymm3
2124; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm6 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
2125; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm7 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
2126; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11]
2127; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm7 = ymm2[2,1,2,3,6,5,6,7]
2128; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm7[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
2129; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm8 = ymm0[2,1,2,3,6,5,6,7]
2130; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm8[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
2131; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11]
2132; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,5,12,0,4,13,0,7]
2133; AVX512DQ-NEXT:    vpermi2d %ymm6, %ymm7, %ymm8
2134; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm6 = ymm3[1,2,2,3,5,6,6,7]
2135; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm7 = ymm1[1,2,2,3,5,6,6,7]
2136; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11]
2137; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
2138; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0],ymm8[1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7]
2139; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm7 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
2140; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm8 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15]
2141; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [5,0,14,6,0,15,7,0]
2142; AVX512DQ-NEXT:    vpermi2d %ymm7, %ymm8, %ymm9
2143; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm7
2144; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [8,21,10,11,22,13,14,23]
2145; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm9 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15]
2146; AVX512DQ-NEXT:    vpermi2d %zmm9, %zmm7, %zmm8
2147; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm8, %zmm6, %zmm16
2148; AVX512DQ-NEXT:    vmovdqa (%rcx), %xmm6
2149; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm7
2150; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
2151; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm8
2152; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm10
2153; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3]
2154; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [0,8,0,1,9,0,2,10]
2155; AVX512DQ-NEXT:    vpermi2d %ymm9, %ymm11, %ymm13
2156; AVX512DQ-NEXT:    vmovdqa (%r9), %xmm9
2157; AVX512DQ-NEXT:    vmovdqa (%r8), %xmm11
2158; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
2159; AVX512DQ-NEXT:    vpbroadcastq %xmm14, %ymm14
2160; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7]
2161; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm14 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
2162; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm15 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
2163; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
2164; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm15 = xmm8[0,1,2,1]
2165; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,5]
2166; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm12 = xmm10[0,1,2,1]
2167; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,7,6,5]
2168; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7]
2169; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm15 = [0,1,8,0,0,9,0,3]
2170; AVX512DQ-NEXT:    vpermi2d %ymm14, %ymm12, %ymm15
2171; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm12 = xmm9[1,2,2,3]
2172; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm14 = xmm11[1,2,2,3]
2173; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3]
2174; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1]
2175; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0],ymm15[1,2],ymm12[3],ymm15[4,5],ymm12[6],ymm15[7]
2176; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm12, %zmm13, %zmm12
2177; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11]
2178; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
2179; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [4,12,0,5,13,0,6,14]
2180; AVX512DQ-NEXT:    vpermi2d %ymm4, %ymm0, %ymm2
2181; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11]
2182; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
2183; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
2184; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
2185; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
2186; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,0,10,2,0,11,3,0]
2187; AVX512DQ-NEXT:    vpermi2d %ymm1, %ymm2, %ymm3
2188; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7]
2189; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,9,2,3,10,5,6,11]
2190; AVX512DQ-NEXT:    vpermi2d %ymm1, %ymm3, %ymm2
2191; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
2192; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 64(%rax)
2193; AVX512DQ-NEXT:    vmovdqa64 %zmm12, (%rax)
2194; AVX512DQ-NEXT:    vmovdqa64 %zmm16, 128(%rax)
2195; AVX512DQ-NEXT:    vzeroupper
2196; AVX512DQ-NEXT:    retq
2197;
2198; AVX512DQ-FCP-LABEL: store_i16_stride6_vf16:
2199; AVX512DQ-FCP:       # %bb.0:
2200; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2201; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm2
2202; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm3
2203; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm4
2204; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm5
2205; AVX512DQ-FCP-NEXT:    vmovdqa64 (%r8), %ymm16
2206; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm1
2207; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm6
2208; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm7
2209; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
2210; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm8
2211; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm10
2212; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3]
2213; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [0,8,0,1,9,0,2,10]
2214; AVX512DQ-FCP-NEXT:    vpermi2d %ymm9, %ymm11, %ymm12
2215; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %xmm9
2216; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm11
2217; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
2218; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [0,1,8,3,4,9,6,7]
2219; AVX512DQ-FCP-NEXT:    vpermi2d %ymm13, %ymm12, %ymm14
2220; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
2221; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm8, %xmm13
2222; AVX512DQ-FCP-NEXT:    vpshufb %xmm12, %xmm10, %xmm12
2223; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
2224; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm13 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
2225; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm15 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
2226; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3]
2227; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm15 = [0,1,8,0,0,9,0,3]
2228; AVX512DQ-FCP-NEXT:    vpermi2d %ymm13, %ymm12, %ymm15
2229; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm0, %zmm12
2230; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm13 = [16,9,10,17,12,13,18,15]
2231; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm15 = xmm9[1,2,2,3]
2232; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm11[1,2,2,3]
2233; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
2234; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm12, %zmm13
2235; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm13, %zmm14, %zmm0
2236; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
2237; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
2238; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [1,0,10,2,0,11,3,0]
2239; AVX512DQ-FCP-NEXT:    vpermi2d %ymm8, %ymm6, %ymm7
2240; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7]
2241; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [0,9,2,3,10,5,6,11]
2242; AVX512DQ-FCP-NEXT:    vpermi2d %ymm6, %ymm7, %ymm8
2243; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11]
2244; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11]
2245; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [4,12,0,5,13,0,6,14]
2246; AVX512DQ-FCP-NEXT:    vpermi2d %ymm6, %ymm7, %ymm9
2247; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm6
2248; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [8,9,20,11,12,21,14,15]
2249; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm16, %ymm10
2250; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm10[0],ymm1[0],ymm10[1],ymm1[1],ymm10[2],ymm1[2],ymm10[3],ymm1[3],ymm10[8],ymm1[8],ymm10[9],ymm1[9],ymm10[10],ymm1[10],ymm10[11],ymm1[11]
2251; AVX512DQ-FCP-NEXT:    vpermi2d %zmm9, %zmm6, %zmm7
2252; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm8, %zmm6
2253; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u]
2254; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm3, %ymm8
2255; AVX512DQ-FCP-NEXT:    vpshufb %ymm7, %ymm2, %ymm7
2256; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11]
2257; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm8 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
2258; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm9 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
2259; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11]
2260; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [0,5,12,0,4,13,0,7]
2261; AVX512DQ-FCP-NEXT:    vpermi2d %ymm8, %ymm7, %ymm9
2262; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm1[1,2,2,3,5,6,6,7]
2263; AVX512DQ-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm16[1,2,2,3,5,6,6,7]
2264; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11]
2265; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [12,1,2,13,4,5,14,7]
2266; AVX512DQ-FCP-NEXT:    vpermi2d %ymm7, %ymm9, %ymm8
2267; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15]
2268; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15]
2269; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [5,0,14,6,0,15,7,0]
2270; AVX512DQ-FCP-NEXT:    vpermi2d %ymm2, %ymm3, %ymm4
2271; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm2
2272; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [8,21,10,11,22,13,14,23]
2273; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm10[4],ymm1[4],ymm10[5],ymm1[5],ymm10[6],ymm1[6],ymm10[7],ymm1[7],ymm10[12],ymm1[12],ymm10[13],ymm1[13],ymm10[14],ymm1[14],ymm10[15],ymm1[15]
2274; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
2275; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm8, %zmm1
2276; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 128(%rax)
2277; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, 64(%rax)
2278; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
2279; AVX512DQ-FCP-NEXT:    vzeroupper
2280; AVX512DQ-FCP-NEXT:    retq
2281;
2282; AVX512BW-LABEL: store_i16_stride6_vf16:
2283; AVX512BW:       # %bb.0:
2284; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2285; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
2286; AVX512BW-NEXT:    vmovdqa (%rdx), %ymm1
2287; AVX512BW-NEXT:    vmovdqa (%r8), %ymm2
2288; AVX512BW-NEXT:    vinserti64x4 $1, (%rsi), %zmm0, %zmm0
2289; AVX512BW-NEXT:    vinserti64x4 $1, (%rcx), %zmm1, %zmm1
2290; AVX512BW-NEXT:    vinserti64x4 $1, (%r9), %zmm2, %zmm2
2291; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm3 = [0,16,32,48,0,0,1,17,33,49,0,0,2,18,34,50,0,0,3,19,35,51,0,0,4,20,36,52,0,0,5,21]
2292; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm3
2293; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm4 = [0,1,2,3,32,48,6,7,8,9,33,49,12,13,14,15,34,50,18,19,20,21,35,51,24,25,26,27,36,52,30,31]
2294; AVX512BW-NEXT:    vpermi2w %zmm2, %zmm3, %zmm4
2295; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm3 = [5,21,0,0,38,54,6,22,0,0,39,55,7,23,0,0,40,56,8,24,0,0,41,57,9,25,0,0,42,58,10,26]
2296; AVX512BW-NEXT:    vpermi2w %zmm0, %zmm1, %zmm3
2297; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm5 = [0,1,37,53,4,5,6,7,38,54,10,11,12,13,39,55,16,17,18,19,40,56,22,23,24,25,41,57,28,29,30,31]
2298; AVX512BW-NEXT:    vpermi2w %zmm2, %zmm3, %zmm5
2299; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm3 = [0,0,11,27,43,59,0,0,12,28,44,60,0,0,13,29,45,61,0,0,14,30,46,62,0,0,15,31,47,63,0,0]
2300; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm3
2301; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm0 = [42,58,2,3,4,5,43,59,8,9,10,11,44,60,14,15,16,17,45,61,20,21,22,23,46,62,26,27,28,29,47,63]
2302; AVX512BW-NEXT:    vpermi2w %zmm2, %zmm3, %zmm0
2303; AVX512BW-NEXT:    vmovdqa64 %zmm0, 128(%rax)
2304; AVX512BW-NEXT:    vmovdqa64 %zmm5, 64(%rax)
2305; AVX512BW-NEXT:    vmovdqa64 %zmm4, (%rax)
2306; AVX512BW-NEXT:    vzeroupper
2307; AVX512BW-NEXT:    retq
2308;
2309; AVX512BW-FCP-LABEL: store_i16_stride6_vf16:
2310; AVX512BW-FCP:       # %bb.0:
2311; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2312; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %ymm0
2313; AVX512BW-FCP-NEXT:    vmovdqa (%rdx), %ymm1
2314; AVX512BW-FCP-NEXT:    vmovdqa (%r8), %ymm2
2315; AVX512BW-FCP-NEXT:    vinserti64x4 $1, (%rsi), %zmm0, %zmm0
2316; AVX512BW-FCP-NEXT:    vinserti64x4 $1, (%rcx), %zmm1, %zmm1
2317; AVX512BW-FCP-NEXT:    vinserti64x4 $1, (%r9), %zmm2, %zmm2
2318; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm3 = [0,16,32,48,0,0,1,17,33,49,0,0,2,18,34,50,0,0,3,19,35,51,0,0,4,20,36,52,0,0,5,21]
2319; AVX512BW-FCP-NEXT:    vpermi2w %zmm1, %zmm0, %zmm3
2320; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm4 = [0,1,2,3,32,48,6,7,8,9,33,49,12,13,14,15,34,50,18,19,20,21,35,51,24,25,26,27,36,52,30,31]
2321; AVX512BW-FCP-NEXT:    vpermi2w %zmm2, %zmm3, %zmm4
2322; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm3 = [5,21,0,0,38,54,6,22,0,0,39,55,7,23,0,0,40,56,8,24,0,0,41,57,9,25,0,0,42,58,10,26]
2323; AVX512BW-FCP-NEXT:    vpermi2w %zmm0, %zmm1, %zmm3
2324; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm5 = [0,1,37,53,4,5,6,7,38,54,10,11,12,13,39,55,16,17,18,19,40,56,22,23,24,25,41,57,28,29,30,31]
2325; AVX512BW-FCP-NEXT:    vpermi2w %zmm2, %zmm3, %zmm5
2326; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm3 = [0,0,11,27,43,59,0,0,12,28,44,60,0,0,13,29,45,61,0,0,14,30,46,62,0,0,15,31,47,63,0,0]
2327; AVX512BW-FCP-NEXT:    vpermi2w %zmm1, %zmm0, %zmm3
2328; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm0 = [42,58,2,3,4,5,43,59,8,9,10,11,44,60,14,15,16,17,45,61,20,21,22,23,46,62,26,27,28,29,47,63]
2329; AVX512BW-FCP-NEXT:    vpermi2w %zmm2, %zmm3, %zmm0
2330; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, 128(%rax)
2331; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, 64(%rax)
2332; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, (%rax)
2333; AVX512BW-FCP-NEXT:    vzeroupper
2334; AVX512BW-FCP-NEXT:    retq
2335;
2336; AVX512DQ-BW-LABEL: store_i16_stride6_vf16:
2337; AVX512DQ-BW:       # %bb.0:
2338; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2339; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %ymm0
2340; AVX512DQ-BW-NEXT:    vmovdqa (%rdx), %ymm1
2341; AVX512DQ-BW-NEXT:    vmovdqa (%r8), %ymm2
2342; AVX512DQ-BW-NEXT:    vinserti64x4 $1, (%rsi), %zmm0, %zmm0
2343; AVX512DQ-BW-NEXT:    vinserti64x4 $1, (%rcx), %zmm1, %zmm1
2344; AVX512DQ-BW-NEXT:    vinserti64x4 $1, (%r9), %zmm2, %zmm2
2345; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm3 = [0,16,32,48,0,0,1,17,33,49,0,0,2,18,34,50,0,0,3,19,35,51,0,0,4,20,36,52,0,0,5,21]
2346; AVX512DQ-BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm3
2347; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm4 = [0,1,2,3,32,48,6,7,8,9,33,49,12,13,14,15,34,50,18,19,20,21,35,51,24,25,26,27,36,52,30,31]
2348; AVX512DQ-BW-NEXT:    vpermi2w %zmm2, %zmm3, %zmm4
2349; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm3 = [5,21,0,0,38,54,6,22,0,0,39,55,7,23,0,0,40,56,8,24,0,0,41,57,9,25,0,0,42,58,10,26]
2350; AVX512DQ-BW-NEXT:    vpermi2w %zmm0, %zmm1, %zmm3
2351; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm5 = [0,1,37,53,4,5,6,7,38,54,10,11,12,13,39,55,16,17,18,19,40,56,22,23,24,25,41,57,28,29,30,31]
2352; AVX512DQ-BW-NEXT:    vpermi2w %zmm2, %zmm3, %zmm5
2353; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm3 = [0,0,11,27,43,59,0,0,12,28,44,60,0,0,13,29,45,61,0,0,14,30,46,62,0,0,15,31,47,63,0,0]
2354; AVX512DQ-BW-NEXT:    vpermi2w %zmm1, %zmm0, %zmm3
2355; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm0 = [42,58,2,3,4,5,43,59,8,9,10,11,44,60,14,15,16,17,45,61,20,21,22,23,46,62,26,27,28,29,47,63]
2356; AVX512DQ-BW-NEXT:    vpermi2w %zmm2, %zmm3, %zmm0
2357; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, 128(%rax)
2358; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, 64(%rax)
2359; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm4, (%rax)
2360; AVX512DQ-BW-NEXT:    vzeroupper
2361; AVX512DQ-BW-NEXT:    retq
2362;
2363; AVX512DQ-BW-FCP-LABEL: store_i16_stride6_vf16:
2364; AVX512DQ-BW-FCP:       # %bb.0:
2365; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2366; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %ymm0
2367; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdx), %ymm1
2368; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%r8), %ymm2
2369; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, (%rsi), %zmm0, %zmm0
2370; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, (%rcx), %zmm1, %zmm1
2371; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, (%r9), %zmm2, %zmm2
2372; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm3 = [0,16,32,48,0,0,1,17,33,49,0,0,2,18,34,50,0,0,3,19,35,51,0,0,4,20,36,52,0,0,5,21]
2373; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm1, %zmm0, %zmm3
2374; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm4 = [0,1,2,3,32,48,6,7,8,9,33,49,12,13,14,15,34,50,18,19,20,21,35,51,24,25,26,27,36,52,30,31]
2375; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm2, %zmm3, %zmm4
2376; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm3 = [5,21,0,0,38,54,6,22,0,0,39,55,7,23,0,0,40,56,8,24,0,0,41,57,9,25,0,0,42,58,10,26]
2377; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm0, %zmm1, %zmm3
2378; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm5 = [0,1,37,53,4,5,6,7,38,54,10,11,12,13,39,55,16,17,18,19,40,56,22,23,24,25,41,57,28,29,30,31]
2379; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm2, %zmm3, %zmm5
2380; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm3 = [0,0,11,27,43,59,0,0,12,28,44,60,0,0,13,29,45,61,0,0,14,30,46,62,0,0,15,31,47,63,0,0]
2381; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm1, %zmm0, %zmm3
2382; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm0 = [42,58,2,3,4,5,43,59,8,9,10,11,44,60,14,15,16,17,45,61,20,21,22,23,46,62,26,27,28,29,47,63]
2383; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm2, %zmm3, %zmm0
2384; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, 128(%rax)
2385; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, 64(%rax)
2386; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, (%rax)
2387; AVX512DQ-BW-FCP-NEXT:    vzeroupper
2388; AVX512DQ-BW-FCP-NEXT:    retq
2389  %in.vec0 = load <16 x i16>, ptr %in.vecptr0, align 64
2390  %in.vec1 = load <16 x i16>, ptr %in.vecptr1, align 64
2391  %in.vec2 = load <16 x i16>, ptr %in.vecptr2, align 64
2392  %in.vec3 = load <16 x i16>, ptr %in.vecptr3, align 64
2393  %in.vec4 = load <16 x i16>, ptr %in.vecptr4, align 64
2394  %in.vec5 = load <16 x i16>, ptr %in.vecptr5, align 64
2395  %1 = shufflevector <16 x i16> %in.vec0, <16 x i16> %in.vec1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
2396  %2 = shufflevector <16 x i16> %in.vec2, <16 x i16> %in.vec3, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
2397  %3 = shufflevector <16 x i16> %in.vec4, <16 x i16> %in.vec5, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
2398  %4 = shufflevector <32 x i16> %1, <32 x i16> %2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
2399  %5 = shufflevector <32 x i16> %3, <32 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2400  %6 = shufflevector <64 x i16> %4, <64 x i16> %5, <96 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95>
2401  %interleaved.vec = shufflevector <96 x i16> %6, <96 x i16> poison, <96 x i32> <i32 0, i32 16, i32 32, i32 48, i32 64, i32 80, i32 1, i32 17, i32 33, i32 49, i32 65, i32 81, i32 2, i32 18, i32 34, i32 50, i32 66, i32 82, i32 3, i32 19, i32 35, i32 51, i32 67, i32 83, i32 4, i32 20, i32 36, i32 52, i32 68, i32 84, i32 5, i32 21, i32 37, i32 53, i32 69, i32 85, i32 6, i32 22, i32 38, i32 54, i32 70, i32 86, i32 7, i32 23, i32 39, i32 55, i32 71, i32 87, i32 8, i32 24, i32 40, i32 56, i32 72, i32 88, i32 9, i32 25, i32 41, i32 57, i32 73, i32 89, i32 10, i32 26, i32 42, i32 58, i32 74, i32 90, i32 11, i32 27, i32 43, i32 59, i32 75, i32 91, i32 12, i32 28, i32 44, i32 60, i32 76, i32 92, i32 13, i32 29, i32 45, i32 61, i32 77, i32 93, i32 14, i32 30, i32 46, i32 62, i32 78, i32 94, i32 15, i32 31, i32 47, i32 63, i32 79, i32 95>
2402  store <96 x i16> %interleaved.vec, ptr %out.vec, align 64
2403  ret void
2404}
2405
2406define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind {
2407; SSE-LABEL: store_i16_stride6_vf32:
2408; SSE:       # %bb.0:
2409; SSE-NEXT:    subq $312, %rsp # imm = 0x138
2410; SSE-NEXT:    movdqa (%rdi), %xmm2
2411; SSE-NEXT:    movdqa 16(%rdi), %xmm13
2412; SSE-NEXT:    movdqa (%rsi), %xmm3
2413; SSE-NEXT:    movdqa 16(%rsi), %xmm1
2414; SSE-NEXT:    movdqa (%rdx), %xmm5
2415; SSE-NEXT:    movdqa 16(%rdx), %xmm14
2416; SSE-NEXT:    movdqa (%rcx), %xmm4
2417; SSE-NEXT:    movdqa 16(%rcx), %xmm10
2418; SSE-NEXT:    movdqa (%r8), %xmm8
2419; SSE-NEXT:    movdqa (%r9), %xmm11
2420; SSE-NEXT:    movdqa %xmm5, %xmm0
2421; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
2422; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2423; SSE-NEXT:    movdqa %xmm2, %xmm9
2424; SSE-NEXT:    punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3]
2425; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2426; SSE-NEXT:    shufps {{.*#+}} xmm9 = xmm9[2,3],xmm0[3,3]
2427; SSE-NEXT:    pshuflw {{.*#+}} xmm7 = xmm8[2,1,3,3,4,5,6,7]
2428; SSE-NEXT:    shufps {{.*#+}} xmm9 = xmm9[1,2],xmm7[0,1]
2429; SSE-NEXT:    shufps {{.*#+}} xmm9 = xmm9[2,0,1,3]
2430; SSE-NEXT:    movaps {{.*#+}} xmm6 = [65535,0,65535,65535,65535,65535,65535,0]
2431; SSE-NEXT:    andps %xmm6, %xmm9
2432; SSE-NEXT:    movdqa %xmm11, %xmm7
2433; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2434; SSE-NEXT:    pshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7]
2435; SSE-NEXT:    pshufd {{.*#+}} xmm11 = xmm11[0,1,2,1]
2436; SSE-NEXT:    movaps %xmm6, %xmm0
2437; SSE-NEXT:    andnps %xmm11, %xmm0
2438; SSE-NEXT:    orps %xmm9, %xmm0
2439; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2440; SSE-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
2441; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2442; SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
2443; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2444; SSE-NEXT:    movdqa %xmm2, %xmm3
2445; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,3],xmm5[3,3]
2446; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,6,5,7,7]
2447; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,2],xmm4[2,3]
2448; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0,1,3]
2449; SSE-NEXT:    andps %xmm6, %xmm3
2450; SSE-NEXT:    pshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,6,6,7]
2451; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
2452; SSE-NEXT:    movaps %xmm6, %xmm0
2453; SSE-NEXT:    andnps %xmm4, %xmm0
2454; SSE-NEXT:    orps %xmm3, %xmm0
2455; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2456; SSE-NEXT:    movdqa %xmm14, %xmm0
2457; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3]
2458; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2459; SSE-NEXT:    movdqa %xmm13, %xmm11
2460; SSE-NEXT:    punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3]
2461; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2462; SSE-NEXT:    shufps {{.*#+}} xmm11 = xmm11[2,3],xmm0[3,3]
2463; SSE-NEXT:    movdqa 16(%r8), %xmm15
2464; SSE-NEXT:    pshuflw {{.*#+}} xmm9 = xmm15[2,1,3,3,4,5,6,7]
2465; SSE-NEXT:    shufps {{.*#+}} xmm11 = xmm11[1,2],xmm9[0,1]
2466; SSE-NEXT:    movdqa 16(%r9), %xmm9
2467; SSE-NEXT:    pshuflw {{.*#+}} xmm12 = xmm9[0,2,2,3,4,5,6,7]
2468; SSE-NEXT:    pshufd {{.*#+}} xmm12 = xmm12[0,1,2,1]
2469; SSE-NEXT:    movaps %xmm6, %xmm0
2470; SSE-NEXT:    andnps %xmm12, %xmm0
2471; SSE-NEXT:    shufps {{.*#+}} xmm11 = xmm11[2,0,1,3]
2472; SSE-NEXT:    andps %xmm6, %xmm11
2473; SSE-NEXT:    orps %xmm11, %xmm0
2474; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2475; SSE-NEXT:    punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7]
2476; SSE-NEXT:    movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2477; SSE-NEXT:    punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7]
2478; SSE-NEXT:    movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2479; SSE-NEXT:    movdqa %xmm13, %xmm1
2480; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,3],xmm14[3,3]
2481; SSE-NEXT:    pshufhw {{.*#+}} xmm10 = xmm15[0,1,2,3,6,5,7,7]
2482; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,2],xmm10[2,3]
2483; SSE-NEXT:    pshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,4,6,6,7]
2484; SSE-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[2,1,2,3]
2485; SSE-NEXT:    movaps %xmm6, %xmm0
2486; SSE-NEXT:    andnps %xmm10, %xmm0
2487; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0,1,3]
2488; SSE-NEXT:    andps %xmm6, %xmm1
2489; SSE-NEXT:    orps %xmm1, %xmm0
2490; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2491; SSE-NEXT:    movdqa 32(%rdx), %xmm2
2492; SSE-NEXT:    movdqa 32(%rcx), %xmm1
2493; SSE-NEXT:    movdqa %xmm2, %xmm0
2494; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2495; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2496; SSE-NEXT:    movdqa 32(%rdi), %xmm3
2497; SSE-NEXT:    movdqa 32(%rsi), %xmm11
2498; SSE-NEXT:    movdqa %xmm3, %xmm10
2499; SSE-NEXT:    punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
2500; SSE-NEXT:    movdqa %xmm10, %xmm12
2501; SSE-NEXT:    shufps {{.*#+}} xmm12 = xmm12[2,3],xmm0[3,3]
2502; SSE-NEXT:    movdqa 32(%r8), %xmm14
2503; SSE-NEXT:    pshuflw {{.*#+}} xmm13 = xmm14[2,1,3,3,4,5,6,7]
2504; SSE-NEXT:    shufps {{.*#+}} xmm12 = xmm12[1,2],xmm13[0,1]
2505; SSE-NEXT:    movdqa 32(%r9), %xmm4
2506; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm4[0,2,2,3,4,5,6,7]
2507; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2508; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
2509; SSE-NEXT:    movaps %xmm6, %xmm13
2510; SSE-NEXT:    andnps %xmm0, %xmm13
2511; SSE-NEXT:    shufps {{.*#+}} xmm12 = xmm12[2,0,1,3]
2512; SSE-NEXT:    andps %xmm6, %xmm12
2513; SSE-NEXT:    orps %xmm12, %xmm13
2514; SSE-NEXT:    movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2515; SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
2516; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2517; SSE-NEXT:    movdqa %xmm3, %xmm0
2518; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7]
2519; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2520; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,3],xmm2[3,3]
2521; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,6,5,7,7]
2522; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,3]
2523; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,4,6,6,7]
2524; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
2525; SSE-NEXT:    movaps %xmm6, %xmm11
2526; SSE-NEXT:    andnps %xmm1, %xmm11
2527; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
2528; SSE-NEXT:    andps %xmm6, %xmm0
2529; SSE-NEXT:    orps %xmm0, %xmm11
2530; SSE-NEXT:    movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2531; SSE-NEXT:    movdqa 48(%rdx), %xmm3
2532; SSE-NEXT:    movdqa 48(%rcx), %xmm4
2533; SSE-NEXT:    movdqa %xmm3, %xmm5
2534; SSE-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
2535; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2536; SSE-NEXT:    movdqa 48(%rdi), %xmm0
2537; SSE-NEXT:    movdqa 48(%rsi), %xmm1
2538; SSE-NEXT:    movdqa %xmm0, %xmm11
2539; SSE-NEXT:    punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3]
2540; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2541; SSE-NEXT:    shufps {{.*#+}} xmm11 = xmm11[2,3],xmm5[3,3]
2542; SSE-NEXT:    movdqa 48(%r8), %xmm12
2543; SSE-NEXT:    pshuflw {{.*#+}} xmm13 = xmm12[2,1,3,3,4,5,6,7]
2544; SSE-NEXT:    shufps {{.*#+}} xmm11 = xmm11[1,2],xmm13[0,1]
2545; SSE-NEXT:    movdqa 48(%r9), %xmm2
2546; SSE-NEXT:    pshuflw {{.*#+}} xmm13 = xmm2[0,2,2,3,4,5,6,7]
2547; SSE-NEXT:    movdqa %xmm2, (%rsp) # 16-byte Spill
2548; SSE-NEXT:    pshufd {{.*#+}} xmm13 = xmm13[0,1,2,1]
2549; SSE-NEXT:    movaps %xmm6, %xmm7
2550; SSE-NEXT:    andnps %xmm13, %xmm7
2551; SSE-NEXT:    shufps {{.*#+}} xmm11 = xmm11[2,0,1,3]
2552; SSE-NEXT:    andps %xmm6, %xmm11
2553; SSE-NEXT:    orps %xmm11, %xmm7
2554; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2555; SSE-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
2556; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2557; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2558; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2559; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,3],xmm3[3,3]
2560; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,6,5,7,7]
2561; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,3]
2562; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
2563; SSE-NEXT:    andps %xmm6, %xmm0
2564; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,6,6,7]
2565; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
2566; SSE-NEXT:    andnps %xmm1, %xmm6
2567; SSE-NEXT:    orps %xmm0, %xmm6
2568; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2569; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
2570; SSE-NEXT:    movaps %xmm5, %xmm0
2571; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2572; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
2573; SSE-NEXT:    movdqa %xmm8, %xmm1
2574; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[1,3]
2575; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,2]
2576; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
2577; SSE-NEXT:    movdqa %xmm3, %xmm11
2578; SSE-NEXT:    pslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm11[0,1,2,3,4,5]
2579; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,65535,65535]
2580; SSE-NEXT:    movdqa %xmm1, %xmm2
2581; SSE-NEXT:    pandn %xmm11, %xmm2
2582; SSE-NEXT:    andps %xmm1, %xmm0
2583; SSE-NEXT:    por %xmm0, %xmm2
2584; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2585; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1]
2586; SSE-NEXT:    pshuflw {{.*#+}} xmm11 = xmm8[1,1,1,1,4,5,6,7]
2587; SSE-NEXT:    shufps {{.*#+}} xmm11 = xmm11[1,1],xmm5[1,1]
2588; SSE-NEXT:    shufps {{.*#+}} xmm11 = xmm11[2,0],xmm4[0,2]
2589; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,65535,65535]
2590; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[0,0,1,1]
2591; SSE-NEXT:    movdqa %xmm5, %xmm0
2592; SSE-NEXT:    pandn %xmm6, %xmm0
2593; SSE-NEXT:    andps %xmm5, %xmm11
2594; SSE-NEXT:    por %xmm11, %xmm0
2595; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2596; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
2597; SSE-NEXT:    movaps %xmm7, %xmm6
2598; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2599; SSE-NEXT:    movlhps {{.*#+}} xmm6 = xmm6[0],xmm4[0]
2600; SSE-NEXT:    movdqa %xmm8, %xmm11
2601; SSE-NEXT:    shufps {{.*#+}} xmm11 = xmm11[2,1],xmm4[1,3]
2602; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[2,0],xmm11[0,2]
2603; SSE-NEXT:    movdqa %xmm3, %xmm0
2604; SSE-NEXT:    pshufd {{.*#+}} xmm11 = xmm3[2,2,3,3]
2605; SSE-NEXT:    pslld $16, %xmm0
2606; SSE-NEXT:    movdqa %xmm1, %xmm2
2607; SSE-NEXT:    pandn %xmm0, %xmm2
2608; SSE-NEXT:    andps %xmm1, %xmm6
2609; SSE-NEXT:    por %xmm6, %xmm2
2610; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2611; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm7[1]
2612; SSE-NEXT:    psrldq {{.*#+}} xmm8 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
2613; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[1,1],xmm7[1,1]
2614; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[0,2]
2615; SSE-NEXT:    movdqa %xmm5, %xmm0
2616; SSE-NEXT:    pandn %xmm11, %xmm0
2617; SSE-NEXT:    andps %xmm5, %xmm8
2618; SSE-NEXT:    por %xmm8, %xmm0
2619; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2620; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
2621; SSE-NEXT:    movaps %xmm3, %xmm6
2622; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2623; SSE-NEXT:    movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0]
2624; SSE-NEXT:    movdqa %xmm15, %xmm2
2625; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[1,3]
2626; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[0,2]
2627; SSE-NEXT:    movdqa %xmm9, %xmm8
2628; SSE-NEXT:    pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5]
2629; SSE-NEXT:    movdqa %xmm1, %xmm2
2630; SSE-NEXT:    pandn %xmm8, %xmm2
2631; SSE-NEXT:    andps %xmm1, %xmm6
2632; SSE-NEXT:    por %xmm6, %xmm2
2633; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2634; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
2635; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm15[1,1,1,1,4,5,6,7]
2636; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[1,1],xmm3[1,1]
2637; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[2,0],xmm0[0,2]
2638; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm9[0,0,1,1]
2639; SSE-NEXT:    movdqa %xmm5, %xmm0
2640; SSE-NEXT:    pandn %xmm8, %xmm0
2641; SSE-NEXT:    andps %xmm5, %xmm6
2642; SSE-NEXT:    por %xmm6, %xmm0
2643; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2644; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2645; SSE-NEXT:    movaps %xmm2, %xmm8
2646; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2647; SSE-NEXT:    movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0]
2648; SSE-NEXT:    movdqa %xmm15, %xmm6
2649; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[2,1],xmm0[1,3]
2650; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[2,0],xmm6[0,2]
2651; SSE-NEXT:    pshufd {{.*#+}} xmm11 = xmm9[2,2,3,3]
2652; SSE-NEXT:    pslld $16, %xmm9
2653; SSE-NEXT:    movdqa %xmm1, %xmm7
2654; SSE-NEXT:    pandn %xmm9, %xmm7
2655; SSE-NEXT:    andps %xmm1, %xmm8
2656; SSE-NEXT:    por %xmm8, %xmm7
2657; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
2658; SSE-NEXT:    psrldq {{.*#+}} xmm15 = xmm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
2659; SSE-NEXT:    shufps {{.*#+}} xmm15 = xmm15[1,1],xmm2[1,1]
2660; SSE-NEXT:    shufps {{.*#+}} xmm15 = xmm15[2,0],xmm0[0,2]
2661; SSE-NEXT:    movdqa %xmm5, %xmm8
2662; SSE-NEXT:    pandn %xmm11, %xmm8
2663; SSE-NEXT:    andps %xmm5, %xmm15
2664; SSE-NEXT:    por %xmm15, %xmm8
2665; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2666; SSE-NEXT:    movdqa %xmm2, %xmm9
2667; SSE-NEXT:    punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm10[0]
2668; SSE-NEXT:    movdqa %xmm14, %xmm3
2669; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm10[1,3]
2670; SSE-NEXT:    shufps {{.*#+}} xmm9 = xmm9[2,0],xmm3[0,2]
2671; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2672; SSE-NEXT:    movdqa %xmm0, %xmm11
2673; SSE-NEXT:    pslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm11[0,1,2,3,4,5]
2674; SSE-NEXT:    movdqa %xmm1, %xmm6
2675; SSE-NEXT:    pandn %xmm11, %xmm6
2676; SSE-NEXT:    andps %xmm1, %xmm9
2677; SSE-NEXT:    por %xmm9, %xmm6
2678; SSE-NEXT:    punpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm2[1]
2679; SSE-NEXT:    pshuflw {{.*#+}} xmm11 = xmm14[1,1,1,1,4,5,6,7]
2680; SSE-NEXT:    shufps {{.*#+}} xmm11 = xmm11[1,1],xmm2[1,1]
2681; SSE-NEXT:    shufps {{.*#+}} xmm11 = xmm11[2,0],xmm10[0,2]
2682; SSE-NEXT:    pshufd {{.*#+}} xmm13 = xmm0[0,0,1,1]
2683; SSE-NEXT:    movdqa %xmm5, %xmm9
2684; SSE-NEXT:    pandn %xmm13, %xmm9
2685; SSE-NEXT:    andps %xmm5, %xmm11
2686; SSE-NEXT:    por %xmm11, %xmm9
2687; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
2688; SSE-NEXT:    movaps %xmm3, %xmm11
2689; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2690; SSE-NEXT:    movlhps {{.*#+}} xmm11 = xmm11[0],xmm2[0]
2691; SSE-NEXT:    movdqa %xmm14, %xmm13
2692; SSE-NEXT:    shufps {{.*#+}} xmm13 = xmm13[2,1],xmm2[1,3]
2693; SSE-NEXT:    shufps {{.*#+}} xmm11 = xmm11[2,0],xmm13[0,2]
2694; SSE-NEXT:    pshufd {{.*#+}} xmm13 = xmm0[2,2,3,3]
2695; SSE-NEXT:    pslld $16, %xmm0
2696; SSE-NEXT:    movdqa %xmm1, %xmm15
2697; SSE-NEXT:    pandn %xmm0, %xmm15
2698; SSE-NEXT:    andps %xmm1, %xmm11
2699; SSE-NEXT:    por %xmm11, %xmm15
2700; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
2701; SSE-NEXT:    psrldq {{.*#+}} xmm14 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
2702; SSE-NEXT:    shufps {{.*#+}} xmm14 = xmm14[1,1],xmm3[1,1]
2703; SSE-NEXT:    shufps {{.*#+}} xmm14 = xmm14[2,0],xmm2[0,2]
2704; SSE-NEXT:    movdqa %xmm5, %xmm10
2705; SSE-NEXT:    pandn %xmm13, %xmm10
2706; SSE-NEXT:    andps %xmm5, %xmm14
2707; SSE-NEXT:    por %xmm14, %xmm10
2708; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2709; SSE-NEXT:    movaps %xmm2, %xmm11
2710; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2711; SSE-NEXT:    movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0]
2712; SSE-NEXT:    movdqa %xmm12, %xmm13
2713; SSE-NEXT:    shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[1,3]
2714; SSE-NEXT:    shufps {{.*#+}} xmm11 = xmm11[2,0],xmm13[0,2]
2715; SSE-NEXT:    movdqa (%rsp), %xmm4 # 16-byte Reload
2716; SSE-NEXT:    movdqa %xmm4, %xmm14
2717; SSE-NEXT:    pslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm14[0,1,2,3,4,5]
2718; SSE-NEXT:    movdqa %xmm1, %xmm13
2719; SSE-NEXT:    pandn %xmm14, %xmm13
2720; SSE-NEXT:    andps %xmm1, %xmm11
2721; SSE-NEXT:    por %xmm11, %xmm13
2722; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
2723; SSE-NEXT:    pshuflw {{.*#+}} xmm14 = xmm12[1,1,1,1,4,5,6,7]
2724; SSE-NEXT:    shufps {{.*#+}} xmm14 = xmm14[1,1],xmm2[1,1]
2725; SSE-NEXT:    shufps {{.*#+}} xmm14 = xmm14[2,0],xmm0[0,2]
2726; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[0,0,1,1]
2727; SSE-NEXT:    movdqa %xmm5, %xmm11
2728; SSE-NEXT:    pandn %xmm2, %xmm11
2729; SSE-NEXT:    andps %xmm5, %xmm14
2730; SSE-NEXT:    por %xmm14, %xmm11
2731; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
2732; SSE-NEXT:    movaps %xmm3, %xmm2
2733; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2734; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
2735; SSE-NEXT:    movdqa %xmm12, %xmm14
2736; SSE-NEXT:    shufps {{.*#+}} xmm14 = xmm14[2,1],xmm0[1,3]
2737; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm14[0,2]
2738; SSE-NEXT:    andps %xmm1, %xmm2
2739; SSE-NEXT:    pshufd {{.*#+}} xmm14 = xmm4[2,2,3,3]
2740; SSE-NEXT:    pslld $16, %xmm4
2741; SSE-NEXT:    pandn %xmm4, %xmm1
2742; SSE-NEXT:    por %xmm2, %xmm1
2743; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
2744; SSE-NEXT:    psrldq {{.*#+}} xmm12 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
2745; SSE-NEXT:    shufps {{.*#+}} xmm12 = xmm12[1,1],xmm3[1,1]
2746; SSE-NEXT:    shufps {{.*#+}} xmm12 = xmm12[2,0],xmm0[0,2]
2747; SSE-NEXT:    andps %xmm5, %xmm12
2748; SSE-NEXT:    pandn %xmm14, %xmm5
2749; SSE-NEXT:    por %xmm12, %xmm5
2750; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
2751; SSE-NEXT:    movdqa %xmm5, 352(%rax)
2752; SSE-NEXT:    movdqa %xmm1, 336(%rax)
2753; SSE-NEXT:    movdqa %xmm11, 304(%rax)
2754; SSE-NEXT:    movdqa %xmm13, 288(%rax)
2755; SSE-NEXT:    movdqa %xmm10, 256(%rax)
2756; SSE-NEXT:    movdqa %xmm15, 240(%rax)
2757; SSE-NEXT:    movdqa %xmm9, 208(%rax)
2758; SSE-NEXT:    movdqa %xmm6, 192(%rax)
2759; SSE-NEXT:    movdqa %xmm8, 160(%rax)
2760; SSE-NEXT:    movdqa %xmm7, 144(%rax)
2761; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2762; SSE-NEXT:    movaps %xmm0, 112(%rax)
2763; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2764; SSE-NEXT:    movaps %xmm0, 96(%rax)
2765; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2766; SSE-NEXT:    movaps %xmm0, 64(%rax)
2767; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2768; SSE-NEXT:    movaps %xmm0, 48(%rax)
2769; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2770; SSE-NEXT:    movaps %xmm0, 16(%rax)
2771; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2772; SSE-NEXT:    movaps %xmm0, (%rax)
2773; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2774; SSE-NEXT:    movaps %xmm0, 368(%rax)
2775; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2776; SSE-NEXT:    movaps %xmm0, 320(%rax)
2777; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2778; SSE-NEXT:    movaps %xmm0, 272(%rax)
2779; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2780; SSE-NEXT:    movaps %xmm0, 224(%rax)
2781; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2782; SSE-NEXT:    movaps %xmm0, 176(%rax)
2783; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2784; SSE-NEXT:    movaps %xmm0, 128(%rax)
2785; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2786; SSE-NEXT:    movaps %xmm0, 80(%rax)
2787; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2788; SSE-NEXT:    movaps %xmm0, 32(%rax)
2789; SSE-NEXT:    addq $312, %rsp # imm = 0x138
2790; SSE-NEXT:    retq
2791;
2792; AVX-LABEL: store_i16_stride6_vf32:
2793; AVX:       # %bb.0:
2794; AVX-NEXT:    subq $120, %rsp
2795; AVX-NEXT:    vmovdqa 32(%rcx), %xmm8
2796; AVX-NEXT:    vmovdqa 48(%rcx), %xmm0
2797; AVX-NEXT:    vmovdqa 32(%rdx), %xmm9
2798; AVX-NEXT:    vmovdqa 48(%rdx), %xmm1
2799; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2800; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm6[2,2,3,3]
2801; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2802; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm3[0,0,1,1]
2803; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm1
2804; AVX-NEXT:    vmovdqa 32(%rsi), %xmm10
2805; AVX-NEXT:    vmovdqa 48(%rsi), %xmm2
2806; AVX-NEXT:    vmovdqa 32(%rdi), %xmm11
2807; AVX-NEXT:    vmovdqa 48(%rdi), %xmm4
2808; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
2809; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[2,3,2,3]
2810; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
2811; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm4[0,1,0,1]
2812; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm5, %ymm2
2813; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
2814; AVX-NEXT:    vextractf128 $1, %ymm5, %xmm2
2815; AVX-NEXT:    vmovdqa 48(%r8), %xmm1
2816; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7]
2817; AVX-NEXT:    vmovdqa 48(%r9), %xmm2
2818; AVX-NEXT:    vpslld $16, %xmm2, %xmm12
2819; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm12[5],xmm7[6,7]
2820; AVX-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2821; AVX-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm1[2,1,3,3,4,5,6,7]
2822; AVX-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1]
2823; AVX-NEXT:    vblendps {{.*#+}} xmm5 = xmm7[0],xmm5[1,2],xmm7[3]
2824; AVX-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm2[0,2,2,3,4,5,6,7]
2825; AVX-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1]
2826; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2,3,4,5,6],xmm7[7]
2827; AVX-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2828; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
2829; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm4, %ymm4
2830; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm3[1,1,2,2]
2831; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
2832; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
2833; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
2834; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm4
2835; AVX-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,6,5,7,7]
2836; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3]
2837; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3,4,5],xmm5[6,7]
2838; AVX-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,6,6,7]
2839; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3]
2840; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5,6],xmm5[7]
2841; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2842; AVX-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
2843; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3]
2844; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[2,2,3,3]
2845; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5,6,7]
2846; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2847; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
2848; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
2849; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,2]
2850; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm7[2,2,3,3]
2851; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm4, %ymm4
2852; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm3[2,3,2,3]
2853; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm3, %ymm5
2854; AVX-NEXT:    vblendps {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
2855; AVX-NEXT:    vmovdqa 32(%r8), %xmm4
2856; AVX-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,6,5,7,7]
2857; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3]
2858; AVX-NEXT:    vextractf128 $1, %ymm12, %xmm13
2859; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm5[0,1],xmm13[2,3,4,5],xmm5[6,7]
2860; AVX-NEXT:    vmovdqa 32(%r9), %xmm5
2861; AVX-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm5[0,1,2,3,4,6,6,7]
2862; AVX-NEXT:    vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3]
2863; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5,6],xmm14[7]
2864; AVX-NEXT:    vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2865; AVX-NEXT:    vpsrldq {{.*#+}} xmm13 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
2866; AVX-NEXT:    vblendps {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3]
2867; AVX-NEXT:    vpshufd {{.*#+}} xmm13 = xmm5[2,2,3,3]
2868; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm13[3],xmm12[4,5,6,7]
2869; AVX-NEXT:    vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2870; AVX-NEXT:    vpshufd {{.*#+}} xmm12 = xmm6[0,0,1,1]
2871; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2]
2872; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm12, %ymm6
2873; AVX-NEXT:    vpshufd {{.*#+}} xmm12 = xmm0[0,1,0,1]
2874; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm12, %ymm0
2875; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3],ymm6[4],ymm0[5,6],ymm6[7]
2876; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm6
2877; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm12 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
2878; AVX-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm12[2,3],xmm6[4,5,6,7]
2879; AVX-NEXT:    vpshufd {{.*#+}} xmm12 = xmm2[0,0,1,1]
2880; AVX-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3],xmm6[4,5,6,7]
2881; AVX-NEXT:    vmovdqa %xmm6, (%rsp) # 16-byte Spill
2882; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
2883; AVX-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5]
2884; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
2885; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2886; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
2887; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
2888; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,2]
2889; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
2890; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
2891; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm2[0,1,0,1]
2892; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm6, %ymm6
2893; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2,3],ymm1[4],ymm6[5,6],ymm1[7]
2894; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm6
2895; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm8 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
2896; AVX-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3],xmm6[4,5,6,7]
2897; AVX-NEXT:    vpshufd {{.*#+}} xmm8 = xmm5[0,0,1,1]
2898; AVX-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3],xmm6[4,5,6,7]
2899; AVX-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2900; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3]
2901; AVX-NEXT:    vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5]
2902; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5],xmm1[6,7]
2903; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2904; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
2905; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm7[0,0,1,1]
2906; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2907; AVX-NEXT:    vmovdqa 16(%rcx), %xmm1
2908; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
2909; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
2910; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
2911; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
2912; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
2913; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5],xmm2[6,7]
2914; AVX-NEXT:    vpslld $16, %xmm5, %xmm3
2915; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5],xmm2[6,7]
2916; AVX-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2917; AVX-NEXT:    vmovdqa 16(%rdx), %xmm2
2918; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm4[2,1,3,3,4,5,6,7]
2919; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
2920; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3]
2921; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm5[0,2,2,3,4,5,6,7]
2922; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
2923; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3,4,5,6],xmm3[7]
2924; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2925; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
2926; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm11[2,2,3,3]
2927; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
2928; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
2929; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2930; AVX-NEXT:    vmovdqa 16(%rsi), %xmm2
2931; AVX-NEXT:    vmovdqa 16(%rdi), %xmm3
2932; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
2933; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
2934; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm10[2,3,2,3]
2935; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm4[0,1,0,1]
2936; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
2937; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
2938; AVX-NEXT:    vmovdqa 16(%r8), %xmm3
2939; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
2940; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7]
2941; AVX-NEXT:    vmovdqa 16(%r9), %xmm2
2942; AVX-NEXT:    vpslld $16, %xmm2, %xmm6
2943; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm6[5],xmm5[6,7]
2944; AVX-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2945; AVX-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm3[2,1,3,3,4,5,6,7]
2946; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1]
2947; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm5[0],xmm0[1,2],xmm5[3]
2948; AVX-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm2[0,2,2,3,4,5,6,7]
2949; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1]
2950; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3,4,5,6],xmm5[7]
2951; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2952; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
2953; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm4, %ymm0
2954; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[1,1,2,2]
2955; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
2956; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm4, %ymm1
2957; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
2958; AVX-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,5,7,7]
2959; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
2960; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm4
2961; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3,4,5],xmm1[6,7]
2962; AVX-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,6,6,7]
2963; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
2964; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3,4,5,6],xmm4[7]
2965; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2966; AVX-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
2967; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
2968; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
2969; AVX-NEXT:    vpblendw {{.*#+}} xmm15 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
2970; AVX-NEXT:    vmovdqa (%rcx), %xmm9
2971; AVX-NEXT:    vmovdqa (%rdx), %xmm8
2972; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
2973; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm5[1,1,2,2]
2974; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm5[2,2,3,3]
2975; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2976; AVX-NEXT:    vmovdqa (%rsi), %xmm7
2977; AVX-NEXT:    vmovdqa (%rdi), %xmm6
2978; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
2979; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm4[2,3,2,3]
2980; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm4, %ymm1
2981; AVX-NEXT:    vblendps {{.*#+}} ymm12 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
2982; AVX-NEXT:    vmovdqa (%r8), %xmm1
2983; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,6,5,7,7]
2984; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
2985; AVX-NEXT:    vextractf128 $1, %ymm12, %xmm13
2986; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm0[0,1],xmm13[2,3,4,5],xmm0[6,7]
2987; AVX-NEXT:    vmovdqa (%r9), %xmm0
2988; AVX-NEXT:    vpshufhw {{.*#+}} xmm14 = xmm0[0,1,2,3,4,6,6,7]
2989; AVX-NEXT:    vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3]
2990; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5,6],xmm14[7]
2991; AVX-NEXT:    vpsrldq {{.*#+}} xmm14 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
2992; AVX-NEXT:    vblendps {{.*#+}} xmm12 = xmm12[0],xmm14[1],xmm12[2,3]
2993; AVX-NEXT:    vpshufd {{.*#+}} xmm14 = xmm0[2,2,3,3]
2994; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm14[3],xmm12[4,5,6,7]
2995; AVX-NEXT:    vpshufd {{.*#+}} xmm14 = xmm11[0,0,1,1]
2996; AVX-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,2]
2997; AVX-NEXT:    vinsertf128 $1, %xmm11, %ymm14, %ymm11
2998; AVX-NEXT:    vpshufd {{.*#+}} xmm14 = xmm10[0,1,0,1]
2999; AVX-NEXT:    vinsertf128 $1, %xmm10, %ymm14, %ymm10
3000; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5,6],ymm11[7]
3001; AVX-NEXT:    vextractf128 $1, %ymm11, %xmm10
3002; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm14 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
3003; AVX-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3],xmm10[4,5,6,7]
3004; AVX-NEXT:    vpshufd {{.*#+}} xmm14 = xmm2[0,0,1,1]
3005; AVX-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm14[3],xmm10[4,5,6,7]
3006; AVX-NEXT:    vinsertps {{.*#+}} xmm3 = xmm11[0,1],xmm3[0],xmm11[3]
3007; AVX-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5]
3008; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5],xmm3[6,7]
3009; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
3010; AVX-NEXT:    vpshufd {{.*#+}} xmm8 = xmm3[0,0,1,1]
3011; AVX-NEXT:    vpshufd {{.*#+}} xmm9 = xmm3[1,1,2,2]
3012; AVX-NEXT:    vinsertf128 $1, %xmm9, %ymm8, %ymm8
3013; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
3014; AVX-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[0,1,0,1]
3015; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm7, %ymm7
3016; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6],ymm8[7]
3017; AVX-NEXT:    vextractf128 $1, %ymm7, %xmm8
3018; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
3019; AVX-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6,7]
3020; AVX-NEXT:    vpshufd {{.*#+}} xmm9 = xmm0[0,0,1,1]
3021; AVX-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3],xmm8[4,5,6,7]
3022; AVX-NEXT:    vinsertps {{.*#+}} xmm7 = xmm7[0,1],xmm1[0],xmm7[3]
3023; AVX-NEXT:    vpslldq {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
3024; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm9[5],xmm7[6,7]
3025; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
3026; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[0,0,1,1]
3027; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm3, %ymm3
3028; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3]
3029; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
3030; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm5, %ymm4
3031; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
3032; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm4
3033; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm1[4,5],xmm4[6,7]
3034; AVX-NEXT:    vpslld $16, %xmm0, %xmm5
3035; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5],xmm4[6,7]
3036; AVX-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,1,3,3,4,5,6,7]
3037; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
3038; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1,2],xmm1[3]
3039; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
3040; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
3041; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6],xmm0[7]
3042; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3043; AVX-NEXT:    vmovdqa %xmm0, 32(%rax)
3044; AVX-NEXT:    vmovdqa %xmm4, 48(%rax)
3045; AVX-NEXT:    vmovdqa %xmm7, (%rax)
3046; AVX-NEXT:    vmovdqa %xmm8, 16(%rax)
3047; AVX-NEXT:    vmovdqa %xmm2, 96(%rax)
3048; AVX-NEXT:    vmovdqa %xmm10, 112(%rax)
3049; AVX-NEXT:    vmovdqa %xmm12, 64(%rax)
3050; AVX-NEXT:    vmovdqa %xmm13, 80(%rax)
3051; AVX-NEXT:    vmovdqa %xmm15, 160(%rax)
3052; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3053; AVX-NEXT:    vmovaps %xmm0, 176(%rax)
3054; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3055; AVX-NEXT:    vmovaps %xmm0, 128(%rax)
3056; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3057; AVX-NEXT:    vmovaps %xmm0, 144(%rax)
3058; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3059; AVX-NEXT:    vmovaps %xmm0, 224(%rax)
3060; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3061; AVX-NEXT:    vmovaps %xmm0, 240(%rax)
3062; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3063; AVX-NEXT:    vmovaps %xmm0, 192(%rax)
3064; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3065; AVX-NEXT:    vmovaps %xmm0, 208(%rax)
3066; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3067; AVX-NEXT:    vmovaps %xmm0, 288(%rax)
3068; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
3069; AVX-NEXT:    vmovaps %xmm0, 304(%rax)
3070; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3071; AVX-NEXT:    vmovaps %xmm0, 256(%rax)
3072; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3073; AVX-NEXT:    vmovaps %xmm0, 272(%rax)
3074; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3075; AVX-NEXT:    vmovaps %xmm0, 352(%rax)
3076; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3077; AVX-NEXT:    vmovaps %xmm0, 368(%rax)
3078; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3079; AVX-NEXT:    vmovaps %xmm0, 320(%rax)
3080; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3081; AVX-NEXT:    vmovaps %xmm0, 336(%rax)
3082; AVX-NEXT:    addq $120, %rsp
3083; AVX-NEXT:    vzeroupper
3084; AVX-NEXT:    retq
3085;
3086; AVX2-LABEL: store_i16_stride6_vf32:
3087; AVX2:       # %bb.0:
3088; AVX2-NEXT:    subq $616, %rsp # imm = 0x268
3089; AVX2-NEXT:    vmovdqa (%rcx), %xmm13
3090; AVX2-NEXT:    vmovdqa 32(%rcx), %xmm9
3091; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
3092; AVX2-NEXT:    vmovdqa (%rdx), %xmm1
3093; AVX2-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3094; AVX2-NEXT:    vmovdqa 32(%rdx), %xmm11
3095; AVX2-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
3096; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3097; AVX2-NEXT:    vpbroadcastq %xmm0, %ymm0
3098; AVX2-NEXT:    vmovdqa (%rsi), %xmm15
3099; AVX2-NEXT:    vmovdqa 32(%rsi), %xmm5
3100; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm15[0,1,2,1]
3101; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
3102; AVX2-NEXT:    vmovdqa (%rdi), %xmm2
3103; AVX2-NEXT:    vmovdqa %xmm2, (%rsp) # 16-byte Spill
3104; AVX2-NEXT:    vmovdqa 32(%rdi), %xmm14
3105; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
3106; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5]
3107; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
3108; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
3109; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
3110; AVX2-NEXT:    vmovdqa (%r8), %xmm1
3111; AVX2-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3112; AVX2-NEXT:    vmovdqa 32(%r8), %xmm7
3113; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,1,3,3,4,5,6,7]
3114; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
3115; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7]
3116; AVX2-NEXT:    vmovdqa (%r9), %xmm0
3117; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3118; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
3119; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
3120; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[0,0,2,1]
3121; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
3122; AVX2-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm1
3123; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3124; AVX2-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
3125; AVX2-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3126; AVX2-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
3127; AVX2-NEXT:    vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3128; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
3129; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm5[0,1,2,1]
3130; AVX2-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3131; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5]
3132; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm14[0,1,2,1]
3133; AVX2-NEXT:    vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3134; AVX2-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5]
3135; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
3136; AVX2-NEXT:    vmovdqa 32(%r9), %xmm4
3137; AVX2-NEXT:    vpbroadcastq %xmm1, %ymm1
3138; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
3139; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
3140; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm7[2,1,3,3,4,5,6,7]
3141; AVX2-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3142; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
3143; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7]
3144; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm4[0,2,2,3,4,5,6,7]
3145; AVX2-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3146; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4]
3147; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
3148; AVX2-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm1
3149; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3150; AVX2-NEXT:    vmovdqa 32(%rdx), %ymm2
3151; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3152; AVX2-NEXT:    vmovdqa 32(%rcx), %ymm1
3153; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3154; AVX2-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
3155; AVX2-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
3156; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
3157; AVX2-NEXT:    vmovdqa 32(%rsi), %ymm10
3158; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm10[2,1,2,3,6,5,6,7]
3159; AVX2-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3160; AVX2-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
3161; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm8
3162; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm8[2,1,2,3,6,5,6,7]
3163; AVX2-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3164; AVX2-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
3165; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
3166; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
3167; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3]
3168; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
3169; AVX2-NEXT:    vmovdqa 32(%r8), %ymm2
3170; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3171; AVX2-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
3172; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
3173; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7]
3174; AVX2-NEXT:    vmovdqa 32(%r9), %ymm2
3175; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3176; AVX2-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
3177; AVX2-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
3178; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
3179; AVX2-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm1
3180; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3181; AVX2-NEXT:    vmovdqa (%rdx), %ymm2
3182; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3183; AVX2-NEXT:    vmovdqa (%rcx), %ymm1
3184; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3185; AVX2-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
3186; AVX2-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
3187; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
3188; AVX2-NEXT:    vmovdqa (%rsi), %ymm2
3189; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3190; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,1,2,3,6,5,6,7]
3191; AVX2-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
3192; AVX2-NEXT:    vmovdqa (%rdi), %ymm12
3193; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm12[2,1,2,3,6,5,6,7]
3194; AVX2-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
3195; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
3196; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
3197; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3]
3198; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
3199; AVX2-NEXT:    vmovdqa (%r8), %ymm2
3200; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3201; AVX2-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
3202; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
3203; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7]
3204; AVX2-NEXT:    vmovdqa (%r9), %ymm2
3205; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3206; AVX2-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
3207; AVX2-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
3208; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
3209; AVX2-NEXT:    vpblendvb %ymm0, %ymm1, %ymm6, %ymm0
3210; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3211; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm5[4],xmm14[5],xmm5[5],xmm14[6],xmm5[6],xmm14[7],xmm5[7]
3212; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1]
3213; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7]
3214; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,2,3,3]
3215; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
3216; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
3217; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
3218; AVX2-NEXT:    vpshufb %xmm1, %xmm7, %xmm6
3219; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1]
3220; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm0[0],ymm6[1],ymm0[2,3],ymm6[4],ymm0[5,6],ymm6[7]
3221; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
3222; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,1,4,5,6,7]
3223; AVX2-NEXT:    vpermq {{.*#+}} ymm14 = ymm0[0,1,0,1]
3224; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
3225; AVX2-NEXT:    vpblendvb %ymm3, %ymm6, %ymm14, %ymm0
3226; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3227; AVX2-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
3228; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7]
3229; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[1,1,1,1]
3230; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
3231; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm14 = xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7]
3232; AVX2-NEXT:    vpshufd {{.*#+}} xmm14 = xmm14[1,2,3,3]
3233; AVX2-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1]
3234; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm14[0,1],ymm6[2],ymm14[3,4],ymm6[5],ymm14[6,7]
3235; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
3236; AVX2-NEXT:    vpshufb %xmm1, %xmm11, %xmm1
3237; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
3238; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2,3],ymm1[4],ymm6[5,6],ymm1[7]
3239; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
3240; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm7[2,3,2,3]
3241; AVX2-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,1,4,5,6,7]
3242; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1]
3243; AVX2-NEXT:    vpblendvb %ymm3, %ymm1, %ymm6, %ymm1
3244; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3245; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm8[4],ymm10[4],ymm8[5],ymm10[5],ymm8[6],ymm10[6],ymm8[7],ymm10[7],ymm8[12],ymm10[12],ymm8[13],ymm10[13],ymm8[14],ymm10[14],ymm8[15],ymm10[15]
3246; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3]
3247; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3248; AVX2-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload
3249; AVX2-NEXT:    # ymm6 = ymm2[4],mem[4],ymm2[5],mem[5],ymm2[6],mem[6],ymm2[7],mem[7],ymm2[12],mem[12],ymm2[13],mem[13],ymm2[14],mem[14],ymm2[15],mem[15]
3250; AVX2-NEXT:    vpshufd {{.*#+}} ymm6 = ymm6[1,2,3,3,5,6,7,7]
3251; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
3252; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2],ymm6[3,4],ymm1[5],ymm6[6,7]
3253; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
3254; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3255; AVX2-NEXT:    vpshufb %ymm6, %ymm2, %ymm14
3256; AVX2-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
3257; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3],ymm14[4],ymm1[5,6],ymm14[7]
3258; AVX2-NEXT:    vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
3259; AVX2-NEXT:    # ymm14 = mem[2,3,2,3,6,7,6,7]
3260; AVX2-NEXT:    vpshuflw {{.*#+}} ymm14 = ymm14[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
3261; AVX2-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
3262; AVX2-NEXT:    vpblendvb %ymm3, %ymm1, %ymm14, %ymm1
3263; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3264; AVX2-NEXT:    vmovdqa %ymm3, %ymm14
3265; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3266; AVX2-NEXT:    vmovdqa %ymm12, %ymm3
3267; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm12[4],ymm4[4],ymm12[5],ymm4[5],ymm12[6],ymm4[6],ymm12[7],ymm4[7],ymm12[12],ymm4[12],ymm12[13],ymm4[13],ymm12[14],ymm4[14],ymm12[15],ymm4[15]
3268; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3]
3269; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
3270; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3271; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm12 = ymm2[4],ymm5[4],ymm2[5],ymm5[5],ymm2[6],ymm5[6],ymm2[7],ymm5[7],ymm2[12],ymm5[12],ymm2[13],ymm5[13],ymm2[14],ymm5[14],ymm2[15],ymm5[15]
3272; AVX2-NEXT:    vpshufd {{.*#+}} ymm12 = ymm12[1,2,3,3,5,6,7,7]
3273; AVX2-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3]
3274; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm1[2],ymm12[3,4],ymm1[5],ymm12[6,7]
3275; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
3276; AVX2-NEXT:    vpshufb %ymm6, %ymm8, %ymm6
3277; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3]
3278; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3],ymm6[4],ymm1[5,6],ymm6[7]
3279; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
3280; AVX2-NEXT:    vpshufd {{.*#+}} ymm6 = ymm10[2,3,2,3,6,7,6,7]
3281; AVX2-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm6[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
3282; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3]
3283; AVX2-NEXT:    vpblendvb %ymm14, %ymm1, %ymm6, %ymm6
3284; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
3285; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3]
3286; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
3287; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,2]
3288; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
3289; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
3290; AVX2-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero
3291; AVX2-NEXT:    vpbroadcastq %xmm1, %ymm1
3292; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
3293; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm7[0,0,2,1,4,5,6,7]
3294; AVX2-NEXT:    vpbroadcastq %xmm0, %ymm12
3295; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
3296; AVX2-NEXT:    vpblendvb %ymm15, %ymm1, %ymm12, %ymm12
3297; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3298; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
3299; AVX2-NEXT:    # xmm1 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
3300; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3301; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
3302; AVX2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
3303; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
3304; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,2]
3305; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
3306; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
3307; AVX2-NEXT:    vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3308; AVX2-NEXT:    # xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
3309; AVX2-NEXT:    vpbroadcastq %xmm1, %ymm1
3310; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
3311; AVX2-NEXT:    vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3312; AVX2-NEXT:    # xmm1 = mem[0,0,2,1,4,5,6,7]
3313; AVX2-NEXT:    vpbroadcastq %xmm1, %ymm1
3314; AVX2-NEXT:    vpblendvb %ymm15, %ymm0, %ymm1, %ymm1
3315; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3316; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3317; AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11]
3318; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
3319; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm13 # 32-byte Folded Reload
3320; AVX2-NEXT:    # ymm13 = ymm7[0],mem[0],ymm7[1],mem[1],ymm7[2],mem[2],ymm7[3],mem[3],ymm7[8],mem[8],ymm7[9],mem[9],ymm7[10],mem[10],ymm7[11],mem[11]
3321; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
3322; AVX2-NEXT:    vpshufd {{.*#+}} ymm13 = ymm13[1,0,2,2,5,4,6,6]
3323; AVX2-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3]
3324; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3],ymm13[4],ymm0[5,6],ymm13[7]
3325; AVX2-NEXT:    vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
3326; AVX2-NEXT:    # ymm11 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
3327; AVX2-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2]
3328; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm11[2],ymm0[3,4],ymm11[5],ymm0[6,7]
3329; AVX2-NEXT:    vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
3330; AVX2-NEXT:    # ymm9 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
3331; AVX2-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2]
3332; AVX2-NEXT:    vpblendvb %ymm15, %ymm0, %ymm9, %ymm0
3333; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11]
3334; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm2[0],ymm5[0],ymm2[1],ymm5[1],ymm2[2],ymm5[2],ymm2[3],ymm5[3],ymm2[8],ymm5[8],ymm2[9],ymm5[9],ymm2[10],ymm5[10],ymm2[11],ymm5[11]
3335; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
3336; AVX2-NEXT:    vpshufd {{.*#+}} ymm7 = ymm7[1,0,2,2,5,4,6,6]
3337; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3]
3338; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3],ymm7[4],ymm4[5,6],ymm7[7]
3339; AVX2-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
3340; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2]
3341; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
3342; AVX2-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm10[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
3343; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2]
3344; AVX2-NEXT:    vpblendvb %ymm15, %ymm4, %ymm3, %ymm2
3345; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3346; AVX2-NEXT:    vmovdqa %ymm2, 96(%rax)
3347; AVX2-NEXT:    vmovdqa %ymm6, 160(%rax)
3348; AVX2-NEXT:    vmovdqa %ymm0, 288(%rax)
3349; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3350; AVX2-NEXT:    vmovaps %ymm0, 352(%rax)
3351; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3352; AVX2-NEXT:    vmovaps %ymm0, 64(%rax)
3353; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3354; AVX2-NEXT:    vmovaps %ymm0, 128(%rax)
3355; AVX2-NEXT:    vmovdqa %ymm1, 192(%rax)
3356; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3357; AVX2-NEXT:    vmovaps %ymm0, 256(%rax)
3358; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3359; AVX2-NEXT:    vmovaps %ymm0, 320(%rax)
3360; AVX2-NEXT:    vmovdqa %ymm12, (%rax)
3361; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3362; AVX2-NEXT:    vmovaps %ymm0, 224(%rax)
3363; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3364; AVX2-NEXT:    vmovaps %ymm0, 32(%rax)
3365; AVX2-NEXT:    addq $616, %rsp # imm = 0x268
3366; AVX2-NEXT:    vzeroupper
3367; AVX2-NEXT:    retq
3368;
3369; AVX2-FP-LABEL: store_i16_stride6_vf32:
3370; AVX2-FP:       # %bb.0:
3371; AVX2-FP-NEXT:    subq $648, %rsp # imm = 0x288
3372; AVX2-FP-NEXT:    vmovdqa (%rsi), %xmm1
3373; AVX2-FP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3374; AVX2-FP-NEXT:    vmovdqa 32(%rsi), %xmm8
3375; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
3376; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
3377; AVX2-FP-NEXT:    vmovdqa (%rdi), %xmm2
3378; AVX2-FP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3379; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %xmm5
3380; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
3381; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
3382; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
3383; AVX2-FP-NEXT:    vmovdqa (%rcx), %xmm2
3384; AVX2-FP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3385; AVX2-FP-NEXT:    vmovdqa 32(%rcx), %xmm6
3386; AVX2-FP-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
3387; AVX2-FP-NEXT:    vmovdqa (%rdx), %xmm3
3388; AVX2-FP-NEXT:    vmovdqa %xmm3, (%rsp) # 16-byte Spill
3389; AVX2-FP-NEXT:    vmovdqa 32(%rdx), %xmm7
3390; AVX2-FP-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
3391; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
3392; AVX2-FP-NEXT:    vpbroadcastq %xmm2, %ymm2
3393; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
3394; AVX2-FP-NEXT:    vmovdqa (%r8), %xmm2
3395; AVX2-FP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3396; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7]
3397; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
3398; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7]
3399; AVX2-FP-NEXT:    vmovdqa (%r9), %xmm3
3400; AVX2-FP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3401; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
3402; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
3403; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
3404; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm10 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
3405; AVX2-FP-NEXT:    vpblendvb %ymm10, %ymm2, %ymm3, %ymm2
3406; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3407; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm8, %xmm2
3408; AVX2-FP-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3409; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
3410; AVX2-FP-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3411; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
3412; AVX2-FP-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
3413; AVX2-FP-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3414; AVX2-FP-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
3415; AVX2-FP-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3416; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
3417; AVX2-FP-NEXT:    vmovdqa 32(%r8), %xmm9
3418; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
3419; AVX2-FP-NEXT:    vpbroadcastq %xmm2, %ymm2
3420; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7]
3421; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm9[2,1,3,3,4,5,6,7]
3422; AVX2-FP-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3423; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
3424; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7]
3425; AVX2-FP-NEXT:    vmovdqa 32(%r9), %xmm14
3426; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm14, %xmm1
3427; AVX2-FP-NEXT:    vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3428; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
3429; AVX2-FP-NEXT:    vpblendvb %ymm10, %ymm0, %ymm1, %ymm0
3430; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3431; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm11
3432; AVX2-FP-NEXT:    vmovdqa 32(%rsi), %ymm13
3433; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm0 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u]
3434; AVX2-FP-NEXT:    vpshufb %ymm0, %ymm13, %ymm1
3435; AVX2-FP-NEXT:    vpshufb %ymm0, %ymm11, %ymm2
3436; AVX2-FP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3437; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
3438; AVX2-FP-NEXT:    vmovdqa 32(%rdx), %ymm12
3439; AVX2-FP-NEXT:    vmovdqa 32(%rcx), %ymm2
3440; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3441; AVX2-FP-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
3442; AVX2-FP-NEXT:    vpsrldq {{.*#+}} ymm3 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
3443; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
3444; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
3445; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
3446; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
3447; AVX2-FP-NEXT:    vmovdqa 32(%r8), %ymm2
3448; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3449; AVX2-FP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
3450; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
3451; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7]
3452; AVX2-FP-NEXT:    vmovdqa 32(%r9), %ymm15
3453; AVX2-FP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
3454; AVX2-FP-NEXT:    # ymm4 = mem[0,1,0,1]
3455; AVX2-FP-NEXT:    vpshufb %ymm4, %ymm15, %ymm2
3456; AVX2-FP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3457; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
3458; AVX2-FP-NEXT:    vpblendvb %ymm10, %ymm1, %ymm2, %ymm1
3459; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3460; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm2
3461; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3462; AVX2-FP-NEXT:    vmovdqa (%rsi), %ymm1
3463; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3464; AVX2-FP-NEXT:    vpshufb %ymm0, %ymm1, %ymm1
3465; AVX2-FP-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
3466; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
3467; AVX2-FP-NEXT:    vmovdqa (%rdx), %ymm3
3468; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3469; AVX2-FP-NEXT:    vmovdqa (%rcx), %ymm1
3470; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3471; AVX2-FP-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
3472; AVX2-FP-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
3473; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
3474; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
3475; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
3476; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
3477; AVX2-FP-NEXT:    vmovdqa (%r8), %ymm1
3478; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3479; AVX2-FP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
3480; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
3481; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7]
3482; AVX2-FP-NEXT:    vmovdqa (%r9), %ymm1
3483; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3484; AVX2-FP-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
3485; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
3486; AVX2-FP-NEXT:    vpblendvb %ymm10, %ymm0, %ymm1, %ymm0
3487; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3488; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
3489; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1]
3490; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
3491; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,2,3,3]
3492; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
3493; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
3494; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm1 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
3495; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm9, %xmm4
3496; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1]
3497; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3],ymm4[4],ymm0[5,6],ymm4[7]
3498; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm7 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
3499; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm14, %xmm4
3500; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm14 = ymm4[0,1,0,1]
3501; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm9 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
3502; AVX2-FP-NEXT:    vpblendvb %ymm9, %ymm0, %ymm14, %ymm0
3503; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3504; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
3505; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
3506; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
3507; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1]
3508; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3509; AVX2-FP-NEXT:    vmovdqa (%rsp), %xmm2 # 16-byte Reload
3510; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm14 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
3511; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm14 = xmm14[1,2,3,3]
3512; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1]
3513; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7]
3514; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
3515; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
3516; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
3517; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
3518; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3519; AVX2-FP-NEXT:    vpshufb %xmm7, %xmm8, %xmm1
3520; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
3521; AVX2-FP-NEXT:    vpblendvb %ymm9, %ymm0, %ymm1, %ymm0
3522; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3523; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm11[4],ymm13[4],ymm11[5],ymm13[5],ymm11[6],ymm13[6],ymm11[7],ymm13[7],ymm11[12],ymm13[12],ymm11[13],ymm13[13],ymm11[14],ymm13[14],ymm11[15],ymm13[15]
3524; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3]
3525; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
3526; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm12[4],ymm10[4],ymm12[5],ymm10[5],ymm12[6],ymm10[6],ymm12[7],ymm10[7],ymm12[12],ymm10[12],ymm12[13],ymm10[13],ymm12[14],ymm10[14],ymm12[15],ymm10[15]
3527; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7]
3528; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
3529; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
3530; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
3531; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
3532; AVX2-FP-NEXT:    vpshufb %ymm1, %ymm11, %ymm7
3533; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3]
3534; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3],ymm7[4],ymm0[5,6],ymm7[7]
3535; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm14 = ymm15[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31]
3536; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
3537; AVX2-FP-NEXT:    vpblendvb %ymm9, %ymm0, %ymm14, %ymm0
3538; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3539; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
3540; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
3541; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm7[4],ymm9[4],ymm7[5],ymm9[5],ymm7[6],ymm9[6],ymm7[7],ymm9[7],ymm7[12],ymm9[12],ymm7[13],ymm9[13],ymm7[14],ymm9[14],ymm7[15],ymm9[15]
3542; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3]
3543; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
3544; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
3545; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm15 = ymm15[4],ymm14[4],ymm15[5],ymm14[5],ymm15[6],ymm14[6],ymm15[7],ymm14[7],ymm15[12],ymm14[12],ymm15[13],ymm14[13],ymm15[14],ymm14[14],ymm15[15],ymm14[15]
3546; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm15[1,2,3,3,5,6,7,7]
3547; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3]
3548; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7]
3549; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
3550; AVX2-FP-NEXT:    vpshufb %ymm1, %ymm15, %ymm1
3551; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
3552; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
3553; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3554; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31]
3555; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
3556; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm15 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
3557; AVX2-FP-NEXT:    vpblendvb %ymm15, %ymm0, %ymm1, %ymm0
3558; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3559; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
3560; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
3561; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
3562; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,2]
3563; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
3564; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
3565; AVX2-FP-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
3566; AVX2-FP-NEXT:    vpbroadcastq %xmm1, %ymm1
3567; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
3568; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm8[0,0,2,1,4,5,6,7]
3569; AVX2-FP-NEXT:    vpbroadcastq %xmm0, %ymm4
3570; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
3571; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm1, %ymm4, %ymm4
3572; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3573; AVX2-FP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
3574; AVX2-FP-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
3575; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3576; AVX2-FP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm15 # 16-byte Folded Reload
3577; AVX2-FP-NEXT:    # xmm15 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
3578; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
3579; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm15 = xmm15[1,0,2,2]
3580; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1]
3581; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3],ymm15[4],ymm1[5,6],ymm15[7]
3582; AVX2-FP-NEXT:    vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
3583; AVX2-FP-NEXT:    # xmm15 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
3584; AVX2-FP-NEXT:    vpbroadcastq %xmm15, %ymm15
3585; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm15[2],ymm1[3,4],ymm15[5],ymm1[6,7]
3586; AVX2-FP-NEXT:    vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
3587; AVX2-FP-NEXT:    # xmm15 = mem[0,0,2,1,4,5,6,7]
3588; AVX2-FP-NEXT:    vpbroadcastq %xmm15, %ymm15
3589; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm1, %ymm15, %ymm1
3590; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3591; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm15 = ymm2[0],ymm13[0],ymm2[1],ymm13[1],ymm2[2],ymm13[2],ymm2[3],ymm13[3],ymm2[8],ymm13[8],ymm2[9],ymm13[9],ymm2[10],ymm13[10],ymm2[11],ymm13[11]
3592; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm12 = ymm12[0],ymm10[0],ymm12[1],ymm10[1],ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[8],ymm10[8],ymm12[9],ymm10[9],ymm12[10],ymm10[10],ymm12[11],ymm10[11]
3593; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm13 = ymm15[2,2,2,3]
3594; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm12[1,0,2,2,5,4,6,6]
3595; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3]
3596; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2,3],ymm12[4],ymm13[5,6],ymm12[7]
3597; AVX2-FP-NEXT:    vpshuflw {{.*#+}} ymm11 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
3598; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2]
3599; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7]
3600; AVX2-FP-NEXT:    vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
3601; AVX2-FP-NEXT:    # ymm10 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
3602; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2]
3603; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm11, %ymm10, %ymm10
3604; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm7[0],ymm9[0],ymm7[1],ymm9[1],ymm7[2],ymm9[2],ymm7[3],ymm9[3],ymm7[8],ymm9[8],ymm7[9],ymm9[9],ymm7[10],ymm9[10],ymm7[11],ymm9[11]
3605; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3606; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm2[0],ymm14[0],ymm2[1],ymm14[1],ymm2[2],ymm14[2],ymm2[3],ymm14[3],ymm2[8],ymm14[8],ymm2[9],ymm14[9],ymm2[10],ymm14[10],ymm2[11],ymm14[11]
3607; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm8[2,2,2,3]
3608; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[1,0,2,2,5,4,6,6]
3609; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3]
3610; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6],ymm5[7]
3611; AVX2-FP-NEXT:    vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
3612; AVX2-FP-NEXT:    # ymm3 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
3613; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2]
3614; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7]
3615; AVX2-FP-NEXT:    vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
3616; AVX2-FP-NEXT:    # ymm2 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
3617; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
3618; AVX2-FP-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm0
3619; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3620; AVX2-FP-NEXT:    vmovdqa %ymm0, 96(%rax)
3621; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3622; AVX2-FP-NEXT:    vmovaps %ymm0, 160(%rax)
3623; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3624; AVX2-FP-NEXT:    vmovaps %ymm0, 128(%rax)
3625; AVX2-FP-NEXT:    vmovdqa %ymm10, 288(%rax)
3626; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3627; AVX2-FP-NEXT:    vmovaps %ymm0, 352(%rax)
3628; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3629; AVX2-FP-NEXT:    vmovaps %ymm0, 320(%rax)
3630; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3631; AVX2-FP-NEXT:    vmovaps %ymm0, 64(%rax)
3632; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3633; AVX2-FP-NEXT:    vmovaps %ymm0, 224(%rax)
3634; AVX2-FP-NEXT:    vmovdqa %ymm1, 192(%rax)
3635; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3636; AVX2-FP-NEXT:    vmovaps %ymm0, 256(%rax)
3637; AVX2-FP-NEXT:    vmovdqa %ymm4, (%rax)
3638; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3639; AVX2-FP-NEXT:    vmovaps %ymm0, 32(%rax)
3640; AVX2-FP-NEXT:    addq $648, %rsp # imm = 0x288
3641; AVX2-FP-NEXT:    vzeroupper
3642; AVX2-FP-NEXT:    retq
3643;
3644; AVX2-FCP-LABEL: store_i16_stride6_vf32:
3645; AVX2-FCP:       # %bb.0:
3646; AVX2-FCP-NEXT:    subq $648, %rsp # imm = 0x288
3647; AVX2-FCP-NEXT:    vmovdqa (%rsi), %xmm1
3648; AVX2-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3649; AVX2-FCP-NEXT:    vmovdqa 32(%rsi), %xmm14
3650; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
3651; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
3652; AVX2-FCP-NEXT:    vmovdqa (%rdi), %xmm2
3653; AVX2-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3654; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %xmm12
3655; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
3656; AVX2-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
3657; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
3658; AVX2-FCP-NEXT:    vmovdqa (%rcx), %xmm2
3659; AVX2-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3660; AVX2-FCP-NEXT:    vmovdqa 32(%rcx), %xmm6
3661; AVX2-FCP-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
3662; AVX2-FCP-NEXT:    vmovdqa (%rdx), %xmm3
3663; AVX2-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3664; AVX2-FCP-NEXT:    vmovdqa 32(%rdx), %xmm7
3665; AVX2-FCP-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
3666; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
3667; AVX2-FCP-NEXT:    vpbroadcastq %xmm2, %ymm2
3668; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
3669; AVX2-FCP-NEXT:    vmovdqa (%r8), %xmm2
3670; AVX2-FCP-NEXT:    vmovdqa %xmm2, (%rsp) # 16-byte Spill
3671; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7]
3672; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
3673; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7]
3674; AVX2-FCP-NEXT:    vmovdqa (%r9), %xmm3
3675; AVX2-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3676; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
3677; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
3678; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
3679; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm13 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
3680; AVX2-FCP-NEXT:    vpblendvb %ymm13, %ymm1, %ymm3, %ymm1
3681; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3682; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm14, %xmm1
3683; AVX2-FCP-NEXT:    vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3684; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm12, %xmm0
3685; AVX2-FCP-NEXT:    vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3686; AVX2-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
3687; AVX2-FCP-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
3688; AVX2-FCP-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3689; AVX2-FCP-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
3690; AVX2-FCP-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3691; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
3692; AVX2-FCP-NEXT:    vmovdqa 32(%r8), %xmm8
3693; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
3694; AVX2-FCP-NEXT:    vpbroadcastq %xmm1, %ymm1
3695; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
3696; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm8[2,1,3,3,4,5,6,7]
3697; AVX2-FCP-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3698; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
3699; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7]
3700; AVX2-FCP-NEXT:    vmovdqa 32(%r9), %xmm9
3701; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm9, %xmm1
3702; AVX2-FCP-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3703; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
3704; AVX2-FCP-NEXT:    vpblendvb %ymm13, %ymm0, %ymm1, %ymm0
3705; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3706; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
3707; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3708; AVX2-FCP-NEXT:    vmovdqa 32(%rsi), %ymm1
3709; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3710; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u]
3711; AVX2-FCP-NEXT:    vpshufb %ymm0, %ymm1, %ymm1
3712; AVX2-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm2
3713; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
3714; AVX2-FCP-NEXT:    vmovdqa 32(%rdx), %ymm10
3715; AVX2-FCP-NEXT:    vmovdqa 32(%rcx), %ymm4
3716; AVX2-FCP-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
3717; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3718; AVX2-FCP-NEXT:    vpsrldq {{.*#+}} ymm3 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
3719; AVX2-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3720; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
3721; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
3722; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
3723; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
3724; AVX2-FCP-NEXT:    vmovdqa 32(%r8), %ymm2
3725; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3726; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
3727; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
3728; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7]
3729; AVX2-FCP-NEXT:    vmovdqa 32(%r9), %ymm3
3730; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3731; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
3732; AVX2-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
3733; AVX2-FCP-NEXT:    vpshufb %ymm2, %ymm3, %ymm3
3734; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
3735; AVX2-FCP-NEXT:    vpblendvb %ymm13, %ymm1, %ymm3, %ymm1
3736; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3737; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm3
3738; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3739; AVX2-FCP-NEXT:    vmovdqa (%rsi), %ymm1
3740; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3741; AVX2-FCP-NEXT:    vpshufb %ymm0, %ymm1, %ymm1
3742; AVX2-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
3743; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
3744; AVX2-FCP-NEXT:    vmovdqa (%rdx), %ymm5
3745; AVX2-FCP-NEXT:    vmovdqa (%rcx), %ymm1
3746; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3747; AVX2-FCP-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
3748; AVX2-FCP-NEXT:    vpsrldq {{.*#+}} ymm3 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
3749; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11]
3750; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
3751; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
3752; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
3753; AVX2-FCP-NEXT:    vmovdqa (%r8), %ymm1
3754; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3755; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
3756; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
3757; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7]
3758; AVX2-FCP-NEXT:    vmovdqa (%r9), %ymm1
3759; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3760; AVX2-FCP-NEXT:    vpshufb %ymm2, %ymm1, %ymm2
3761; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
3762; AVX2-FCP-NEXT:    vpblendvb %ymm13, %ymm0, %ymm2, %ymm0
3763; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3764; AVX2-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
3765; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [1,2,1,2,0,0,3,3]
3766; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm2, %ymm0
3767; AVX2-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7]
3768; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[1,1,1,1]
3769; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7]
3770; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
3771; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm8, %xmm13
3772; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1]
3773; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm0[0],ymm13[1],ymm0[2,3],ymm13[4],ymm0[5,6],ymm13[7]
3774; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
3775; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm9, %xmm1
3776; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm1[0,1,0,1]
3777; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
3778; AVX2-FCP-NEXT:    vpblendvb %ymm8, %ymm13, %ymm0, %ymm0
3779; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3780; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
3781; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3782; AVX2-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7]
3783; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm2, %ymm0
3784; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
3785; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3786; AVX2-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
3787; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1]
3788; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7]
3789; AVX2-FCP-NEXT:    vmovdqa (%rsp), %xmm12 # 16-byte Reload
3790; AVX2-FCP-NEXT:    vpshufb %xmm6, %xmm12, %xmm2
3791; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
3792; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7]
3793; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
3794; AVX2-FCP-NEXT:    vpshufb %xmm3, %xmm11, %xmm2
3795; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
3796; AVX2-FCP-NEXT:    vpblendvb %ymm8, %ymm0, %ymm2, %ymm0
3797; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3798; AVX2-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm10[4],ymm4[4],ymm10[5],ymm4[5],ymm10[6],ymm4[6],ymm10[7],ymm4[7],ymm10[12],ymm4[12],ymm10[13],ymm4[13],ymm10[14],ymm4[14],ymm10[15],ymm4[15]
3799; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [5,6,5,6,5,6,7,7]
3800; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm2, %ymm0
3801; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3802; AVX2-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
3803; AVX2-FCP-NEXT:    # ymm3 = ymm3[4],mem[4],ymm3[5],mem[5],ymm3[6],mem[6],ymm3[7],mem[7],ymm3[12],mem[12],ymm3[13],mem[13],ymm3[14],mem[14],ymm3[15],mem[15]
3804; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[3,3,3,3]
3805; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7]
3806; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
3807; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3808; AVX2-FCP-NEXT:    vpshufb %ymm10, %ymm4, %ymm6
3809; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3]
3810; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3],ymm6[4],ymm0[5,6],ymm6[7]
3811; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31]
3812; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3813; AVX2-FCP-NEXT:    vpshufb %ymm3, %ymm4, %ymm13
3814; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3]
3815; AVX2-FCP-NEXT:    vpblendvb %ymm8, %ymm0, %ymm13, %ymm0
3816; AVX2-FCP-NEXT:    vmovdqa %ymm8, %ymm13
3817; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3818; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
3819; AVX2-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15]
3820; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm2, %ymm0
3821; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
3822; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3823; AVX2-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm8[4],ymm4[5],ymm8[5],ymm4[6],ymm8[6],ymm4[7],ymm8[7],ymm4[12],ymm8[12],ymm4[13],ymm8[13],ymm4[14],ymm8[14],ymm4[15],ymm8[15]
3824; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[3,3,3,3]
3825; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7]
3826; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
3827; AVX2-FCP-NEXT:    vpshufb %ymm10, %ymm9, %ymm2
3828; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3]
3829; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7]
3830; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
3831; AVX2-FCP-NEXT:    vpshufb %ymm3, %ymm10, %ymm2
3832; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3]
3833; AVX2-FCP-NEXT:    vpblendvb %ymm13, %ymm0, %ymm2, %ymm13
3834; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
3835; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3]
3836; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [1,0,2,2,1,0,2,2]
3837; AVX2-FCP-NEXT:    # ymm3 = mem[0,1,0,1]
3838; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm3, %ymm0
3839; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
3840; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
3841; AVX2-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero
3842; AVX2-FCP-NEXT:    vpbroadcastq %xmm1, %ymm1
3843; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
3844; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm11[0,0,2,1,4,5,6,7]
3845; AVX2-FCP-NEXT:    vpbroadcastq %xmm0, %ymm2
3846; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
3847; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm2
3848; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3849; AVX2-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
3850; AVX2-FCP-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
3851; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm3, %ymm1
3852; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3853; AVX2-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
3854; AVX2-FCP-NEXT:    # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3]
3855; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
3856; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7]
3857; AVX2-FCP-NEXT:    vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
3858; AVX2-FCP-NEXT:    # xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
3859; AVX2-FCP-NEXT:    vpbroadcastq %xmm3, %ymm3
3860; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7]
3861; AVX2-FCP-NEXT:    vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
3862; AVX2-FCP-NEXT:    # xmm3 = mem[0,0,2,1,4,5,6,7]
3863; AVX2-FCP-NEXT:    vpbroadcastq %xmm3, %ymm3
3864; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm1, %ymm3, %ymm1
3865; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3866; AVX2-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
3867; AVX2-FCP-NEXT:    # ymm3 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[2],mem[2],ymm3[3],mem[3],ymm3[8],mem[8],ymm3[9],mem[9],ymm3[10],mem[10],ymm3[11],mem[11]
3868; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
3869; AVX2-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm14 # 32-byte Folded Reload
3870; AVX2-FCP-NEXT:    # ymm14 = ymm7[0],mem[0],ymm7[1],mem[1],ymm7[2],mem[2],ymm7[3],mem[3],ymm7[8],mem[8],ymm7[9],mem[9],ymm7[10],mem[10],ymm7[11],mem[11]
3871; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm15 = [5,4,2,2,5,4,6,6]
3872; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm15, %ymm3
3873; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3]
3874; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm14[0],ymm3[1],ymm14[2,3],ymm3[4],ymm14[5,6],ymm3[7]
3875; AVX2-FCP-NEXT:    vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
3876; AVX2-FCP-NEXT:    # ymm12 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
3877; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2]
3878; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm12[2],ymm3[3,4],ymm12[5],ymm3[6,7]
3879; AVX2-FCP-NEXT:    vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
3880; AVX2-FCP-NEXT:    # ymm11 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
3881; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2]
3882; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm3, %ymm11, %ymm3
3883; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11]
3884; AVX2-FCP-NEXT:    vpermd %ymm7, %ymm15, %ymm7
3885; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm4[0],ymm8[0],ymm4[1],ymm8[1],ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[8],ymm8[8],ymm4[9],ymm8[9],ymm4[10],ymm8[10],ymm4[11],ymm8[11]
3886; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3]
3887; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6],ymm7[7]
3888; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
3889; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2]
3890; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7]
3891; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm10[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
3892; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2]
3893; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm5, %ymm4, %ymm0
3894; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3895; AVX2-FCP-NEXT:    vmovdqa %ymm0, 96(%rax)
3896; AVX2-FCP-NEXT:    vmovdqa %ymm13, 160(%rax)
3897; AVX2-FCP-NEXT:    vmovdqa %ymm3, 288(%rax)
3898; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3899; AVX2-FCP-NEXT:    vmovaps %ymm0, 352(%rax)
3900; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3901; AVX2-FCP-NEXT:    vmovaps %ymm0, 64(%rax)
3902; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3903; AVX2-FCP-NEXT:    vmovaps %ymm0, 128(%rax)
3904; AVX2-FCP-NEXT:    vmovdqa %ymm1, 192(%rax)
3905; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3906; AVX2-FCP-NEXT:    vmovaps %ymm0, 256(%rax)
3907; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3908; AVX2-FCP-NEXT:    vmovaps %ymm0, 320(%rax)
3909; AVX2-FCP-NEXT:    vmovdqa %ymm2, (%rax)
3910; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3911; AVX2-FCP-NEXT:    vmovaps %ymm0, 224(%rax)
3912; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3913; AVX2-FCP-NEXT:    vmovaps %ymm0, 32(%rax)
3914; AVX2-FCP-NEXT:    addq $648, %rsp # imm = 0x288
3915; AVX2-FCP-NEXT:    vzeroupper
3916; AVX2-FCP-NEXT:    retq
3917;
3918; AVX512-LABEL: store_i16_stride6_vf32:
3919; AVX512:       # %bb.0:
3920; AVX512-NEXT:    vmovdqa 32(%rcx), %ymm4
3921; AVX512-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
3922; AVX512-NEXT:    vmovdqa 32(%rdx), %ymm8
3923; AVX512-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
3924; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
3925; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
3926; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm8[4],ymm4[4],ymm8[5],ymm4[5],ymm8[6],ymm4[6],ymm8[7],ymm4[7],ymm8[12],ymm4[12],ymm8[13],ymm4[13],ymm8[14],ymm4[14],ymm8[15],ymm4[15]
3927; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7]
3928; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
3929; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
3930; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm9
3931; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm9[2,1,2,3,6,5,6,7]
3932; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
3933; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm11
3934; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm11[2,1,2,3,6,5,6,7]
3935; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
3936; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
3937; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
3938; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm11[4],ymm9[4],ymm11[5],ymm9[5],ymm11[6],ymm9[6],ymm11[7],ymm9[7],ymm11[12],ymm9[12],ymm11[13],ymm9[13],ymm11[14],ymm9[14],ymm11[15],ymm9[15]
3939; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[3,3,3,3]
3940; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm2
3941; AVX512-NEXT:    movw $18724, %ax # imm = 0x4924
3942; AVX512-NEXT:    kmovw %eax, %k1
3943; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
3944; AVX512-NEXT:    vextracti64x4 $1, %zmm2, %ymm0
3945; AVX512-NEXT:    vmovdqa 32(%r8), %ymm12
3946; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
3947; AVX512-NEXT:    vpshufb %ymm1, %ymm12, %ymm3
3948; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3]
3949; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7]
3950; AVX512-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm12[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
3951; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
3952; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7]
3953; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm16
3954; AVX512-NEXT:    vmovdqa (%rcx), %ymm2
3955; AVX512-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
3956; AVX512-NEXT:    vmovdqa (%rdx), %ymm3
3957; AVX512-NEXT:    vpsrldq {{.*#+}} ymm5 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
3958; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11]
3959; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
3960; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
3961; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[1,2,3,3,5,6,7,7]
3962; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
3963; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm5
3964; AVX512-NEXT:    vmovdqa (%rsi), %ymm0
3965; AVX512-NEXT:    vpshufd {{.*#+}} ymm6 = ymm0[2,1,2,3,6,5,6,7]
3966; AVX512-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm6[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
3967; AVX512-NEXT:    vmovdqa (%rdi), %ymm10
3968; AVX512-NEXT:    vpshufd {{.*#+}} ymm7 = ymm10[2,1,2,3,6,5,6,7]
3969; AVX512-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm7[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
3970; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11]
3971; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3]
3972; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm7 = ymm10[4],ymm0[4],ymm10[5],ymm0[5],ymm10[6],ymm0[6],ymm10[7],ymm0[7],ymm10[12],ymm0[12],ymm10[13],ymm0[13],ymm10[14],ymm0[14],ymm10[15],ymm0[15]
3973; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[3,3,3,3]
3974; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm6
3975; AVX512-NEXT:    vmovdqa32 %zmm5, %zmm6 {%k1}
3976; AVX512-NEXT:    vextracti64x4 $1, %zmm6, %ymm5
3977; AVX512-NEXT:    vmovdqa (%r8), %ymm14
3978; AVX512-NEXT:    vpshufb %ymm1, %ymm14, %ymm1
3979; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
3980; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2,3],ymm1[4],ymm5[5,6],ymm1[7]
3981; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm14[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
3982; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
3983; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0],ymm6[1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7]
3984; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm5, %zmm17
3985; AVX512-NEXT:    vmovdqa (%rcx), %xmm5
3986; AVX512-NEXT:    vmovdqa 32(%rcx), %xmm13
3987; AVX512-NEXT:    vmovdqa (%rdx), %xmm6
3988; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
3989; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
3990; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6]
3991; AVX512-NEXT:    vpermt2d %zmm1, %zmm18, %zmm2
3992; AVX512-NEXT:    vmovdqa (%rsi), %xmm3
3993; AVX512-NEXT:    vmovdqa (%rdi), %xmm7
3994; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
3995; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[1,1,1,1]
3996; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[1],ymm0[1],ymm10[2],ymm0[2],ymm10[3],ymm0[3],ymm10[8],ymm0[8],ymm10[9],ymm0[9],ymm10[10],ymm0[10],ymm10[11],ymm0[11]
3997; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
3998; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
3999; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
4000; AVX512-NEXT:    vextracti64x4 $1, %zmm2, %ymm0
4001; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
4002; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
4003; AVX512-NEXT:    vpblendd {{.*#+}} ymm14 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
4004; AVX512-NEXT:    vmovdqa (%r8), %xmm10
4005; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
4006; AVX512-NEXT:    vpshufb %xmm1, %xmm10, %xmm0
4007; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
4008; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7]
4009; AVX512-NEXT:    vmovdqa 32(%rdx), %xmm0
4010; AVX512-NEXT:    vinserti64x4 $1, %ymm14, %zmm2, %zmm2
4011; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm14
4012; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm15
4013; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm8[0],ymm4[0],ymm8[1],ymm4[1],ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[8],ymm4[8],ymm8[9],ymm4[9],ymm8[10],ymm4[10],ymm8[11],ymm4[11]
4014; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7]
4015; AVX512-NEXT:    vpermt2d %zmm8, %zmm18, %zmm4
4016; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
4017; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[1,1,1,1]
4018; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm11[0],ymm9[0],ymm11[1],ymm9[1],ymm11[2],ymm9[2],ymm11[3],ymm9[3],ymm11[8],ymm9[8],ymm11[9],ymm9[9],ymm11[10],ymm9[10],ymm11[11],ymm9[11]
4019; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
4020; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm8, %zmm8
4021; AVX512-NEXT:    vmovdqa32 %zmm8, %zmm4 {%k1}
4022; AVX512-NEXT:    vextracti64x4 $1, %zmm4, %ymm8
4023; AVX512-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm12[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
4024; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2]
4025; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7]
4026; AVX512-NEXT:    vmovdqa 32(%r8), %xmm12
4027; AVX512-NEXT:    vpshufb %xmm1, %xmm12, %xmm1
4028; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
4029; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5,6],ymm1[7]
4030; AVX512-NEXT:    vinserti64x4 $1, %ymm8, %zmm1, %zmm4
4031; AVX512-NEXT:    vmovdqa 32(%r9), %ymm11
4032; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm11[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
4033; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
4034; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm1[2,2,2,3]
4035; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm11[2,3,2,3,6,7,6,7]
4036; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
4037; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm1[2,1,2,3]
4038; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
4039; AVX512-NEXT:    vpsrldq {{.*#+}} xmm13 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
4040; AVX512-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
4041; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
4042; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17]
4043; AVX512-NEXT:    vpermt2d %zmm0, %zmm18, %zmm1
4044; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm14[0,1,2,1]
4045; AVX512-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
4046; AVX512-NEXT:    vpshufd {{.*#+}} xmm13 = xmm15[0,1,2,1]
4047; AVX512-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,6,5]
4048; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
4049; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
4050; AVX512-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1]
4051; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm13[0,1,2,3],zmm0[0,1,0,1]
4052; AVX512-NEXT:    movw $9362, %ax # imm = 0x2492
4053; AVX512-NEXT:    kmovw %eax, %k1
4054; AVX512-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
4055; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
4056; AVX512-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm12[2,1,3,3,4,5,6,7]
4057; AVX512-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1]
4058; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm13[0],ymm1[1,2],ymm13[3],ymm1[4,5],ymm13[6],ymm1[7]
4059; AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm12 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero
4060; AVX512-NEXT:    vpbroadcastq %xmm12, %ymm12
4061; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2],ymm0[3,4],ymm12[5],ymm0[6,7]
4062; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm12
4063; AVX512-NEXT:    vmovdqa (%r9), %ymm0
4064; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
4065; AVX512-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
4066; AVX512-NEXT:    vpermq {{.*#+}} ymm13 = ymm1[2,2,2,3]
4067; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[2,3,2,3,6,7,6,7]
4068; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
4069; AVX512-NEXT:    vpermq {{.*#+}} ymm14 = ymm1[2,1,2,3]
4070; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
4071; AVX512-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
4072; AVX512-NEXT:    vpsrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
4073; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
4074; AVX512-NEXT:    vpermt2d %zmm5, %zmm18, %zmm1
4075; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm3[0,1,2,1]
4076; AVX512-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,5]
4077; AVX512-NEXT:    vpshufd {{.*#+}} xmm6 = xmm7[0,1,2,1]
4078; AVX512-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,5]
4079; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
4080; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
4081; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
4082; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm5[0,1,0,1]
4083; AVX512-NEXT:    vmovdqa32 %zmm1, %zmm3 {%k1}
4084; AVX512-NEXT:    vextracti64x4 $1, %zmm3, %ymm1
4085; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm10[2,1,3,3,4,5,6,7]
4086; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1]
4087; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0],ymm1[1,2],ymm5[3],ymm1[4,5],ymm5[6],ymm1[7]
4088; AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm5 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero
4089; AVX512-NEXT:    vpbroadcastq %xmm5, %ymm5
4090; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7]
4091; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
4092; AVX512-NEXT:    vmovdqa (%r9), %xmm3
4093; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm3[2,3,2,3]
4094; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,1,4,5,6,7]
4095; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1]
4096; AVX512-NEXT:    vmovdqa 32(%r9), %xmm6
4097; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
4098; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
4099; AVX512-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[2,3,2,3]
4100; AVX512-NEXT:    vpshuflw {{.*#+}} xmm7 = xmm7[0,2,2,1,4,5,6,7]
4101; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1]
4102; AVX512-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm11[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
4103; AVX512-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2]
4104; AVX512-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm6[0,0,2,1,4,5,6,7]
4105; AVX512-NEXT:    vpbroadcastq %xmm11, %ymm11
4106; AVX512-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7]
4107; AVX512-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4]
4108; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1]
4109; AVX512-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm3[0,0,2,1,4,5,6,7]
4110; AVX512-NEXT:    vpbroadcastq %xmm15, %ymm15
4111; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
4112; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4]
4113; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
4114; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
4115; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm8, %zmm8
4116; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
4117; AVX512-NEXT:    vpternlogd {{.*#+}} zmm8 = zmm8 ^ (zmm9 & (zmm8 ^ zmm16))
4118; AVX512-NEXT:    vinserti64x4 $1, %ymm14, %zmm13, %zmm13
4119; AVX512-NEXT:    vpternlogd {{.*#+}} zmm13 = zmm13 ^ (zmm9 & (zmm13 ^ zmm17))
4120; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm5, %zmm0
4121; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
4122; AVX512-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (zmm5 & (zmm0 ^ zmm2))
4123; AVX512-NEXT:    vinserti64x4 $1, %ymm10, %zmm7, %zmm2
4124; AVX512-NEXT:    vpternlogd {{.*#+}} zmm2 = zmm2 ^ (zmm5 & (zmm2 ^ zmm4))
4125; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm11, %zmm4
4126; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
4127; AVX512-NEXT:    vpternlogd {{.*#+}} zmm4 = zmm4 ^ (zmm5 & (zmm4 ^ zmm12))
4128; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm15, %zmm3
4129; AVX512-NEXT:    vpternlogd {{.*#+}} zmm3 = zmm3 ^ (zmm5 & (zmm3 ^ zmm1))
4130; AVX512-NEXT:    vmovdqa64 %zmm3, (%rax)
4131; AVX512-NEXT:    vmovdqa64 %zmm4, 192(%rax)
4132; AVX512-NEXT:    vmovdqa64 %zmm2, 256(%rax)
4133; AVX512-NEXT:    vmovdqa64 %zmm0, 64(%rax)
4134; AVX512-NEXT:    vmovdqa64 %zmm13, 128(%rax)
4135; AVX512-NEXT:    vmovdqa64 %zmm8, 320(%rax)
4136; AVX512-NEXT:    vzeroupper
4137; AVX512-NEXT:    retq
4138;
4139; AVX512-FCP-LABEL: store_i16_stride6_vf32:
4140; AVX512-FCP:       # %bb.0:
4141; AVX512-FCP-NEXT:    pushq %rax
4142; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm2
4143; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u]
4144; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm1
4145; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm4
4146; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm4, %ymm3
4147; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11]
4148; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15]
4149; AVX512-FCP-NEXT:    vmovdqa64 %ymm4, %ymm23
4150; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm24
4151; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm6 = [2,1,2,3,11,11,11,11]
4152; AVX512-FCP-NEXT:    vpermt2q %zmm3, %zmm6, %zmm1
4153; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %ymm11
4154; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm13
4155; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm13[4],ymm11[4],ymm13[5],ymm11[5],ymm13[6],ymm11[6],ymm13[7],ymm11[7],ymm13[12],ymm11[12],ymm13[13],ymm11[13],ymm13[14],ymm11[14],ymm13[15],ymm11[15]
4156; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [5,6,5,6,5,6,7,7]
4157; AVX512-FCP-NEXT:    vpermd %ymm3, %ymm10, %ymm3
4158; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm4 = ymm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm11[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
4159; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm5 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
4160; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11]
4161; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2]
4162; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
4163; AVX512-FCP-NEXT:    movw $18724, %ax # imm = 0x4924
4164; AVX512-FCP-NEXT:    kmovw %eax, %k1
4165; AVX512-FCP-NEXT:    vmovdqa32 %zmm3, %zmm1 {%k1}
4166; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %ymm2
4167; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
4168; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm12 = [12,1,2,13,4,5,14,7]
4169; AVX512-FCP-NEXT:    vmovdqa %ymm1, %ymm5
4170; AVX512-FCP-NEXT:    vpermt2d %ymm3, %ymm12, %ymm5
4171; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [8,21,10,11,20,13,14,23]
4172; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
4173; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm2, %ymm3
4174; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm25
4175; AVX512-FCP-NEXT:    vpermt2d %zmm3, %zmm17, %zmm1
4176; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm5, %zmm16
4177; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %ymm3
4178; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31]
4179; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm3, %ymm1
4180; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm15 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
4181; AVX512-FCP-NEXT:    # ymm15 = mem[0,1,0,1]
4182; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm3, %ymm2
4183; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm26
4184; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm18 = [2,2,0,3,10,0,10,11]
4185; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm18, %zmm2
4186; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4187; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm3
4188; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm1
4189; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm2
4190; AVX512-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
4191; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
4192; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15]
4193; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm27
4194; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm6, %zmm0
4195; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm1
4196; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm6
4197; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm6[4],ymm1[4],ymm6[5],ymm1[5],ymm6[6],ymm1[6],ymm6[7],ymm1[7],ymm6[12],ymm1[12],ymm6[13],ymm1[13],ymm6[14],ymm1[14],ymm6[15],ymm1[15]
4198; AVX512-FCP-NEXT:    vpermd %ymm2, %ymm10, %ymm2
4199; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm10 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
4200; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm4 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
4201; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm10[0],ymm4[1],ymm10[1],ymm4[2],ymm10[2],ymm4[3],ymm10[3],ymm4[8],ymm10[8],ymm4[9],ymm10[9],ymm4[10],ymm10[10],ymm4[11],ymm10[11]
4202; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2]
4203; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm4, %zmm2
4204; AVX512-FCP-NEXT:    vmovdqa32 %zmm2, %zmm0 {%k1}
4205; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm10
4206; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm10[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
4207; AVX512-FCP-NEXT:    vpermi2d %ymm2, %ymm0, %ymm12
4208; AVX512-FCP-NEXT:    vpshufb %ymm7, %ymm10, %ymm2
4209; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm17, %zmm0
4210; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm12, %zmm17
4211; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm12
4212; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm12, %ymm0
4213; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm12, %ymm2
4214; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm18, %zmm2
4215; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4216; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm14
4217; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm15
4218; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm6[0],ymm1[0],ymm6[1],ymm1[1],ymm6[2],ymm1[2],ymm6[3],ymm1[3],ymm6[8],ymm1[8],ymm6[9],ymm1[9],ymm6[10],ymm1[10],ymm6[11],ymm1[11]
4219; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
4220; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm18
4221; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %xmm1
4222; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %xmm6
4223; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11]
4224; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7]
4225; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm20
4226; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm13
4227; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
4228; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm13, %xmm4
4229; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm0
4230; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm5
4231; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
4232; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm22 = [0,0,2,1,8,9,8,9]
4233; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
4234; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm22, %zmm11
4235; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [1,0,2,2,1,0,2,2]
4236; AVX512-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
4237; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3]
4238; AVX512-FCP-NEXT:    vpermd %ymm8, %ymm4, %ymm8
4239; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
4240; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
4241; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3]
4242; AVX512-FCP-NEXT:    vpbroadcastq %xmm1, %ymm1
4243; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm8, %zmm1
4244; AVX512-FCP-NEXT:    movw $9362, %ax # imm = 0x2492
4245; AVX512-FCP-NEXT:    kmovw %eax, %k2
4246; AVX512-FCP-NEXT:    vmovdqa32 %zmm1, %zmm11 {%k2}
4247; AVX512-FCP-NEXT:    vmovdqa %ymm11, %ymm6
4248; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [16,9,10,17,12,13,18,15]
4249; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %xmm1
4250; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm1[2,1,3,3,4,5,6,7]
4251; AVX512-FCP-NEXT:    vpermt2d %zmm9, %zmm8, %zmm11
4252; AVX512-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
4253; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm21 = [0,1,8,3,4,9,6,7]
4254; AVX512-FCP-NEXT:    vpermt2d %ymm9, %ymm21, %ymm6
4255; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm6, %zmm19
4256; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm9
4257; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm11
4258; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm9, %xmm6
4259; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm11, %xmm2
4260; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
4261; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
4262; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm22, %zmm5
4263; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
4264; AVX512-FCP-NEXT:    vpermd %ymm2, %ymm4, %ymm2
4265; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
4266; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm6 = xmm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
4267; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
4268; AVX512-FCP-NEXT:    vpbroadcastq %xmm4, %ymm4
4269; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
4270; AVX512-FCP-NEXT:    vmovdqa32 %zmm2, %zmm5 {%k2}
4271; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm2
4272; AVX512-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
4273; AVX512-FCP-NEXT:    vpermi2d %ymm4, %ymm5, %ymm21
4274; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm2[2,1,3,3,4,5,6,7]
4275; AVX512-FCP-NEXT:    vpermt2d %zmm4, %zmm8, %zmm5
4276; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm4 = [0,0,0,0,8,8,0,9]
4277; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %xmm6
4278; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
4279; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm6, %xmm15
4280; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm6[0,0,2,1,4,5,6,7]
4281; AVX512-FCP-NEXT:    vpermt2q %zmm15, %zmm4, %zmm14
4282; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm21, %zmm21
4283; AVX512-FCP-NEXT:    vmovdqa (%r9), %xmm5
4284; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm5, %xmm8
4285; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm15 = xmm5[0,0,2,1,4,5,6,7]
4286; AVX512-FCP-NEXT:    vpermt2q %zmm8, %zmm4, %zmm15
4287; AVX512-FCP-NEXT:    vmovdqa64 %ymm27, %ymm4
4288; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
4289; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7]
4290; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm9 = [1,1,1,1,10,10,10,11]
4291; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm9, %zmm8
4292; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [1,2,1,2,0,0,3,3,13,12,10,10,13,12,14,14]
4293; AVX512-FCP-NEXT:    vpermd %zmm18, %zmm4, %zmm11
4294; AVX512-FCP-NEXT:    vmovdqa32 %zmm8, %zmm11 {%k1}
4295; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
4296; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
4297; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm18 = [0,9,2,3,8,5,6,11]
4298; AVX512-FCP-NEXT:    vmovdqa64 %ymm11, %ymm22
4299; AVX512-FCP-NEXT:    vpermt2d %ymm2, %ymm18, %ymm22
4300; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [8,9,20,11,12,21,14,15]
4301; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
4302; AVX512-FCP-NEXT:    vpermt2d %zmm8, %zmm2, %zmm11
4303; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm12[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
4304; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm12 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
4305; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm5, %xmm8
4306; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm5 = [0,0,0,1,0,10,10,0]
4307; AVX512-FCP-NEXT:    vpermt2q %zmm10, %zmm5, %zmm8
4308; AVX512-FCP-NEXT:    vmovdqa64 %ymm23, %ymm10
4309; AVX512-FCP-NEXT:    vmovdqa64 %ymm24, %ymm7
4310; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm10[0],ymm7[0],ymm10[1],ymm7[1],ymm10[2],ymm7[2],ymm10[3],ymm7[3],ymm10[8],ymm7[8],ymm10[9],ymm7[9],ymm10[10],ymm7[10],ymm10[11],ymm7[11]
4311; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7]
4312; AVX512-FCP-NEXT:    vpermt2q %zmm10, %zmm9, %zmm0
4313; AVX512-FCP-NEXT:    vpermd %zmm20, %zmm4, %zmm4
4314; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm4 {%k1}
4315; AVX512-FCP-NEXT:    vpshufb %xmm3, %xmm1, %xmm0
4316; AVX512-FCP-NEXT:    vpermi2d %ymm0, %ymm4, %ymm18
4317; AVX512-FCP-NEXT:    vmovdqa64 %ymm25, %ymm0
4318; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
4319; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm2, %zmm4
4320; AVX512-FCP-NEXT:    vpshufb %xmm12, %xmm6, %xmm0
4321; AVX512-FCP-NEXT:    vmovdqa64 %ymm26, %ymm1
4322; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
4323; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm5, %zmm0
4324; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm18, %zmm1
4325; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
4326; AVX512-FCP-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (zmm2 & (zmm0 ^ zmm1))
4327; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
4328; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 256(%rax)
4329; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm22, %zmm0
4330; AVX512-FCP-NEXT:    vpternlogd {{.*#+}} zmm8 = zmm8 ^ (zmm2 & (zmm8 ^ zmm0))
4331; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, 64(%rax)
4332; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
4333; AVX512-FCP-NEXT:    vpternlogd {{.*#+}} zmm15 = zmm15 ^ (zmm0 & (zmm15 ^ zmm21))
4334; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, (%rax)
4335; AVX512-FCP-NEXT:    vpternlogd {{.*#+}} zmm14 = zmm14 ^ (zmm0 & (zmm14 ^ zmm19))
4336; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, 192(%rax)
4337; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
4338; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
4339; AVX512-FCP-NEXT:    vpternlogd {{.*#+}} zmm1 = zmm1 ^ (zmm0 & (zmm1 ^ zmm17))
4340; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, 128(%rax)
4341; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
4342; AVX512-FCP-NEXT:    vpternlogd {{.*#+}} zmm1 = zmm1 ^ (zmm0 & (zmm1 ^ zmm16))
4343; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, 320(%rax)
4344; AVX512-FCP-NEXT:    popq %rax
4345; AVX512-FCP-NEXT:    vzeroupper
4346; AVX512-FCP-NEXT:    retq
4347;
4348; AVX512DQ-LABEL: store_i16_stride6_vf32:
4349; AVX512DQ:       # %bb.0:
4350; AVX512DQ-NEXT:    vmovdqa (%rcx), %xmm6
4351; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %xmm2
4352; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm9
4353; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %xmm3
4354; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
4355; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
4356; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm25
4357; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
4358; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm26
4359; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
4360; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17]
4361; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm2, %zmm0
4362; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm8
4363; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %xmm4
4364; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm4[0,1,2,1]
4365; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
4366; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm10
4367; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm5
4368; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm5[0,1,2,1]
4369; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5]
4370; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
4371; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
4372; AVX512DQ-NEXT:    vmovdqa64 %xmm5, %xmm27
4373; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm28
4374; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
4375; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,2,3],zmm1[0,1,0,1]
4376; AVX512DQ-NEXT:    movw $9362, %ax # imm = 0x2492
4377; AVX512DQ-NEXT:    kmovw %eax, %k1
4378; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
4379; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
4380; AVX512DQ-NEXT:    vmovdqa (%r8), %xmm11
4381; AVX512DQ-NEXT:    vmovdqa 32(%r8), %xmm4
4382; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm4[2,1,3,3,4,5,6,7]
4383; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
4384; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7]
4385; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
4386; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm29
4387; AVX512DQ-NEXT:    vpbroadcastq %xmm3, %ymm3
4388; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7]
4389; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm16
4390; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
4391; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
4392; AVX512DQ-NEXT:    vmovdqa64 %xmm6, %xmm24
4393; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
4394; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
4395; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm2, %zmm0
4396; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm8[0,1,2,1]
4397; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
4398; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm10[0,1,2,1]
4399; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5]
4400; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
4401; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3]
4402; AVX512DQ-NEXT:    vmovdqa64 %xmm8, %xmm31
4403; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
4404; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[0,1,0,1]
4405; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
4406; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
4407; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm11[2,1,3,3,4,5,6,7]
4408; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
4409; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7]
4410; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero
4411; AVX512DQ-NEXT:    vpbroadcastq %xmm2, %ymm2
4412; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
4413; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm17
4414; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %ymm13
4415; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm13[2,1,2,3,6,5,6,7]
4416; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
4417; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm14
4418; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm14[2,1,2,3,6,5,6,7]
4419; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
4420; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
4421; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
4422; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm14[4],ymm13[4],ymm14[5],ymm13[5],ymm14[6],ymm13[6],ymm14[7],ymm13[7],ymm14[12],ymm13[12],ymm14[13],ymm13[13],ymm14[14],ymm13[14],ymm14[15],ymm13[15]
4423; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3]
4424; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %ymm15
4425; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm15[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
4426; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %ymm7
4427; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm3 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
4428; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
4429; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4430; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm2[2,2,2,2]
4431; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm7[4],ymm15[4],ymm7[5],ymm15[5],ymm7[6],ymm15[6],ymm7[7],ymm15[7],ymm7[12],ymm15[12],ymm7[13],ymm15[13],ymm7[14],ymm15[14],ymm7[15],ymm15[15]
4432; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[1,2,3,3,5,6,7,7]
4433; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
4434; AVX512DQ-NEXT:    movw $18724, %ax # imm = 0x4924
4435; AVX512DQ-NEXT:    kmovw %eax, %k1
4436; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm2, %zmm1, %zmm0 {%k1}
4437; AVX512DQ-NEXT:    vmovdqa 32(%r8), %ymm6
4438; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
4439; AVX512DQ-NEXT:    vpshufb %ymm5, %ymm6, %ymm1
4440; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
4441; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
4442; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
4443; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
4444; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
4445; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7]
4446; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm22
4447; AVX512DQ-NEXT:    vmovdqa (%rsi), %ymm4
4448; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm4[2,1,2,3,6,5,6,7]
4449; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
4450; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm3
4451; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm3[2,1,2,3,6,5,6,7]
4452; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
4453; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
4454; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm18 = ymm0[2,1,2,3]
4455; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15]
4456; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm0[3,3,3,3]
4457; AVX512DQ-NEXT:    vmovdqa (%rcx), %ymm2
4458; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
4459; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm0
4460; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm8 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
4461; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm8[0],ymm1[0],ymm8[1],ymm1[1],ymm8[2],ymm1[2],ymm8[3],ymm1[3],ymm8[8],ymm1[8],ymm8[9],ymm1[9],ymm8[10],ymm1[10],ymm8[11],ymm1[11]
4462; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
4463; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm8 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
4464; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm8 = ymm8[1,2,3,3,5,6,7,7]
4465; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3]
4466; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm12, %zmm18, %zmm12
4467; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm8, %zmm1, %zmm12 {%k1}
4468; AVX512DQ-NEXT:    vmovdqa (%r8), %ymm1
4469; AVX512DQ-NEXT:    vpshufb %ymm5, %ymm1, %ymm5
4470; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3]
4471; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm12, %ymm8
4472; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0],ymm5[1],ymm8[2,3],ymm5[4],ymm8[5,6],ymm5[7]
4473; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
4474; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3]
4475; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0],ymm12[1,2],ymm8[3],ymm12[4,5],ymm8[6],ymm12[7]
4476; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm5, %zmm8, %zmm18
4477; AVX512DQ-NEXT:    vmovdqa 32(%r9), %xmm12
4478; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm12[0,0,2,1,4,5,6,7]
4479; AVX512DQ-NEXT:    vpbroadcastq %xmm5, %ymm19
4480; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm12[0,2,2,3,4,5,6,7]
4481; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4]
4482; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm20 = ymm5[0,0,2,1]
4483; AVX512DQ-NEXT:    vmovdqa (%r9), %xmm5
4484; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm5[0,0,2,1,4,5,6,7]
4485; AVX512DQ-NEXT:    vpbroadcastq %xmm8, %ymm21
4486; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm5[0,2,2,3,4,5,6,7]
4487; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4]
4488; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm23 = ymm8[0,0,2,1]
4489; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm8
4490; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
4491; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
4492; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm24 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6]
4493; AVX512DQ-NEXT:    vpermt2d %zmm8, %zmm24, %zmm0
4494; AVX512DQ-NEXT:    vmovdqa64 %xmm31, %xmm2
4495; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7]
4496; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1]
4497; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11]
4498; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
4499; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm3, %zmm2, %zmm0 {%k1}
4500; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
4501; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
4502; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
4503; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
4504; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm3 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
4505; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm11, %xmm2
4506; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
4507; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7]
4508; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm2
4509; AVX512DQ-NEXT:    vmovdqa 32(%r9), %ymm1
4510; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
4511; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
4512; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
4513; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm1[2,3,2,3,6,7,6,7]
4514; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm4[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
4515; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3]
4516; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm8
4517; AVX512DQ-NEXT:    vmovdqa64 %xmm26, %xmm9
4518; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
4519; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm7[0],ymm15[0],ymm7[1],ymm15[1],ymm7[2],ymm15[2],ymm7[3],ymm15[3],ymm7[8],ymm15[8],ymm7[9],ymm15[9],ymm7[10],ymm15[10],ymm7[11],ymm15[11]
4520; AVX512DQ-NEXT:    vpermt2d %zmm8, %zmm24, %zmm7
4521; AVX512DQ-NEXT:    vmovdqa64 %xmm27, %xmm8
4522; AVX512DQ-NEXT:    vmovdqa64 %xmm28, %xmm9
4523; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
4524; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[1,1,1,1]
4525; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[1],ymm13[1],ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[8],ymm13[8],ymm14[9],ymm13[9],ymm14[10],ymm13[10],ymm14[11],ymm13[11]
4526; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
4527; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm9, %zmm8, %zmm7 {%k1}
4528; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm7, %ymm8
4529; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
4530; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2]
4531; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7]
4532; AVX512DQ-NEXT:    vmovdqa64 %xmm29, %xmm8
4533; AVX512DQ-NEXT:    vpshufb %xmm3, %xmm8, %xmm3
4534; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1]
4535; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2,3],ymm3[4],ymm7[5,6],ymm3[7]
4536; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm6, %zmm3, %zmm3
4537; AVX512DQ-NEXT:    vmovdqa (%r9), %ymm6
4538; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
4539; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
4540; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3]
4541; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm8 = ymm6[2,3,2,3,6,7,6,7]
4542; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm8[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
4543; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3]
4544; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
4545; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,1,4,5,6,7]
4546; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1]
4547; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm6 = ymm6[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
4548; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2]
4549; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm9 = xmm12[2,3,2,3]
4550; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm9[0,2,2,1,4,5,6,7]
4551; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1]
4552; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
4553; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
4554; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
4555; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm20, %zmm19, %zmm10
4556; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
4557; AVX512DQ-NEXT:    vpternlogd {{.*#+}} zmm10 = zmm10 ^ (zmm11 & (zmm10 ^ zmm16))
4558; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm23, %zmm21, %zmm12
4559; AVX512DQ-NEXT:    vpternlogd {{.*#+}} zmm12 = zmm12 ^ (zmm11 & (zmm12 ^ zmm17))
4560; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm0
4561; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
4562; AVX512DQ-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (zmm4 & (zmm0 ^ zmm22))
4563; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm8, %zmm7, %zmm7
4564; AVX512DQ-NEXT:    vpternlogd {{.*#+}} zmm7 = zmm7 ^ (zmm4 & (zmm7 ^ zmm18))
4565; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm6, %zmm5, %zmm4
4566; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
4567; AVX512DQ-NEXT:    vpternlogd {{.*#+}} zmm4 = zmm4 ^ (zmm5 & (zmm4 ^ zmm2))
4568; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm9, %zmm1
4569; AVX512DQ-NEXT:    vpternlogd {{.*#+}} zmm1 = zmm1 ^ (zmm5 & (zmm1 ^ zmm3))
4570; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 256(%rax)
4571; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 64(%rax)
4572; AVX512DQ-NEXT:    vmovdqa64 %zmm12, (%rax)
4573; AVX512DQ-NEXT:    vmovdqa64 %zmm10, 192(%rax)
4574; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 128(%rax)
4575; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 320(%rax)
4576; AVX512DQ-NEXT:    vzeroupper
4577; AVX512DQ-NEXT:    retq
4578;
4579; AVX512DQ-FCP-LABEL: store_i16_stride6_vf32:
4580; AVX512DQ-FCP:       # %bb.0:
4581; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm1
4582; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm2
4583; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
4584; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm31
4585; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm25
4586; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm2
4587; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4588; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm1
4589; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4590; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
4591; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm18 = [1,1,1,1,10,10,10,11]
4592; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm18, %zmm1
4593; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm2
4594; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm3
4595; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
4596; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm3, %ymm26
4597; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm27
4598; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm3
4599; AVX512DQ-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4600; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %xmm5
4601; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm2
4602; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4603; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %xmm6
4604; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
4605; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm2
4606; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm20 = [1,2,1,2,0,0,3,3,13,12,10,10,13,12,14,14]
4607; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm20, %zmm19
4608; AVX512DQ-FCP-NEXT:    movw $18724, %ax # imm = 0x4924
4609; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
4610; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm1, %zmm19 {%k1}
4611; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm0
4612; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4613; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm3 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
4614; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm0, %xmm1
4615; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [0,9,2,3,8,5,6,11]
4616; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm19, %ymm23
4617; AVX512DQ-FCP-NEXT:    vpermt2d %ymm1, %ymm17, %ymm23
4618; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm21 = [8,9,20,11,12,21,14,15]
4619; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %ymm9
4620; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %ymm4
4621; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm4[0],ymm9[0],ymm4[1],ymm9[1],ymm4[2],ymm9[2],ymm4[3],ymm9[3],ymm4[8],ymm9[8],ymm4[9],ymm9[9],ymm4[10],ymm9[10],ymm4[11],ymm9[11]
4622; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
4623; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm6, %xmm30
4624; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm5, %xmm29
4625; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm5
4626; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm7
4627; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
4628; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm21, %zmm19
4629; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm8
4630; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm6 = [0,0,0,1,0,10,10,0]
4631; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm8[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
4632; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %xmm2
4633; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
4634; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm2, %xmm15
4635; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm2, %xmm28
4636; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm6, %zmm15
4637; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm11
4638; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm10
4639; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %ymm2
4640; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
4641; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm12 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
4642; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm13 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7]
4643; AVX512DQ-FCP-NEXT:    vpermt2q %zmm12, %zmm18, %zmm13
4644; AVX512DQ-FCP-NEXT:    vpermd %zmm5, %zmm20, %zmm20
4645; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm13, %zmm20 {%k1}
4646; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %xmm12
4647; AVX512DQ-FCP-NEXT:    vpshufb %xmm3, %xmm12, %xmm3
4648; AVX512DQ-FCP-NEXT:    vpermi2d %ymm3, %ymm20, %ymm17
4649; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %ymm3
4650; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
4651; AVX512DQ-FCP-NEXT:    vpermt2d %zmm5, %zmm21, %zmm20
4652; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %xmm5
4653; AVX512DQ-FCP-NEXT:    vpshufb %xmm0, %xmm5, %xmm13
4654; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm5, %xmm24
4655; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %ymm5
4656; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm5[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
4657; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm6, %zmm13
4658; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm6 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u]
4659; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm2, %ymm0
4660; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm1, %ymm14
4661; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm14[0],ymm0[0],ymm14[1],ymm0[1],ymm14[2],ymm0[2],ymm14[3],ymm0[3],ymm14[8],ymm0[8],ymm14[9],ymm0[9],ymm14[10],ymm0[10],ymm14[11],ymm0[11]
4662; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
4663; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [2,1,2,3,11,11,11,11]
4664; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
4665; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm21 = [5,6,5,6,5,6,7,7]
4666; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm9[4],ymm4[5],ymm9[5],ymm4[6],ymm9[6],ymm4[7],ymm9[7],ymm4[12],ymm9[12],ymm4[13],ymm9[13],ymm4[14],ymm9[14],ymm4[15],ymm9[15]
4667; AVX512DQ-FCP-NEXT:    vpermd %ymm1, %ymm21, %ymm1
4668; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm9 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
4669; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm4 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
4670; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm9[0],ymm4[1],ymm9[1],ymm4[2],ymm9[2],ymm4[3],ymm9[3],ymm4[8],ymm9[8],ymm4[9],ymm9[9],ymm4[10],ymm9[10],ymm4[11],ymm9[11]
4671; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2]
4672; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm1, %zmm4, %zmm0 {%k1}
4673; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [12,1,2,13,4,5,14,7]
4674; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm9
4675; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
4676; AVX512DQ-FCP-NEXT:    vpermt2d %ymm1, %ymm4, %ymm9
4677; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
4678; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm3, %ymm3
4679; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm1, %ymm16
4680; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm22 = [8,21,10,11,20,13,14,23]
4681; AVX512DQ-FCP-NEXT:    vpermt2d %zmm3, %zmm22, %zmm0
4682; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm9, %zmm18
4683; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm25, %ymm9
4684; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm9, %ymm0
4685; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm31, %ymm1
4686; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm1, %ymm3
4687; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11]
4688; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm9[4],ymm1[5],ymm9[5],ymm1[6],ymm9[6],ymm1[7],ymm9[7],ymm1[12],ymm9[12],ymm1[13],ymm9[13],ymm1[14],ymm9[14],ymm1[15],ymm9[15]
4689; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm2, %zmm0
4690; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31]
4691; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm5, %ymm3
4692; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
4693; AVX512DQ-FCP-NEXT:    # ymm6 = mem[0,1,0,1]
4694; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm5, %ymm14
4695; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm5 = [2,2,0,3,10,0,10,11]
4696; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm5, %zmm14
4697; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm26, %ymm1
4698; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm27, %ymm9
4699; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm9[4],ymm1[5],ymm9[5],ymm1[6],ymm9[6],ymm1[7],ymm9[7],ymm1[12],ymm9[12],ymm1[13],ymm9[13],ymm1[14],ymm9[14],ymm1[15],ymm9[15]
4700; AVX512DQ-FCP-NEXT:    vpermd %ymm3, %ymm21, %ymm3
4701; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm9 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
4702; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
4703; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm9[0],ymm1[1],ymm9[1],ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[8],ymm9[8],ymm1[9],ymm9[9],ymm1[10],ymm9[10],ymm1[11],ymm9[11]
4704; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
4705; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm3, %zmm1, %zmm0 {%k1}
4706; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm7[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
4707; AVX512DQ-FCP-NEXT:    vpermi2d %ymm1, %ymm0, %ymm4
4708; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm16, %ymm1
4709; AVX512DQ-FCP-NEXT:    vpshufb %ymm1, %ymm7, %ymm1
4710; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm22, %zmm0
4711; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm4, %zmm0
4712; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm8, %ymm2
4713; AVX512DQ-FCP-NEXT:    vpshufb %ymm6, %ymm8, %ymm1
4714; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm5, %zmm1
4715; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
4716; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm11, %xmm3
4717; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm10, %xmm4
4718; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
4719; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
4720; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm5 = [0,0,2,1,8,9,8,9]
4721; AVX512DQ-FCP-NEXT:    vpermt2q %zmm4, %zmm5, %zmm3
4722; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [1,0,2,2,1,0,2,2]
4723; AVX512DQ-FCP-NEXT:    # ymm4 = mem[0,1,0,1]
4724; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm29, %xmm7
4725; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm30, %xmm8
4726; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
4727; AVX512DQ-FCP-NEXT:    vpermd %ymm6, %ymm4, %ymm6
4728; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm7 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
4729; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm8 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
4730; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
4731; AVX512DQ-FCP-NEXT:    vpbroadcastq %xmm7, %ymm7
4732; AVX512DQ-FCP-NEXT:    movw $9362, %ax # imm = 0x2492
4733; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
4734; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm7, %zmm6, %zmm3 {%k1}
4735; AVX512DQ-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm6 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero
4736; AVX512DQ-FCP-NEXT:    vmovdqa %ymm3, %ymm7
4737; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [16,9,10,17,12,13,18,15]
4738; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm12[2,1,3,3,4,5,6,7]
4739; AVX512DQ-FCP-NEXT:    vpermt2d %zmm9, %zmm8, %zmm3
4740; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [0,1,8,3,4,9,6,7]
4741; AVX512DQ-FCP-NEXT:    vpermt2d %ymm6, %ymm9, %ymm7
4742; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm7, %zmm3
4743; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
4744; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm10, %xmm6
4745; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
4746; AVX512DQ-FCP-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
4747; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
4748; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3]
4749; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
4750; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm5, %zmm6
4751; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm24, %xmm2
4752; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm2, %xmm5
4753; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,1,4,5,6,7]
4754; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm10 = [0,0,0,0,8,8,0,9]
4755; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm10, %zmm2
4756; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
4757; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
4758; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
4759; AVX512DQ-FCP-NEXT:    vpermd %ymm5, %ymm4, %ymm4
4760; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
4761; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm11 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
4762; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3]
4763; AVX512DQ-FCP-NEXT:    vpbroadcastq %xmm5, %ymm5
4764; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm5, %zmm4, %zmm6 {%k1}
4765; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4766; AVX512DQ-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm4 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
4767; AVX512DQ-FCP-NEXT:    vpermi2d %ymm4, %ymm6, %ymm9
4768; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm5[2,1,3,3,4,5,6,7]
4769; AVX512DQ-FCP-NEXT:    vpermt2d %zmm4, %zmm8, %zmm6
4770; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm9, %zmm4
4771; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm28, %xmm6
4772; AVX512DQ-FCP-NEXT:    vpshufb %xmm7, %xmm6, %xmm5
4773; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,0,2,1,4,5,6,7]
4774; AVX512DQ-FCP-NEXT:    vpermt2q %zmm5, %zmm10, %zmm6
4775; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
4776; AVX512DQ-FCP-NEXT:    vpternlogd {{.*#+}} zmm6 = zmm6 ^ (zmm5 & (zmm6 ^ zmm4))
4777; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
4778; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, (%rax)
4779; AVX512DQ-FCP-NEXT:    vpternlogd {{.*#+}} zmm2 = zmm2 ^ (zmm5 & (zmm2 ^ zmm3))
4780; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, 192(%rax)
4781; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
4782; AVX512DQ-FCP-NEXT:    vpternlogd {{.*#+}} zmm1 = zmm1 ^ (zmm2 & (zmm1 ^ zmm0))
4783; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 128(%rax)
4784; AVX512DQ-FCP-NEXT:    vpternlogd {{.*#+}} zmm14 = zmm14 ^ (zmm2 & (zmm14 ^ zmm18))
4785; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, 320(%rax)
4786; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm20, %zmm17, %zmm0
4787; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
4788; AVX512DQ-FCP-NEXT:    vpternlogd {{.*#+}} zmm13 = zmm13 ^ (zmm1 & (zmm13 ^ zmm0))
4789; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, 256(%rax)
4790; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm19, %zmm23, %zmm0
4791; AVX512DQ-FCP-NEXT:    vpternlogd {{.*#+}} zmm15 = zmm15 ^ (zmm1 & (zmm15 ^ zmm0))
4792; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, 64(%rax)
4793; AVX512DQ-FCP-NEXT:    vzeroupper
4794; AVX512DQ-FCP-NEXT:    retq
4795;
4796; AVX512BW-LABEL: store_i16_stride6_vf32:
4797; AVX512BW:       # %bb.0:
4798; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
4799; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm2
4800; AVX512BW-NEXT:    vmovdqa64 (%rsi), %zmm3
4801; AVX512BW-NEXT:    vmovdqa64 (%rdx), %zmm4
4802; AVX512BW-NEXT:    vmovdqa64 (%rcx), %zmm5
4803; AVX512BW-NEXT:    vmovdqa64 (%r8), %zmm1
4804; AVX512BW-NEXT:    vmovdqa64 (%r9), %zmm0
4805; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34,0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34]
4806; AVX512BW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
4807; AVX512BW-NEXT:    vpermi2w %zmm5, %zmm4, %zmm6
4808; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37,0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37]
4809; AVX512BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
4810; AVX512BW-NEXT:    vpermi2w %zmm3, %zmm2, %zmm7
4811; AVX512BW-NEXT:    movw $9362, %cx # imm = 0x2492
4812; AVX512BW-NEXT:    kmovd %ecx, %k2
4813; AVX512BW-NEXT:    vmovdqa32 %zmm6, %zmm7 {%k2}
4814; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm8 = [0,1,2,3,32,0,6,7,8,9,33,0,12,13,14,15,34,0,18,19,20,21,35,0,24,25,26,27,36,0,30,31]
4815; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm7, %zmm8
4816; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm6 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31]
4817; AVX512BW-NEXT:    vpermi2w %zmm0, %zmm8, %zmm6
4818; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0,8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0]
4819; AVX512BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
4820; AVX512BW-NEXT:    vpermi2w %zmm3, %zmm2, %zmm7
4821; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42,5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42]
4822; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
4823; AVX512BW-NEXT:    vpermi2w %zmm5, %zmm4, %zmm8
4824; AVX512BW-NEXT:    movw $18724, %cx # imm = 0x4924
4825; AVX512BW-NEXT:    kmovd %ecx, %k1
4826; AVX512BW-NEXT:    vmovdqa32 %zmm7, %zmm8 {%k1}
4827; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm7 = [0,1,37,0,4,5,6,7,38,0,10,11,12,13,39,0,16,17,18,19,40,0,22,23,24,25,41,0,28,29,30,31]
4828; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm8, %zmm7
4829; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm8 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31]
4830; AVX512BW-NEXT:    vpermi2w %zmm0, %zmm7, %zmm8
4831; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0]
4832; AVX512BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
4833; AVX512BW-NEXT:    vpermi2w %zmm5, %zmm4, %zmm7
4834; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45]
4835; AVX512BW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
4836; AVX512BW-NEXT:    vpermi2w %zmm3, %zmm2, %zmm9
4837; AVX512BW-NEXT:    vmovdqa32 %zmm7, %zmm9 {%k1}
4838; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm7 = [42,0,2,3,4,5,43,0,8,9,10,11,44,0,14,15,16,17,45,0,20,21,22,23,46,0,26,27,28,29,47,0]
4839; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm9, %zmm7
4840; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm9 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47]
4841; AVX512BW-NEXT:    vpermi2w %zmm0, %zmm7, %zmm9
4842; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50,0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50]
4843; AVX512BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
4844; AVX512BW-NEXT:    vpermi2w %zmm5, %zmm4, %zmm7
4845; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53,16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53]
4846; AVX512BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
4847; AVX512BW-NEXT:    vpermi2w %zmm3, %zmm2, %zmm10
4848; AVX512BW-NEXT:    vmovdqa32 %zmm7, %zmm10 {%k2}
4849; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm7 = [0,1,2,3,48,0,6,7,8,9,49,0,12,13,14,15,50,0,18,19,20,21,51,0,24,25,26,27,52,0,30,31]
4850; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm10, %zmm7
4851; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm10 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31]
4852; AVX512BW-NEXT:    vpermi2w %zmm0, %zmm7, %zmm10
4853; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0,24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0]
4854; AVX512BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
4855; AVX512BW-NEXT:    vpermi2w %zmm3, %zmm2, %zmm7
4856; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58,21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58]
4857; AVX512BW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
4858; AVX512BW-NEXT:    vpermi2w %zmm5, %zmm4, %zmm11
4859; AVX512BW-NEXT:    vmovdqa32 %zmm7, %zmm11 {%k1}
4860; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm7 = [0,1,53,0,4,5,6,7,54,0,10,11,12,13,55,0,16,17,18,19,56,0,22,23,24,25,57,0,28,29,30,31]
4861; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm11, %zmm7
4862; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm11 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31]
4863; AVX512BW-NEXT:    vpermi2w %zmm0, %zmm7, %zmm11
4864; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0]
4865; AVX512BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
4866; AVX512BW-NEXT:    vpermi2w %zmm5, %zmm4, %zmm7
4867; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61]
4868; AVX512BW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
4869; AVX512BW-NEXT:    vpermi2w %zmm3, %zmm2, %zmm4
4870; AVX512BW-NEXT:    vmovdqa32 %zmm7, %zmm4 {%k1}
4871; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm2 = [58,0,2,3,4,5,59,0,8,9,10,11,60,0,14,15,16,17,61,0,20,21,22,23,62,0,26,27,28,29,63,0]
4872; AVX512BW-NEXT:    vpermi2w %zmm1, %zmm4, %zmm2
4873; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63]
4874; AVX512BW-NEXT:    vpermi2w %zmm0, %zmm2, %zmm1
4875; AVX512BW-NEXT:    vmovdqa64 %zmm1, 320(%rax)
4876; AVX512BW-NEXT:    vmovdqa64 %zmm11, 256(%rax)
4877; AVX512BW-NEXT:    vmovdqa64 %zmm10, 192(%rax)
4878; AVX512BW-NEXT:    vmovdqa64 %zmm9, 128(%rax)
4879; AVX512BW-NEXT:    vmovdqa64 %zmm8, 64(%rax)
4880; AVX512BW-NEXT:    vmovdqa64 %zmm6, (%rax)
4881; AVX512BW-NEXT:    vzeroupper
4882; AVX512BW-NEXT:    retq
4883;
4884; AVX512BW-FCP-LABEL: store_i16_stride6_vf32:
4885; AVX512BW-FCP:       # %bb.0:
4886; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
4887; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm2
4888; AVX512BW-FCP-NEXT:    vmovdqa64 (%rsi), %zmm3
4889; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdx), %zmm4
4890; AVX512BW-FCP-NEXT:    vmovdqa64 (%rcx), %zmm5
4891; AVX512BW-FCP-NEXT:    vmovdqa64 (%r8), %zmm1
4892; AVX512BW-FCP-NEXT:    vmovdqa64 (%r9), %zmm0
4893; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34,0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34]
4894; AVX512BW-FCP-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
4895; AVX512BW-FCP-NEXT:    vpermi2w %zmm5, %zmm4, %zmm6
4896; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37,0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37]
4897; AVX512BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
4898; AVX512BW-FCP-NEXT:    vpermi2w %zmm3, %zmm2, %zmm7
4899; AVX512BW-FCP-NEXT:    movw $9362, %cx # imm = 0x2492
4900; AVX512BW-FCP-NEXT:    kmovd %ecx, %k2
4901; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm6, %zmm7 {%k2}
4902; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm8 = [0,1,2,3,32,0,6,7,8,9,33,0,12,13,14,15,34,0,18,19,20,21,35,0,24,25,26,27,36,0,30,31]
4903; AVX512BW-FCP-NEXT:    vpermi2w %zmm1, %zmm7, %zmm8
4904; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm6 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31]
4905; AVX512BW-FCP-NEXT:    vpermi2w %zmm0, %zmm8, %zmm6
4906; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0,8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0]
4907; AVX512BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
4908; AVX512BW-FCP-NEXT:    vpermi2w %zmm3, %zmm2, %zmm7
4909; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42,5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42]
4910; AVX512BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
4911; AVX512BW-FCP-NEXT:    vpermi2w %zmm5, %zmm4, %zmm8
4912; AVX512BW-FCP-NEXT:    movw $18724, %cx # imm = 0x4924
4913; AVX512BW-FCP-NEXT:    kmovd %ecx, %k1
4914; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm7, %zmm8 {%k1}
4915; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm7 = [0,1,37,0,4,5,6,7,38,0,10,11,12,13,39,0,16,17,18,19,40,0,22,23,24,25,41,0,28,29,30,31]
4916; AVX512BW-FCP-NEXT:    vpermi2w %zmm1, %zmm8, %zmm7
4917; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm8 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31]
4918; AVX512BW-FCP-NEXT:    vpermi2w %zmm0, %zmm7, %zmm8
4919; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0]
4920; AVX512BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
4921; AVX512BW-FCP-NEXT:    vpermi2w %zmm5, %zmm4, %zmm7
4922; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45]
4923; AVX512BW-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
4924; AVX512BW-FCP-NEXT:    vpermi2w %zmm3, %zmm2, %zmm9
4925; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm7, %zmm9 {%k1}
4926; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm7 = [42,0,2,3,4,5,43,0,8,9,10,11,44,0,14,15,16,17,45,0,20,21,22,23,46,0,26,27,28,29,47,0]
4927; AVX512BW-FCP-NEXT:    vpermi2w %zmm1, %zmm9, %zmm7
4928; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm9 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47]
4929; AVX512BW-FCP-NEXT:    vpermi2w %zmm0, %zmm7, %zmm9
4930; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50,0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50]
4931; AVX512BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
4932; AVX512BW-FCP-NEXT:    vpermi2w %zmm5, %zmm4, %zmm7
4933; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53,16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53]
4934; AVX512BW-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
4935; AVX512BW-FCP-NEXT:    vpermi2w %zmm3, %zmm2, %zmm10
4936; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm7, %zmm10 {%k2}
4937; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm7 = [0,1,2,3,48,0,6,7,8,9,49,0,12,13,14,15,50,0,18,19,20,21,51,0,24,25,26,27,52,0,30,31]
4938; AVX512BW-FCP-NEXT:    vpermi2w %zmm1, %zmm10, %zmm7
4939; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm10 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31]
4940; AVX512BW-FCP-NEXT:    vpermi2w %zmm0, %zmm7, %zmm10
4941; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0,24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0]
4942; AVX512BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
4943; AVX512BW-FCP-NEXT:    vpermi2w %zmm3, %zmm2, %zmm7
4944; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58,21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58]
4945; AVX512BW-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
4946; AVX512BW-FCP-NEXT:    vpermi2w %zmm5, %zmm4, %zmm11
4947; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm7, %zmm11 {%k1}
4948; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm7 = [0,1,53,0,4,5,6,7,54,0,10,11,12,13,55,0,16,17,18,19,56,0,22,23,24,25,57,0,28,29,30,31]
4949; AVX512BW-FCP-NEXT:    vpermi2w %zmm1, %zmm11, %zmm7
4950; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm11 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31]
4951; AVX512BW-FCP-NEXT:    vpermi2w %zmm0, %zmm7, %zmm11
4952; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0]
4953; AVX512BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
4954; AVX512BW-FCP-NEXT:    vpermi2w %zmm5, %zmm4, %zmm7
4955; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61]
4956; AVX512BW-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
4957; AVX512BW-FCP-NEXT:    vpermi2w %zmm3, %zmm2, %zmm4
4958; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm7, %zmm4 {%k1}
4959; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm2 = [58,0,2,3,4,5,59,0,8,9,10,11,60,0,14,15,16,17,61,0,20,21,22,23,62,0,26,27,28,29,63,0]
4960; AVX512BW-FCP-NEXT:    vpermi2w %zmm1, %zmm4, %zmm2
4961; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63]
4962; AVX512BW-FCP-NEXT:    vpermi2w %zmm0, %zmm2, %zmm1
4963; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, 320(%rax)
4964; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, 256(%rax)
4965; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, 192(%rax)
4966; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, 128(%rax)
4967; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, 64(%rax)
4968; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, (%rax)
4969; AVX512BW-FCP-NEXT:    vzeroupper
4970; AVX512BW-FCP-NEXT:    retq
4971;
4972; AVX512DQ-BW-LABEL: store_i16_stride6_vf32:
4973; AVX512DQ-BW:       # %bb.0:
4974; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
4975; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm2
4976; AVX512DQ-BW-NEXT:    vmovdqa64 (%rsi), %zmm3
4977; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdx), %zmm4
4978; AVX512DQ-BW-NEXT:    vmovdqa64 (%rcx), %zmm5
4979; AVX512DQ-BW-NEXT:    vmovdqa64 (%r8), %zmm1
4980; AVX512DQ-BW-NEXT:    vmovdqa64 (%r9), %zmm0
4981; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34,0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34]
4982; AVX512DQ-BW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
4983; AVX512DQ-BW-NEXT:    vpermi2w %zmm5, %zmm4, %zmm6
4984; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37,0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37]
4985; AVX512DQ-BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
4986; AVX512DQ-BW-NEXT:    vpermi2w %zmm3, %zmm2, %zmm7
4987; AVX512DQ-BW-NEXT:    movw $9362, %cx # imm = 0x2492
4988; AVX512DQ-BW-NEXT:    kmovd %ecx, %k2
4989; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm6, %zmm7 {%k2}
4990; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm8 = [0,1,2,3,32,0,6,7,8,9,33,0,12,13,14,15,34,0,18,19,20,21,35,0,24,25,26,27,36,0,30,31]
4991; AVX512DQ-BW-NEXT:    vpermi2w %zmm1, %zmm7, %zmm8
4992; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm6 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31]
4993; AVX512DQ-BW-NEXT:    vpermi2w %zmm0, %zmm8, %zmm6
4994; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0,8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0]
4995; AVX512DQ-BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
4996; AVX512DQ-BW-NEXT:    vpermi2w %zmm3, %zmm2, %zmm7
4997; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42,5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42]
4998; AVX512DQ-BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
4999; AVX512DQ-BW-NEXT:    vpermi2w %zmm5, %zmm4, %zmm8
5000; AVX512DQ-BW-NEXT:    movw $18724, %cx # imm = 0x4924
5001; AVX512DQ-BW-NEXT:    kmovd %ecx, %k1
5002; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm7, %zmm8 {%k1}
5003; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm7 = [0,1,37,0,4,5,6,7,38,0,10,11,12,13,39,0,16,17,18,19,40,0,22,23,24,25,41,0,28,29,30,31]
5004; AVX512DQ-BW-NEXT:    vpermi2w %zmm1, %zmm8, %zmm7
5005; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm8 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31]
5006; AVX512DQ-BW-NEXT:    vpermi2w %zmm0, %zmm7, %zmm8
5007; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0]
5008; AVX512DQ-BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
5009; AVX512DQ-BW-NEXT:    vpermi2w %zmm5, %zmm4, %zmm7
5010; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45]
5011; AVX512DQ-BW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
5012; AVX512DQ-BW-NEXT:    vpermi2w %zmm3, %zmm2, %zmm9
5013; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm7, %zmm9 {%k1}
5014; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm7 = [42,0,2,3,4,5,43,0,8,9,10,11,44,0,14,15,16,17,45,0,20,21,22,23,46,0,26,27,28,29,47,0]
5015; AVX512DQ-BW-NEXT:    vpermi2w %zmm1, %zmm9, %zmm7
5016; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm9 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47]
5017; AVX512DQ-BW-NEXT:    vpermi2w %zmm0, %zmm7, %zmm9
5018; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50,0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50]
5019; AVX512DQ-BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
5020; AVX512DQ-BW-NEXT:    vpermi2w %zmm5, %zmm4, %zmm7
5021; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53,16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53]
5022; AVX512DQ-BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
5023; AVX512DQ-BW-NEXT:    vpermi2w %zmm3, %zmm2, %zmm10
5024; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm7, %zmm10 {%k2}
5025; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm7 = [0,1,2,3,48,0,6,7,8,9,49,0,12,13,14,15,50,0,18,19,20,21,51,0,24,25,26,27,52,0,30,31]
5026; AVX512DQ-BW-NEXT:    vpermi2w %zmm1, %zmm10, %zmm7
5027; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm10 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31]
5028; AVX512DQ-BW-NEXT:    vpermi2w %zmm0, %zmm7, %zmm10
5029; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0,24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0]
5030; AVX512DQ-BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
5031; AVX512DQ-BW-NEXT:    vpermi2w %zmm3, %zmm2, %zmm7
5032; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58,21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58]
5033; AVX512DQ-BW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
5034; AVX512DQ-BW-NEXT:    vpermi2w %zmm5, %zmm4, %zmm11
5035; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm7, %zmm11 {%k1}
5036; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm7 = [0,1,53,0,4,5,6,7,54,0,10,11,12,13,55,0,16,17,18,19,56,0,22,23,24,25,57,0,28,29,30,31]
5037; AVX512DQ-BW-NEXT:    vpermi2w %zmm1, %zmm11, %zmm7
5038; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm11 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31]
5039; AVX512DQ-BW-NEXT:    vpermi2w %zmm0, %zmm7, %zmm11
5040; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0]
5041; AVX512DQ-BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
5042; AVX512DQ-BW-NEXT:    vpermi2w %zmm5, %zmm4, %zmm7
5043; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61]
5044; AVX512DQ-BW-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
5045; AVX512DQ-BW-NEXT:    vpermi2w %zmm3, %zmm2, %zmm4
5046; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm7, %zmm4 {%k1}
5047; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm2 = [58,0,2,3,4,5,59,0,8,9,10,11,60,0,14,15,16,17,61,0,20,21,22,23,62,0,26,27,28,29,63,0]
5048; AVX512DQ-BW-NEXT:    vpermi2w %zmm1, %zmm4, %zmm2
5049; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63]
5050; AVX512DQ-BW-NEXT:    vpermi2w %zmm0, %zmm2, %zmm1
5051; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, 320(%rax)
5052; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm11, 256(%rax)
5053; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm10, 192(%rax)
5054; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, 128(%rax)
5055; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm8, 64(%rax)
5056; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, (%rax)
5057; AVX512DQ-BW-NEXT:    vzeroupper
5058; AVX512DQ-BW-NEXT:    retq
5059;
5060; AVX512DQ-BW-FCP-LABEL: store_i16_stride6_vf32:
5061; AVX512DQ-BW-FCP:       # %bb.0:
5062; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
5063; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm2
5064; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rsi), %zmm3
5065; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdx), %zmm4
5066; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rcx), %zmm5
5067; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%r8), %zmm1
5068; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%r9), %zmm0
5069; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34,0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34]
5070; AVX512DQ-BW-FCP-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
5071; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm5, %zmm4, %zmm6
5072; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37,0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37]
5073; AVX512DQ-BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
5074; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm3, %zmm2, %zmm7
5075; AVX512DQ-BW-FCP-NEXT:    movw $9362, %cx # imm = 0x2492
5076; AVX512DQ-BW-FCP-NEXT:    kmovd %ecx, %k2
5077; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm6, %zmm7 {%k2}
5078; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm8 = [0,1,2,3,32,0,6,7,8,9,33,0,12,13,14,15,34,0,18,19,20,21,35,0,24,25,26,27,36,0,30,31]
5079; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm1, %zmm7, %zmm8
5080; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm6 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31]
5081; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm0, %zmm8, %zmm6
5082; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0,8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0]
5083; AVX512DQ-BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
5084; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm3, %zmm2, %zmm7
5085; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42,5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42]
5086; AVX512DQ-BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
5087; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm5, %zmm4, %zmm8
5088; AVX512DQ-BW-FCP-NEXT:    movw $18724, %cx # imm = 0x4924
5089; AVX512DQ-BW-FCP-NEXT:    kmovd %ecx, %k1
5090; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm7, %zmm8 {%k1}
5091; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm7 = [0,1,37,0,4,5,6,7,38,0,10,11,12,13,39,0,16,17,18,19,40,0,22,23,24,25,41,0,28,29,30,31]
5092; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm1, %zmm8, %zmm7
5093; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm8 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31]
5094; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm0, %zmm7, %zmm8
5095; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0]
5096; AVX512DQ-BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
5097; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm5, %zmm4, %zmm7
5098; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45]
5099; AVX512DQ-BW-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
5100; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm3, %zmm2, %zmm9
5101; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm7, %zmm9 {%k1}
5102; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm7 = [42,0,2,3,4,5,43,0,8,9,10,11,44,0,14,15,16,17,45,0,20,21,22,23,46,0,26,27,28,29,47,0]
5103; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm1, %zmm9, %zmm7
5104; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm9 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47]
5105; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm0, %zmm7, %zmm9
5106; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50,0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50]
5107; AVX512DQ-BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
5108; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm5, %zmm4, %zmm7
5109; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53,16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53]
5110; AVX512DQ-BW-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
5111; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm3, %zmm2, %zmm10
5112; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm7, %zmm10 {%k2}
5113; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm7 = [0,1,2,3,48,0,6,7,8,9,49,0,12,13,14,15,50,0,18,19,20,21,51,0,24,25,26,27,52,0,30,31]
5114; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm1, %zmm10, %zmm7
5115; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm10 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31]
5116; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm0, %zmm7, %zmm10
5117; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0,24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0]
5118; AVX512DQ-BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
5119; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm3, %zmm2, %zmm7
5120; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58,21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58]
5121; AVX512DQ-BW-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
5122; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm5, %zmm4, %zmm11
5123; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm7, %zmm11 {%k1}
5124; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm7 = [0,1,53,0,4,5,6,7,54,0,10,11,12,13,55,0,16,17,18,19,56,0,22,23,24,25,57,0,28,29,30,31]
5125; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm1, %zmm11, %zmm7
5126; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm11 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31]
5127; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm0, %zmm7, %zmm11
5128; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0]
5129; AVX512DQ-BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
5130; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm5, %zmm4, %zmm7
5131; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61]
5132; AVX512DQ-BW-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
5133; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm3, %zmm2, %zmm4
5134; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm7, %zmm4 {%k1}
5135; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm2 = [58,0,2,3,4,5,59,0,8,9,10,11,60,0,14,15,16,17,61,0,20,21,22,23,62,0,26,27,28,29,63,0]
5136; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm1, %zmm4, %zmm2
5137; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63]
5138; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm0, %zmm2, %zmm1
5139; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, 320(%rax)
5140; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, 256(%rax)
5141; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, 192(%rax)
5142; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, 128(%rax)
5143; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, 64(%rax)
5144; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, (%rax)
5145; AVX512DQ-BW-FCP-NEXT:    vzeroupper
5146; AVX512DQ-BW-FCP-NEXT:    retq
5147  %in.vec0 = load <32 x i16>, ptr %in.vecptr0, align 64
5148  %in.vec1 = load <32 x i16>, ptr %in.vecptr1, align 64
5149  %in.vec2 = load <32 x i16>, ptr %in.vecptr2, align 64
5150  %in.vec3 = load <32 x i16>, ptr %in.vecptr3, align 64
5151  %in.vec4 = load <32 x i16>, ptr %in.vecptr4, align 64
5152  %in.vec5 = load <32 x i16>, ptr %in.vecptr5, align 64
5153  %1 = shufflevector <32 x i16> %in.vec0, <32 x i16> %in.vec1, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
5154  %2 = shufflevector <32 x i16> %in.vec2, <32 x i16> %in.vec3, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
5155  %3 = shufflevector <32 x i16> %in.vec4, <32 x i16> %in.vec5, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
5156  %4 = shufflevector <64 x i16> %1, <64 x i16> %2, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
5157  %5 = shufflevector <64 x i16> %3, <64 x i16> poison, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
5158  %6 = shufflevector <128 x i16> %4, <128 x i16> %5, <192 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191>
5159  %interleaved.vec = shufflevector <192 x i16> %6, <192 x i16> poison, <192 x i32> <i32 0, i32 32, i32 64, i32 96, i32 128, i32 160, i32 1, i32 33, i32 65, i32 97, i32 129, i32 161, i32 2, i32 34, i32 66, i32 98, i32 130, i32 162, i32 3, i32 35, i32 67, i32 99, i32 131, i32 163, i32 4, i32 36, i32 68, i32 100, i32 132, i32 164, i32 5, i32 37, i32 69, i32 101, i32 133, i32 165, i32 6, i32 38, i32 70, i32 102, i32 134, i32 166, i32 7, i32 39, i32 71, i32 103, i32 135, i32 167, i32 8, i32 40, i32 72, i32 104, i32 136, i32 168, i32 9, i32 41, i32 73, i32 105, i32 137, i32 169, i32 10, i32 42, i32 74, i32 106, i32 138, i32 170, i32 11, i32 43, i32 75, i32 107, i32 139, i32 171, i32 12, i32 44, i32 76, i32 108, i32 140, i32 172, i32 13, i32 45, i32 77, i32 109, i32 141, i32 173, i32 14, i32 46, i32 78, i32 110, i32 142, i32 174, i32 15, i32 47, i32 79, i32 111, i32 143, i32 175, i32 16, i32 48, i32 80, i32 112, i32 144, i32 176, i32 17, i32 49, i32 81, i32 113, i32 145, i32 177, i32 18, i32 50, i32 82, i32 114, i32 146, i32 178, i32 19, i32 51, i32 83, i32 115, i32 147, i32 179, i32 20, i32 52, i32 84, i32 116, i32 148, i32 180, i32 21, i32 53, i32 85, i32 117, i32 149, i32 181, i32 22, i32 54, i32 86, i32 118, i32 150, i32 182, i32 23, i32 55, i32 87, i32 119, i32 151, i32 183, i32 24, i32 56, i32 88, i32 120, i32 152, i32 184, i32 25, i32 57, i32 89, i32 121, i32 153, i32 185, i32 26, i32 58, i32 90, i32 122, i32 154, i32 186, i32 27, i32 59, i32 91, i32 123, i32 155, i32 187, i32 28, i32 60, i32 92, i32 124, i32 156, i32 188, i32 29, i32 61, i32 93, i32 125, i32 157, i32 189, i32 30, i32 62, i32 94, i32 126, i32 158, i32 190, i32 31, i32 63, i32 95, i32 127, i32 159, i32 191>
5160  store <192 x i16> %interleaved.vec, ptr %out.vec, align 64
5161  ret void
5162}
5163
5164define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind {
5165; SSE-LABEL: store_i16_stride6_vf64:
5166; SSE:       # %bb.0:
5167; SSE-NEXT:    subq $808, %rsp # imm = 0x328
5168; SSE-NEXT:    movdqa (%rdi), %xmm10
5169; SSE-NEXT:    movdqa 16(%rdi), %xmm11
5170; SSE-NEXT:    movdqa (%rsi), %xmm4
5171; SSE-NEXT:    movdqa 16(%rsi), %xmm1
5172; SSE-NEXT:    movdqa (%rdx), %xmm12
5173; SSE-NEXT:    movdqa 16(%rdx), %xmm2
5174; SSE-NEXT:    movdqa (%rcx), %xmm6
5175; SSE-NEXT:    movdqa 16(%rcx), %xmm3
5176; SSE-NEXT:    movdqa (%r8), %xmm9
5177; SSE-NEXT:    movdqa (%r9), %xmm8
5178; SSE-NEXT:    movdqa %xmm12, %xmm0
5179; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
5180; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5181; SSE-NEXT:    movdqa %xmm10, %xmm7
5182; SSE-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
5183; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5184; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[3,3]
5185; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm9[2,1,3,3,4,5,6,7]
5186; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[1,2],xmm5[0,1]
5187; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[2,0,1,3]
5188; SSE-NEXT:    movaps {{.*#+}} xmm14 = [65535,0,65535,65535,65535,65535,65535,0]
5189; SSE-NEXT:    andps %xmm14, %xmm7
5190; SSE-NEXT:    movdqa %xmm8, %xmm5
5191; SSE-NEXT:    movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5192; SSE-NEXT:    pshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7]
5193; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,1,2,1]
5194; SSE-NEXT:    movaps %xmm14, %xmm0
5195; SSE-NEXT:    andnps %xmm8, %xmm0
5196; SSE-NEXT:    orps %xmm7, %xmm0
5197; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5198; SSE-NEXT:    punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
5199; SSE-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5200; SSE-NEXT:    punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7]
5201; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5202; SSE-NEXT:    movdqa %xmm10, %xmm4
5203; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[2,3],xmm12[3,3]
5204; SSE-NEXT:    pshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,6,5,7,7]
5205; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,2],xmm6[2,3]
5206; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[2,0,1,3]
5207; SSE-NEXT:    andps %xmm14, %xmm4
5208; SSE-NEXT:    pshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,6,6,7]
5209; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[2,1,2,3]
5210; SSE-NEXT:    movaps %xmm14, %xmm0
5211; SSE-NEXT:    andnps %xmm6, %xmm0
5212; SSE-NEXT:    orps %xmm4, %xmm0
5213; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5214; SSE-NEXT:    movdqa %xmm2, %xmm0
5215; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
5216; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5217; SSE-NEXT:    movdqa %xmm11, %xmm4
5218; SSE-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
5219; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5220; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[2,3],xmm0[3,3]
5221; SSE-NEXT:    movdqa 16(%r8), %xmm5
5222; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm5[2,1,3,3,4,5,6,7]
5223; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5224; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,2],xmm6[0,1]
5225; SSE-NEXT:    movdqa 16(%r9), %xmm7
5226; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm7[0,2,2,3,4,5,6,7]
5227; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5228; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,1,2,1]
5229; SSE-NEXT:    movaps %xmm14, %xmm0
5230; SSE-NEXT:    andnps %xmm6, %xmm0
5231; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[2,0,1,3]
5232; SSE-NEXT:    andps %xmm14, %xmm4
5233; SSE-NEXT:    orps %xmm4, %xmm0
5234; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5235; SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
5236; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5237; SSE-NEXT:    punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7]
5238; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5239; SSE-NEXT:    movdqa %xmm11, %xmm1
5240; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,3],xmm2[3,3]
5241; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,6,5,7,7]
5242; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,2],xmm3[2,3]
5243; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,6,6,7]
5244; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
5245; SSE-NEXT:    movaps %xmm14, %xmm0
5246; SSE-NEXT:    andnps %xmm3, %xmm0
5247; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0,1,3]
5248; SSE-NEXT:    andps %xmm14, %xmm1
5249; SSE-NEXT:    orps %xmm1, %xmm0
5250; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5251; SSE-NEXT:    movdqa 32(%rdx), %xmm2
5252; SSE-NEXT:    movdqa 32(%rcx), %xmm1
5253; SSE-NEXT:    movdqa %xmm2, %xmm0
5254; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
5255; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5256; SSE-NEXT:    movdqa 32(%rdi), %xmm3
5257; SSE-NEXT:    movdqa 32(%rsi), %xmm6
5258; SSE-NEXT:    movdqa %xmm3, %xmm7
5259; SSE-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
5260; SSE-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5261; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[3,3]
5262; SSE-NEXT:    movdqa 32(%r8), %xmm4
5263; SSE-NEXT:    pshuflw {{.*#+}} xmm8 = xmm4[2,1,3,3,4,5,6,7]
5264; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5265; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[1,2],xmm8[0,1]
5266; SSE-NEXT:    movdqa 32(%r9), %xmm5
5267; SSE-NEXT:    pshuflw {{.*#+}} xmm8 = xmm5[0,2,2,3,4,5,6,7]
5268; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5269; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,1,2,1]
5270; SSE-NEXT:    movaps %xmm14, %xmm0
5271; SSE-NEXT:    andnps %xmm8, %xmm0
5272; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[2,0,1,3]
5273; SSE-NEXT:    andps %xmm14, %xmm7
5274; SSE-NEXT:    orps %xmm7, %xmm0
5275; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5276; SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
5277; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5278; SSE-NEXT:    movdqa %xmm3, %xmm1
5279; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
5280; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5281; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,3],xmm2[3,3]
5282; SSE-NEXT:    pshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,6,5,7,7]
5283; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,2],xmm6[2,3]
5284; SSE-NEXT:    pshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,6,6,7]
5285; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[2,1,2,3]
5286; SSE-NEXT:    movaps %xmm14, %xmm0
5287; SSE-NEXT:    andnps %xmm6, %xmm0
5288; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0,1,3]
5289; SSE-NEXT:    andps %xmm14, %xmm1
5290; SSE-NEXT:    orps %xmm1, %xmm0
5291; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5292; SSE-NEXT:    movdqa 48(%rdx), %xmm2
5293; SSE-NEXT:    movdqa 48(%rcx), %xmm1
5294; SSE-NEXT:    movdqa %xmm2, %xmm0
5295; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
5296; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5297; SSE-NEXT:    movdqa 48(%rdi), %xmm3
5298; SSE-NEXT:    movdqa 48(%rsi), %xmm7
5299; SSE-NEXT:    movdqa %xmm3, %xmm8
5300; SSE-NEXT:    punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
5301; SSE-NEXT:    movdqa %xmm8, (%rsp) # 16-byte Spill
5302; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[3,3]
5303; SSE-NEXT:    movdqa 48(%r8), %xmm6
5304; SSE-NEXT:    pshuflw {{.*#+}} xmm11 = xmm6[2,1,3,3,4,5,6,7]
5305; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[1,2],xmm11[0,1]
5306; SSE-NEXT:    movdqa 48(%r9), %xmm4
5307; SSE-NEXT:    pshuflw {{.*#+}} xmm11 = xmm4[0,2,2,3,4,5,6,7]
5308; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5309; SSE-NEXT:    pshufd {{.*#+}} xmm11 = xmm11[0,1,2,1]
5310; SSE-NEXT:    movaps %xmm14, %xmm0
5311; SSE-NEXT:    andnps %xmm11, %xmm0
5312; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[2,0,1,3]
5313; SSE-NEXT:    andps %xmm14, %xmm8
5314; SSE-NEXT:    orps %xmm8, %xmm0
5315; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5316; SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
5317; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5318; SSE-NEXT:    movdqa %xmm3, %xmm1
5319; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
5320; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5321; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,3],xmm2[3,3]
5322; SSE-NEXT:    pshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,6,5,7,7]
5323; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,2],xmm7[2,3]
5324; SSE-NEXT:    pshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,4,6,6,7]
5325; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
5326; SSE-NEXT:    movaps %xmm14, %xmm0
5327; SSE-NEXT:    andnps %xmm7, %xmm0
5328; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0,1,3]
5329; SSE-NEXT:    andps %xmm14, %xmm1
5330; SSE-NEXT:    orps %xmm1, %xmm0
5331; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5332; SSE-NEXT:    movdqa 64(%rdx), %xmm2
5333; SSE-NEXT:    movdqa 64(%rcx), %xmm1
5334; SSE-NEXT:    movdqa %xmm2, %xmm0
5335; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
5336; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5337; SSE-NEXT:    movdqa 64(%rdi), %xmm3
5338; SSE-NEXT:    movdqa 64(%rsi), %xmm8
5339; SSE-NEXT:    movdqa %xmm3, %xmm11
5340; SSE-NEXT:    punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3]
5341; SSE-NEXT:    movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5342; SSE-NEXT:    shufps {{.*#+}} xmm11 = xmm11[2,3],xmm0[3,3]
5343; SSE-NEXT:    movdqa 64(%r8), %xmm7
5344; SSE-NEXT:    pshuflw {{.*#+}} xmm12 = xmm7[2,1,3,3,4,5,6,7]
5345; SSE-NEXT:    shufps {{.*#+}} xmm11 = xmm11[1,2],xmm12[0,1]
5346; SSE-NEXT:    movdqa 64(%r9), %xmm4
5347; SSE-NEXT:    pshuflw {{.*#+}} xmm12 = xmm4[0,2,2,3,4,5,6,7]
5348; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5349; SSE-NEXT:    pshufd {{.*#+}} xmm12 = xmm12[0,1,2,1]
5350; SSE-NEXT:    movaps %xmm14, %xmm0
5351; SSE-NEXT:    andnps %xmm12, %xmm0
5352; SSE-NEXT:    shufps {{.*#+}} xmm11 = xmm11[2,0,1,3]
5353; SSE-NEXT:    andps %xmm14, %xmm11
5354; SSE-NEXT:    orps %xmm11, %xmm0
5355; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5356; SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
5357; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5358; SSE-NEXT:    movdqa %xmm3, %xmm1
5359; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
5360; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5361; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,3],xmm2[3,3]
5362; SSE-NEXT:    pshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,6,5,7,7]
5363; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,2],xmm8[2,3]
5364; SSE-NEXT:    pshufhw {{.*#+}} xmm8 = xmm4[0,1,2,3,4,6,6,7]
5365; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[2,1,2,3]
5366; SSE-NEXT:    movaps %xmm14, %xmm0
5367; SSE-NEXT:    andnps %xmm8, %xmm0
5368; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0,1,3]
5369; SSE-NEXT:    andps %xmm14, %xmm1
5370; SSE-NEXT:    orps %xmm1, %xmm0
5371; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5372; SSE-NEXT:    movdqa 80(%rdx), %xmm2
5373; SSE-NEXT:    movdqa 80(%rcx), %xmm1
5374; SSE-NEXT:    movdqa %xmm2, %xmm0
5375; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
5376; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5377; SSE-NEXT:    movdqa 80(%rdi), %xmm3
5378; SSE-NEXT:    movdqa 80(%rsi), %xmm11
5379; SSE-NEXT:    movdqa %xmm3, %xmm12
5380; SSE-NEXT:    punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
5381; SSE-NEXT:    movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5382; SSE-NEXT:    shufps {{.*#+}} xmm12 = xmm12[2,3],xmm0[3,3]
5383; SSE-NEXT:    movdqa 80(%r8), %xmm8
5384; SSE-NEXT:    pshuflw {{.*#+}} xmm15 = xmm8[2,1,3,3,4,5,6,7]
5385; SSE-NEXT:    shufps {{.*#+}} xmm12 = xmm12[1,2],xmm15[0,1]
5386; SSE-NEXT:    movdqa 80(%r9), %xmm4
5387; SSE-NEXT:    pshuflw {{.*#+}} xmm15 = xmm4[0,2,2,3,4,5,6,7]
5388; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5389; SSE-NEXT:    pshufd {{.*#+}} xmm15 = xmm15[0,1,2,1]
5390; SSE-NEXT:    movaps %xmm14, %xmm0
5391; SSE-NEXT:    andnps %xmm15, %xmm0
5392; SSE-NEXT:    shufps {{.*#+}} xmm12 = xmm12[2,0,1,3]
5393; SSE-NEXT:    andps %xmm14, %xmm12
5394; SSE-NEXT:    orps %xmm12, %xmm0
5395; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5396; SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
5397; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5398; SSE-NEXT:    movdqa %xmm3, %xmm1
5399; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7]
5400; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5401; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,3],xmm2[3,3]
5402; SSE-NEXT:    pshufhw {{.*#+}} xmm11 = xmm8[0,1,2,3,6,5,7,7]
5403; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,2],xmm11[2,3]
5404; SSE-NEXT:    pshufhw {{.*#+}} xmm11 = xmm4[0,1,2,3,4,6,6,7]
5405; SSE-NEXT:    pshufd {{.*#+}} xmm11 = xmm11[2,1,2,3]
5406; SSE-NEXT:    movaps %xmm14, %xmm0
5407; SSE-NEXT:    andnps %xmm11, %xmm0
5408; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0,1,3]
5409; SSE-NEXT:    andps %xmm14, %xmm1
5410; SSE-NEXT:    orps %xmm1, %xmm0
5411; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5412; SSE-NEXT:    movdqa 96(%rdx), %xmm2
5413; SSE-NEXT:    movdqa 96(%rcx), %xmm1
5414; SSE-NEXT:    movdqa %xmm2, %xmm0
5415; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
5416; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5417; SSE-NEXT:    movdqa 96(%rdi), %xmm3
5418; SSE-NEXT:    movdqa 96(%rsi), %xmm12
5419; SSE-NEXT:    movdqa %xmm3, %xmm15
5420; SSE-NEXT:    punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3]
5421; SSE-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5422; SSE-NEXT:    shufps {{.*#+}} xmm15 = xmm15[2,3],xmm0[3,3]
5423; SSE-NEXT:    movdqa 96(%r8), %xmm11
5424; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm11[2,1,3,3,4,5,6,7]
5425; SSE-NEXT:    shufps {{.*#+}} xmm15 = xmm15[1,2],xmm0[0,1]
5426; SSE-NEXT:    movdqa 96(%r9), %xmm10
5427; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm10[0,2,2,3,4,5,6,7]
5428; SSE-NEXT:    movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5429; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
5430; SSE-NEXT:    movaps %xmm14, %xmm13
5431; SSE-NEXT:    andnps %xmm0, %xmm13
5432; SSE-NEXT:    shufps {{.*#+}} xmm15 = xmm15[2,0,1,3]
5433; SSE-NEXT:    andps %xmm14, %xmm15
5434; SSE-NEXT:    orps %xmm15, %xmm13
5435; SSE-NEXT:    movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5436; SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
5437; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5438; SSE-NEXT:    movdqa %xmm3, %xmm0
5439; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
5440; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5441; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,3],xmm2[3,3]
5442; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,6,5,7,7]
5443; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,3]
5444; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm10[0,1,2,3,4,6,6,7]
5445; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
5446; SSE-NEXT:    movaps %xmm14, %xmm12
5447; SSE-NEXT:    andnps %xmm1, %xmm12
5448; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
5449; SSE-NEXT:    andps %xmm14, %xmm0
5450; SSE-NEXT:    orps %xmm0, %xmm12
5451; SSE-NEXT:    movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5452; SSE-NEXT:    movdqa 112(%rdx), %xmm4
5453; SSE-NEXT:    movdqa 112(%rcx), %xmm5
5454; SSE-NEXT:    movdqa %xmm4, %xmm2
5455; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
5456; SSE-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5457; SSE-NEXT:    movdqa 112(%rdi), %xmm0
5458; SSE-NEXT:    movdqa 112(%rsi), %xmm1
5459; SSE-NEXT:    movdqa %xmm0, %xmm15
5460; SSE-NEXT:    punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3]
5461; SSE-NEXT:    movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5462; SSE-NEXT:    shufps {{.*#+}} xmm15 = xmm15[2,3],xmm2[3,3]
5463; SSE-NEXT:    movdqa 112(%r8), %xmm3
5464; SSE-NEXT:    pshuflw {{.*#+}} xmm13 = xmm3[2,1,3,3,4,5,6,7]
5465; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5466; SSE-NEXT:    shufps {{.*#+}} xmm15 = xmm15[1,2],xmm13[0,1]
5467; SSE-NEXT:    movdqa 112(%r9), %xmm2
5468; SSE-NEXT:    pshuflw {{.*#+}} xmm13 = xmm2[0,2,2,3,4,5,6,7]
5469; SSE-NEXT:    pshufd {{.*#+}} xmm13 = xmm13[0,1,2,1]
5470; SSE-NEXT:    movaps %xmm14, %xmm12
5471; SSE-NEXT:    andnps %xmm13, %xmm12
5472; SSE-NEXT:    shufps {{.*#+}} xmm15 = xmm15[2,0,1,3]
5473; SSE-NEXT:    andps %xmm14, %xmm15
5474; SSE-NEXT:    orps %xmm15, %xmm12
5475; SSE-NEXT:    movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5476; SSE-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
5477; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5478; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
5479; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5480; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,3],xmm4[3,3]
5481; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,5,7,7]
5482; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,3]
5483; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
5484; SSE-NEXT:    andps %xmm14, %xmm0
5485; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,6,6,7]
5486; SSE-NEXT:    movdqa %xmm2, %xmm15
5487; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
5488; SSE-NEXT:    andnps %xmm1, %xmm14
5489; SSE-NEXT:    orps %xmm0, %xmm14
5490; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5491; SSE-NEXT:    movaps %xmm4, %xmm0
5492; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5493; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
5494; SSE-NEXT:    movdqa %xmm9, %xmm1
5495; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[1,3]
5496; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,2]
5497; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
5498; SSE-NEXT:    movdqa %xmm2, %xmm13
5499; SSE-NEXT:    pslldq {{.*#+}} xmm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm13[0,1,2,3,4,5]
5500; SSE-NEXT:    movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,0,65535,65535]
5501; SSE-NEXT:    movdqa %xmm12, %xmm1
5502; SSE-NEXT:    pandn %xmm13, %xmm1
5503; SSE-NEXT:    andps %xmm12, %xmm0
5504; SSE-NEXT:    por %xmm0, %xmm1
5505; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5506; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1]
5507; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm9[1,1,1,1,4,5,6,7]
5508; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1]
5509; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[0,2]
5510; SSE-NEXT:    movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,65535,65535,65535,65535]
5511; SSE-NEXT:    pshufd {{.*#+}} xmm13 = xmm2[0,0,1,1]
5512; SSE-NEXT:    movdqa %xmm10, %xmm1
5513; SSE-NEXT:    pandn %xmm13, %xmm1
5514; SSE-NEXT:    andps %xmm10, %xmm0
5515; SSE-NEXT:    por %xmm0, %xmm1
5516; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5517; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5518; SSE-NEXT:    movaps %xmm4, %xmm0
5519; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5520; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
5521; SSE-NEXT:    movdqa %xmm9, %xmm13
5522; SSE-NEXT:    shufps {{.*#+}} xmm13 = xmm13[2,1],xmm3[1,3]
5523; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm13[0,2]
5524; SSE-NEXT:    movdqa %xmm2, %xmm1
5525; SSE-NEXT:    pshufd {{.*#+}} xmm13 = xmm2[2,2,3,3]
5526; SSE-NEXT:    pslld $16, %xmm1
5527; SSE-NEXT:    movdqa %xmm1, %xmm2
5528; SSE-NEXT:    movdqa %xmm12, %xmm1
5529; SSE-NEXT:    pandn %xmm2, %xmm1
5530; SSE-NEXT:    andps %xmm12, %xmm0
5531; SSE-NEXT:    por %xmm0, %xmm1
5532; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5533; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1]
5534; SSE-NEXT:    psrldq {{.*#+}} xmm9 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
5535; SSE-NEXT:    shufps {{.*#+}} xmm9 = xmm9[1,1],xmm4[1,1]
5536; SSE-NEXT:    shufps {{.*#+}} xmm9 = xmm9[2,0],xmm3[0,2]
5537; SSE-NEXT:    movdqa %xmm10, %xmm0
5538; SSE-NEXT:    pandn %xmm13, %xmm0
5539; SSE-NEXT:    andps %xmm10, %xmm9
5540; SSE-NEXT:    por %xmm9, %xmm0
5541; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5542; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
5543; SSE-NEXT:    movaps %xmm9, %xmm0
5544; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
5545; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0]
5546; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5547; SSE-NEXT:    movaps %xmm3, %xmm2
5548; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm5[1,3]
5549; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2]
5550; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5551; SSE-NEXT:    movdqa %xmm4, %xmm2
5552; SSE-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5]
5553; SSE-NEXT:    movdqa %xmm12, %xmm1
5554; SSE-NEXT:    pandn %xmm2, %xmm1
5555; SSE-NEXT:    andps %xmm12, %xmm0
5556; SSE-NEXT:    por %xmm0, %xmm1
5557; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5558; SSE-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm9[1]
5559; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm3[1,1,1,1,4,5,6,7]
5560; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1]
5561; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[0,2]
5562; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[0,0,1,1]
5563; SSE-NEXT:    movdqa %xmm4, %xmm5
5564; SSE-NEXT:    movdqa %xmm10, %xmm1
5565; SSE-NEXT:    pandn %xmm2, %xmm1
5566; SSE-NEXT:    andps %xmm10, %xmm0
5567; SSE-NEXT:    por %xmm0, %xmm1
5568; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5569; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
5570; SSE-NEXT:    movaps %xmm9, %xmm0
5571; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5572; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
5573; SSE-NEXT:    movaps %xmm3, %xmm2
5574; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,1],xmm4[1,3]
5575; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2]
5576; SSE-NEXT:    movdqa %xmm5, %xmm1
5577; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm5[2,2,3,3]
5578; SSE-NEXT:    pslld $16, %xmm1
5579; SSE-NEXT:    movdqa %xmm12, %xmm5
5580; SSE-NEXT:    pandn %xmm1, %xmm5
5581; SSE-NEXT:    andps %xmm12, %xmm0
5582; SSE-NEXT:    por %xmm0, %xmm5
5583; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5584; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm9[1]
5585; SSE-NEXT:    movdqa %xmm3, %xmm0
5586; SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
5587; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1]
5588; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[0,2]
5589; SSE-NEXT:    movdqa %xmm10, %xmm1
5590; SSE-NEXT:    pandn %xmm2, %xmm1
5591; SSE-NEXT:    andps %xmm10, %xmm0
5592; SSE-NEXT:    por %xmm0, %xmm1
5593; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5594; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
5595; SSE-NEXT:    movaps %xmm5, %xmm0
5596; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5597; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
5598; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5599; SSE-NEXT:    movaps %xmm1, %xmm2
5600; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[1,3]
5601; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2]
5602; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5603; SSE-NEXT:    movdqa %xmm3, %xmm2
5604; SSE-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5]
5605; SSE-NEXT:    movdqa %xmm12, %xmm9
5606; SSE-NEXT:    pandn %xmm2, %xmm9
5607; SSE-NEXT:    andps %xmm12, %xmm0
5608; SSE-NEXT:    por %xmm0, %xmm9
5609; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5610; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1]
5611; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[1,1,1,1,4,5,6,7]
5612; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[1,1]
5613; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[0,2]
5614; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1]
5615; SSE-NEXT:    movdqa %xmm10, %xmm4
5616; SSE-NEXT:    pandn %xmm2, %xmm4
5617; SSE-NEXT:    andps %xmm10, %xmm0
5618; SSE-NEXT:    por %xmm0, %xmm4
5619; SSE-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5620; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
5621; SSE-NEXT:    movaps %xmm5, %xmm0
5622; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5623; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
5624; SSE-NEXT:    movaps %xmm1, %xmm2
5625; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,1],xmm4[1,3]
5626; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2]
5627; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,2,3,3]
5628; SSE-NEXT:    pslld $16, %xmm3
5629; SSE-NEXT:    movdqa %xmm12, %xmm9
5630; SSE-NEXT:    pandn %xmm3, %xmm9
5631; SSE-NEXT:    andps %xmm12, %xmm0
5632; SSE-NEXT:    por %xmm0, %xmm9
5633; SSE-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5634; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1]
5635; SSE-NEXT:    movdqa %xmm1, %xmm0
5636; SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
5637; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[1,1]
5638; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[0,2]
5639; SSE-NEXT:    movdqa %xmm10, %xmm1
5640; SSE-NEXT:    pandn %xmm2, %xmm1
5641; SSE-NEXT:    andps %xmm10, %xmm0
5642; SSE-NEXT:    por %xmm0, %xmm1
5643; SSE-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5644; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5645; SSE-NEXT:    movaps %xmm4, %xmm0
5646; SSE-NEXT:    movaps (%rsp), %xmm3 # 16-byte Reload
5647; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
5648; SSE-NEXT:    movdqa %xmm6, %xmm2
5649; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[1,3]
5650; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2]
5651; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5652; SSE-NEXT:    movdqa %xmm1, %xmm2
5653; SSE-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5]
5654; SSE-NEXT:    movdqa %xmm12, %xmm5
5655; SSE-NEXT:    pandn %xmm2, %xmm5
5656; SSE-NEXT:    andps %xmm12, %xmm0
5657; SSE-NEXT:    por %xmm0, %xmm5
5658; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5659; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1]
5660; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm6[1,1,1,1,4,5,6,7]
5661; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1]
5662; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[0,2]
5663; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
5664; SSE-NEXT:    movdqa %xmm10, %xmm3
5665; SSE-NEXT:    pandn %xmm2, %xmm3
5666; SSE-NEXT:    andps %xmm10, %xmm0
5667; SSE-NEXT:    por %xmm0, %xmm3
5668; SSE-NEXT:    movdqa %xmm3, (%rsp) # 16-byte Spill
5669; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5670; SSE-NEXT:    movaps %xmm4, %xmm0
5671; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5672; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
5673; SSE-NEXT:    movdqa %xmm6, %xmm2
5674; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,1],xmm3[1,3]
5675; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2]
5676; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
5677; SSE-NEXT:    pslld $16, %xmm1
5678; SSE-NEXT:    movdqa %xmm12, %xmm5
5679; SSE-NEXT:    pandn %xmm1, %xmm5
5680; SSE-NEXT:    andps %xmm12, %xmm0
5681; SSE-NEXT:    por %xmm0, %xmm5
5682; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5683; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1]
5684; SSE-NEXT:    psrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
5685; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[1,1],xmm4[1,1]
5686; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[2,0],xmm3[0,2]
5687; SSE-NEXT:    movdqa %xmm10, %xmm0
5688; SSE-NEXT:    pandn %xmm2, %xmm0
5689; SSE-NEXT:    andps %xmm10, %xmm6
5690; SSE-NEXT:    por %xmm6, %xmm0
5691; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5692; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5693; SSE-NEXT:    movaps %xmm4, %xmm0
5694; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5695; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
5696; SSE-NEXT:    movdqa %xmm7, %xmm2
5697; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[1,3]
5698; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2]
5699; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5700; SSE-NEXT:    movdqa %xmm1, %xmm2
5701; SSE-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5]
5702; SSE-NEXT:    movdqa %xmm12, %xmm5
5703; SSE-NEXT:    pandn %xmm2, %xmm5
5704; SSE-NEXT:    andps %xmm12, %xmm0
5705; SSE-NEXT:    por %xmm0, %xmm5
5706; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5707; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1]
5708; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm7[1,1,1,1,4,5,6,7]
5709; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1]
5710; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[0,2]
5711; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
5712; SSE-NEXT:    movdqa %xmm10, %xmm3
5713; SSE-NEXT:    pandn %xmm2, %xmm3
5714; SSE-NEXT:    andps %xmm10, %xmm0
5715; SSE-NEXT:    por %xmm0, %xmm3
5716; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5717; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5718; SSE-NEXT:    movaps %xmm4, %xmm0
5719; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5720; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
5721; SSE-NEXT:    movdqa %xmm7, %xmm2
5722; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,1],xmm3[1,3]
5723; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2]
5724; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
5725; SSE-NEXT:    pslld $16, %xmm1
5726; SSE-NEXT:    movdqa %xmm12, %xmm5
5727; SSE-NEXT:    pandn %xmm1, %xmm5
5728; SSE-NEXT:    andps %xmm12, %xmm0
5729; SSE-NEXT:    por %xmm0, %xmm5
5730; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5731; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1]
5732; SSE-NEXT:    psrldq {{.*#+}} xmm7 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
5733; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[1,1],xmm4[1,1]
5734; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[2,0],xmm3[0,2]
5735; SSE-NEXT:    movdqa %xmm10, %xmm0
5736; SSE-NEXT:    pandn %xmm2, %xmm0
5737; SSE-NEXT:    andps %xmm10, %xmm7
5738; SSE-NEXT:    por %xmm7, %xmm0
5739; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5740; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5741; SSE-NEXT:    movaps %xmm4, %xmm0
5742; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5743; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
5744; SSE-NEXT:    movdqa %xmm8, %xmm2
5745; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[1,3]
5746; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2]
5747; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5748; SSE-NEXT:    movdqa %xmm1, %xmm2
5749; SSE-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5]
5750; SSE-NEXT:    movdqa %xmm12, %xmm5
5751; SSE-NEXT:    pandn %xmm2, %xmm5
5752; SSE-NEXT:    andps %xmm12, %xmm0
5753; SSE-NEXT:    por %xmm0, %xmm5
5754; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5755; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1]
5756; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm8[1,1,1,1,4,5,6,7]
5757; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1]
5758; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[0,2]
5759; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
5760; SSE-NEXT:    movdqa %xmm10, %xmm3
5761; SSE-NEXT:    pandn %xmm2, %xmm3
5762; SSE-NEXT:    andps %xmm10, %xmm0
5763; SSE-NEXT:    por %xmm0, %xmm3
5764; SSE-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5765; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5766; SSE-NEXT:    movaps %xmm4, %xmm0
5767; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5768; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
5769; SSE-NEXT:    movdqa %xmm8, %xmm2
5770; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,1],xmm3[1,3]
5771; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2]
5772; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
5773; SSE-NEXT:    pslld $16, %xmm1
5774; SSE-NEXT:    movdqa %xmm12, %xmm5
5775; SSE-NEXT:    pandn %xmm1, %xmm5
5776; SSE-NEXT:    andps %xmm12, %xmm0
5777; SSE-NEXT:    por %xmm0, %xmm5
5778; SSE-NEXT:    movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5779; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1]
5780; SSE-NEXT:    psrldq {{.*#+}} xmm8 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
5781; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[1,1],xmm4[1,1]
5782; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[2,0],xmm3[0,2]
5783; SSE-NEXT:    movdqa %xmm10, %xmm0
5784; SSE-NEXT:    pandn %xmm2, %xmm0
5785; SSE-NEXT:    andps %xmm10, %xmm8
5786; SSE-NEXT:    por %xmm8, %xmm0
5787; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5788; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5789; SSE-NEXT:    movaps %xmm4, %xmm0
5790; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5791; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
5792; SSE-NEXT:    movdqa %xmm11, %xmm2
5793; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[1,3]
5794; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2]
5795; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5796; SSE-NEXT:    movdqa %xmm1, %xmm2
5797; SSE-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5]
5798; SSE-NEXT:    movdqa %xmm12, %xmm13
5799; SSE-NEXT:    pandn %xmm2, %xmm13
5800; SSE-NEXT:    andps %xmm12, %xmm0
5801; SSE-NEXT:    por %xmm0, %xmm13
5802; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1]
5803; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm11[1,1,1,1,4,5,6,7]
5804; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1]
5805; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[0,2]
5806; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
5807; SSE-NEXT:    movdqa %xmm10, %xmm9
5808; SSE-NEXT:    pandn %xmm2, %xmm9
5809; SSE-NEXT:    andps %xmm10, %xmm0
5810; SSE-NEXT:    por %xmm0, %xmm9
5811; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
5812; SSE-NEXT:    movaps %xmm7, %xmm0
5813; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5814; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
5815; SSE-NEXT:    movdqa %xmm11, %xmm2
5816; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,1],xmm3[1,3]
5817; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2]
5818; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[2,2,3,3]
5819; SSE-NEXT:    pslld $16, %xmm1
5820; SSE-NEXT:    movdqa %xmm12, %xmm6
5821; SSE-NEXT:    pandn %xmm1, %xmm6
5822; SSE-NEXT:    andps %xmm12, %xmm0
5823; SSE-NEXT:    por %xmm0, %xmm6
5824; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm7[1]
5825; SSE-NEXT:    psrldq {{.*#+}} xmm11 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
5826; SSE-NEXT:    shufps {{.*#+}} xmm11 = xmm11[1,1],xmm7[1,1]
5827; SSE-NEXT:    shufps {{.*#+}} xmm11 = xmm11[2,0],xmm3[0,2]
5828; SSE-NEXT:    movdqa %xmm10, %xmm4
5829; SSE-NEXT:    pandn %xmm5, %xmm4
5830; SSE-NEXT:    andps %xmm10, %xmm11
5831; SSE-NEXT:    por %xmm11, %xmm4
5832; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
5833; SSE-NEXT:    movaps %xmm7, %xmm0
5834; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5835; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
5836; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5837; SSE-NEXT:    movaps %xmm1, %xmm5
5838; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[1,3]
5839; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[0,2]
5840; SSE-NEXT:    movdqa %xmm15, %xmm5
5841; SSE-NEXT:    pslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5]
5842; SSE-NEXT:    movdqa %xmm12, %xmm11
5843; SSE-NEXT:    pandn %xmm5, %xmm11
5844; SSE-NEXT:    andps %xmm12, %xmm0
5845; SSE-NEXT:    por %xmm0, %xmm11
5846; SSE-NEXT:    movaps %xmm7, %xmm0
5847; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm7[1]
5848; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm1[1,1,1,1,4,5,6,7]
5849; SSE-NEXT:    movaps %xmm1, %xmm7
5850; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1]
5851; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[0,2]
5852; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm15[0,0,1,1]
5853; SSE-NEXT:    movdqa %xmm15, %xmm8
5854; SSE-NEXT:    movdqa %xmm10, %xmm15
5855; SSE-NEXT:    pandn %xmm1, %xmm15
5856; SSE-NEXT:    andps %xmm10, %xmm5
5857; SSE-NEXT:    por %xmm5, %xmm15
5858; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5859; SSE-NEXT:    movaps %xmm3, %xmm1
5860; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
5861; SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
5862; SSE-NEXT:    movaps %xmm7, %xmm5
5863; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,1],xmm2[1,3]
5864; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[0,2]
5865; SSE-NEXT:    andps %xmm12, %xmm1
5866; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm8[2,2,3,3]
5867; SSE-NEXT:    pslld $16, %xmm8
5868; SSE-NEXT:    pandn %xmm8, %xmm12
5869; SSE-NEXT:    por %xmm1, %xmm12
5870; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
5871; SSE-NEXT:    psrldq {{.*#+}} xmm7 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
5872; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[1,1],xmm3[1,1]
5873; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[2,0],xmm2[0,2]
5874; SSE-NEXT:    andps %xmm10, %xmm7
5875; SSE-NEXT:    pandn %xmm5, %xmm10
5876; SSE-NEXT:    por %xmm7, %xmm10
5877; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
5878; SSE-NEXT:    movdqa %xmm10, 736(%rax)
5879; SSE-NEXT:    movdqa %xmm12, 720(%rax)
5880; SSE-NEXT:    movdqa %xmm15, 688(%rax)
5881; SSE-NEXT:    movdqa %xmm11, 672(%rax)
5882; SSE-NEXT:    movdqa %xmm4, 640(%rax)
5883; SSE-NEXT:    movdqa %xmm6, 624(%rax)
5884; SSE-NEXT:    movdqa %xmm9, 592(%rax)
5885; SSE-NEXT:    movdqa %xmm13, 576(%rax)
5886; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5887; SSE-NEXT:    movaps %xmm0, 544(%rax)
5888; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5889; SSE-NEXT:    movaps %xmm0, 528(%rax)
5890; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5891; SSE-NEXT:    movaps %xmm0, 496(%rax)
5892; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5893; SSE-NEXT:    movaps %xmm0, 480(%rax)
5894; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5895; SSE-NEXT:    movaps %xmm0, 448(%rax)
5896; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5897; SSE-NEXT:    movaps %xmm0, 432(%rax)
5898; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5899; SSE-NEXT:    movaps %xmm0, 400(%rax)
5900; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5901; SSE-NEXT:    movaps %xmm0, 384(%rax)
5902; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5903; SSE-NEXT:    movaps %xmm0, 352(%rax)
5904; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5905; SSE-NEXT:    movaps %xmm0, 336(%rax)
5906; SSE-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
5907; SSE-NEXT:    movaps %xmm0, 304(%rax)
5908; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5909; SSE-NEXT:    movaps %xmm0, 288(%rax)
5910; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5911; SSE-NEXT:    movaps %xmm0, 256(%rax)
5912; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5913; SSE-NEXT:    movaps %xmm0, 240(%rax)
5914; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5915; SSE-NEXT:    movaps %xmm0, 208(%rax)
5916; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5917; SSE-NEXT:    movaps %xmm0, 192(%rax)
5918; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5919; SSE-NEXT:    movaps %xmm0, 160(%rax)
5920; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5921; SSE-NEXT:    movaps %xmm0, 144(%rax)
5922; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5923; SSE-NEXT:    movaps %xmm0, 112(%rax)
5924; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5925; SSE-NEXT:    movaps %xmm0, 96(%rax)
5926; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5927; SSE-NEXT:    movaps %xmm0, 64(%rax)
5928; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5929; SSE-NEXT:    movaps %xmm0, 48(%rax)
5930; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5931; SSE-NEXT:    movaps %xmm0, 16(%rax)
5932; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5933; SSE-NEXT:    movaps %xmm0, (%rax)
5934; SSE-NEXT:    movaps %xmm14, 752(%rax)
5935; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5936; SSE-NEXT:    movaps %xmm0, 704(%rax)
5937; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5938; SSE-NEXT:    movaps %xmm0, 656(%rax)
5939; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5940; SSE-NEXT:    movaps %xmm0, 608(%rax)
5941; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5942; SSE-NEXT:    movaps %xmm0, 560(%rax)
5943; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5944; SSE-NEXT:    movaps %xmm0, 512(%rax)
5945; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5946; SSE-NEXT:    movaps %xmm0, 464(%rax)
5947; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5948; SSE-NEXT:    movaps %xmm0, 416(%rax)
5949; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5950; SSE-NEXT:    movaps %xmm0, 368(%rax)
5951; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5952; SSE-NEXT:    movaps %xmm0, 320(%rax)
5953; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5954; SSE-NEXT:    movaps %xmm0, 272(%rax)
5955; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5956; SSE-NEXT:    movaps %xmm0, 224(%rax)
5957; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5958; SSE-NEXT:    movaps %xmm0, 176(%rax)
5959; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5960; SSE-NEXT:    movaps %xmm0, 128(%rax)
5961; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5962; SSE-NEXT:    movaps %xmm0, 80(%rax)
5963; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5964; SSE-NEXT:    movaps %xmm0, 32(%rax)
5965; SSE-NEXT:    addq $808, %rsp # imm = 0x328
5966; SSE-NEXT:    retq
5967;
5968; AVX-LABEL: store_i16_stride6_vf64:
5969; AVX:       # %bb.0:
5970; AVX-NEXT:    subq $504, %rsp # imm = 0x1F8
5971; AVX-NEXT:    vmovdqa 80(%rcx), %xmm1
5972; AVX-NEXT:    vmovdqa 80(%rdx), %xmm2
5973; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
5974; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,2,3,3]
5975; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
5976; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm4[0,0,1,1]
5977; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm2
5978; AVX-NEXT:    vmovdqa 80(%rsi), %xmm3
5979; AVX-NEXT:    vmovdqa 80(%rdi), %xmm5
5980; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
5981; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
5982; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
5983; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm5[0,1,0,1]
5984; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm6, %ymm3
5985; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
5986; AVX-NEXT:    vmovdqa 80(%r8), %xmm2
5987; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm2[2,1,3,3,4,5,6,7]
5988; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
5989; AVX-NEXT:    vblendps {{.*#+}} xmm7 = xmm3[0],xmm6[1,2],xmm3[3]
5990; AVX-NEXT:    vmovdqa 80(%r9), %xmm3
5991; AVX-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm3[0,2,2,3,4,5,6,7]
5992; AVX-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,1]
5993; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5,6],xmm8[7]
5994; AVX-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5995; AVX-NEXT:    vextractf128 $1, %ymm6, %xmm6
5996; AVX-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4,5],xmm6[6,7]
5997; AVX-NEXT:    vpslld $16, %xmm3, %xmm7
5998; AVX-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5],xmm6[6,7]
5999; AVX-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6000; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3]
6001; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm5, %ymm5
6002; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm4[1,1,2,2]
6003; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3]
6004; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm6, %ymm4
6005; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
6006; AVX-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
6007; AVX-NEXT:    vblendps {{.*#+}} xmm5 = xmm4[0],xmm5[1],xmm4[2,3]
6008; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm3[2,2,3,3]
6009; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3],xmm5[4,5,6,7]
6010; AVX-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6011; AVX-NEXT:    vextractf128 $1, %ymm4, %xmm4
6012; AVX-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,6,5,7,7]
6013; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3]
6014; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3,4,5],xmm5[6,7]
6015; AVX-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,6,6,7]
6016; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3]
6017; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5,6],xmm5[7]
6018; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6019; AVX-NEXT:    vmovdqa 64(%rsi), %xmm8
6020; AVX-NEXT:    vmovdqa 64(%rdi), %xmm9
6021; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
6022; AVX-NEXT:    vmovdqa 64(%rcx), %xmm10
6023; AVX-NEXT:    vmovdqa 64(%rdx), %xmm11
6024; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
6025; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm7[1,1,2,2]
6026; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm7[2,2,3,3]
6027; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm5, %ymm5
6028; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm4[2,3,2,3]
6029; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm4, %ymm6
6030; AVX-NEXT:    vblendps {{.*#+}} ymm12 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7]
6031; AVX-NEXT:    vmovdqa 64(%r8), %xmm5
6032; AVX-NEXT:    vpsrldq {{.*#+}} xmm6 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
6033; AVX-NEXT:    vblendps {{.*#+}} xmm13 = xmm12[0],xmm6[1],xmm12[2,3]
6034; AVX-NEXT:    vmovdqa 64(%r9), %xmm6
6035; AVX-NEXT:    vpshufd {{.*#+}} xmm14 = xmm6[2,2,3,3]
6036; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3],xmm13[4,5,6,7]
6037; AVX-NEXT:    vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6038; AVX-NEXT:    vextractf128 $1, %ymm12, %xmm12
6039; AVX-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm5[0,1,2,3,6,5,7,7]
6040; AVX-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3]
6041; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3,4,5],xmm13[6,7]
6042; AVX-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm6[0,1,2,3,4,6,6,7]
6043; AVX-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3]
6044; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6],xmm13[7]
6045; AVX-NEXT:    vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6046; AVX-NEXT:    vpshufd {{.*#+}} xmm12 = xmm0[0,0,1,1]
6047; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
6048; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm12, %ymm0
6049; AVX-NEXT:    vpshufd {{.*#+}} xmm12 = xmm1[0,1,0,1]
6050; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm12, %ymm1
6051; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
6052; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm0[0,1],xmm2[0],xmm0[3]
6053; AVX-NEXT:    vpslldq {{.*#+}} xmm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5]
6054; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm12[5],xmm1[6,7]
6055; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6056; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
6057; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
6058; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
6059; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm3[0,0,1,1]
6060; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
6061; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6062; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
6063; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
6064; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,2]
6065; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
6066; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
6067; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
6068; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm3
6069; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7]
6070; AVX-NEXT:    vinsertps {{.*#+}} xmm3 = xmm1[0,1],xmm5[0],xmm1[3]
6071; AVX-NEXT:    vpslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5]
6072; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm8[5],xmm3[6,7]
6073; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6074; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
6075; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
6076; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7]
6077; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm6[0,0,1,1]
6078; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3],xmm1[4,5,6,7]
6079; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6080; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
6081; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm7[0,0,1,1]
6082; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
6083; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
6084; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm4[0,1,0,1]
6085; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
6086; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
6087; AVX-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm5[2,1,3,3,4,5,6,7]
6088; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
6089; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3]
6090; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm6[0,2,2,3,4,5,6,7]
6091; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
6092; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6],xmm2[7]
6093; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6094; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
6095; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4,5],xmm0[6,7]
6096; AVX-NEXT:    vpslld $16, %xmm6, %xmm1
6097; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
6098; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6099; AVX-NEXT:    vmovdqa 48(%rcx), %xmm1
6100; AVX-NEXT:    vmovdqa 48(%rdx), %xmm2
6101; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
6102; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,2,3,3]
6103; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
6104; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm4[0,0,1,1]
6105; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm2
6106; AVX-NEXT:    vmovdqa 48(%rsi), %xmm3
6107; AVX-NEXT:    vmovdqa 48(%rdi), %xmm5
6108; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
6109; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
6110; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
6111; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm5[0,1,0,1]
6112; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm3, %ymm3
6113; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
6114; AVX-NEXT:    vmovdqa 48(%r8), %xmm2
6115; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm2[2,1,3,3,4,5,6,7]
6116; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
6117; AVX-NEXT:    vblendps {{.*#+}} xmm7 = xmm3[0],xmm6[1,2],xmm3[3]
6118; AVX-NEXT:    vmovdqa 48(%r9), %xmm3
6119; AVX-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm3[0,2,2,3,4,5,6,7]
6120; AVX-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,1]
6121; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5,6],xmm8[7]
6122; AVX-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6123; AVX-NEXT:    vextractf128 $1, %ymm6, %xmm6
6124; AVX-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4,5],xmm6[6,7]
6125; AVX-NEXT:    vpslld $16, %xmm3, %xmm7
6126; AVX-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5],xmm6[6,7]
6127; AVX-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6128; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3]
6129; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm5, %ymm5
6130; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm4[1,1,2,2]
6131; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3]
6132; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm6, %ymm4
6133; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
6134; AVX-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
6135; AVX-NEXT:    vblendps {{.*#+}} xmm5 = xmm4[0],xmm5[1],xmm4[2,3]
6136; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm3[2,2,3,3]
6137; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3],xmm5[4,5,6,7]
6138; AVX-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6139; AVX-NEXT:    vextractf128 $1, %ymm4, %xmm4
6140; AVX-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,6,5,7,7]
6141; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3]
6142; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3,4,5],xmm5[6,7]
6143; AVX-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,6,6,7]
6144; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3]
6145; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5,6],xmm5[7]
6146; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6147; AVX-NEXT:    vmovdqa 32(%rcx), %xmm8
6148; AVX-NEXT:    vmovdqa 32(%rdx), %xmm9
6149; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
6150; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm6[1,1,2,2]
6151; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm6[2,2,3,3]
6152; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm4, %ymm4
6153; AVX-NEXT:    vmovdqa 32(%rsi), %xmm10
6154; AVX-NEXT:    vmovdqa 32(%rdi), %xmm11
6155; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
6156; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm7[2,3,2,3]
6157; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm7, %ymm5
6158; AVX-NEXT:    vblendps {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
6159; AVX-NEXT:    vmovdqa 32(%r8), %xmm4
6160; AVX-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
6161; AVX-NEXT:    vblendps {{.*#+}} xmm13 = xmm12[0],xmm5[1],xmm12[2,3]
6162; AVX-NEXT:    vmovdqa 32(%r9), %xmm5
6163; AVX-NEXT:    vpshufd {{.*#+}} xmm14 = xmm5[2,2,3,3]
6164; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3],xmm13[4,5,6,7]
6165; AVX-NEXT:    vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6166; AVX-NEXT:    vextractf128 $1, %ymm12, %xmm12
6167; AVX-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm4[0,1,2,3,6,5,7,7]
6168; AVX-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3]
6169; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3,4,5],xmm13[6,7]
6170; AVX-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm5[0,1,2,3,4,6,6,7]
6171; AVX-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3]
6172; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6],xmm13[7]
6173; AVX-NEXT:    vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6174; AVX-NEXT:    vpshufd {{.*#+}} xmm12 = xmm0[0,0,1,1]
6175; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
6176; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm12, %ymm0
6177; AVX-NEXT:    vpshufd {{.*#+}} xmm12 = xmm1[0,1,0,1]
6178; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm12, %ymm1
6179; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
6180; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm0[0,1],xmm2[0],xmm0[3]
6181; AVX-NEXT:    vpslldq {{.*#+}} xmm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5]
6182; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm12[5],xmm1[6,7]
6183; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6184; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
6185; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
6186; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
6187; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm3[0,0,1,1]
6188; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
6189; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6190; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
6191; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
6192; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,2]
6193; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
6194; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
6195; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
6196; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm3
6197; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7]
6198; AVX-NEXT:    vinsertps {{.*#+}} xmm3 = xmm1[0,1],xmm4[0],xmm1[3]
6199; AVX-NEXT:    vpslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5]
6200; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm8[5],xmm3[6,7]
6201; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6202; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
6203; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
6204; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7]
6205; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm5[0,0,1,1]
6206; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3],xmm1[4,5,6,7]
6207; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6208; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
6209; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm6[0,0,1,1]
6210; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
6211; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
6212; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm7[0,1,0,1]
6213; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
6214; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
6215; AVX-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm4[2,1,3,3,4,5,6,7]
6216; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
6217; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3]
6218; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm5[0,2,2,3,4,5,6,7]
6219; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
6220; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6],xmm2[7]
6221; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6222; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
6223; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5],xmm0[6,7]
6224; AVX-NEXT:    vpslld $16, %xmm5, %xmm1
6225; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
6226; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6227; AVX-NEXT:    vmovdqa 112(%rcx), %xmm1
6228; AVX-NEXT:    vmovdqa 112(%rdx), %xmm2
6229; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
6230; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
6231; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
6232; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm4[0,0,1,1]
6233; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
6234; AVX-NEXT:    vmovdqa 112(%rsi), %xmm3
6235; AVX-NEXT:    vmovdqa 112(%rdi), %xmm5
6236; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
6237; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
6238; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
6239; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm5[0,1,0,1]
6240; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm3, %ymm3
6241; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
6242; AVX-NEXT:    vmovdqa 112(%r8), %xmm2
6243; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm2[2,1,3,3,4,5,6,7]
6244; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
6245; AVX-NEXT:    vblendps {{.*#+}} xmm7 = xmm3[0],xmm6[1,2],xmm3[3]
6246; AVX-NEXT:    vmovdqa 112(%r9), %xmm3
6247; AVX-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm3[0,2,2,3,4,5,6,7]
6248; AVX-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,1]
6249; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5,6],xmm8[7]
6250; AVX-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6251; AVX-NEXT:    vextractf128 $1, %ymm6, %xmm6
6252; AVX-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4,5],xmm6[6,7]
6253; AVX-NEXT:    vpslld $16, %xmm3, %xmm7
6254; AVX-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5],xmm6[6,7]
6255; AVX-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6256; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3]
6257; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm5, %ymm5
6258; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm4[1,1,2,2]
6259; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3]
6260; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm6, %ymm4
6261; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
6262; AVX-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
6263; AVX-NEXT:    vblendps {{.*#+}} xmm5 = xmm4[0],xmm5[1],xmm4[2,3]
6264; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm3[2,2,3,3]
6265; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3],xmm5[4,5,6,7]
6266; AVX-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6267; AVX-NEXT:    vextractf128 $1, %ymm4, %xmm4
6268; AVX-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,6,5,7,7]
6269; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3]
6270; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3,4,5],xmm5[6,7]
6271; AVX-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,6,6,7]
6272; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3]
6273; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5,6],xmm5[7]
6274; AVX-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6275; AVX-NEXT:    vmovdqa 96(%rcx), %xmm8
6276; AVX-NEXT:    vmovdqa 96(%rdx), %xmm9
6277; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
6278; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm6[1,1,2,2]
6279; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm6[2,2,3,3]
6280; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm4, %ymm4
6281; AVX-NEXT:    vmovdqa 96(%rsi), %xmm10
6282; AVX-NEXT:    vmovdqa 96(%rdi), %xmm11
6283; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
6284; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm7[2,3,2,3]
6285; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm7, %ymm5
6286; AVX-NEXT:    vblendps {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
6287; AVX-NEXT:    vmovdqa 96(%r8), %xmm4
6288; AVX-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
6289; AVX-NEXT:    vblendps {{.*#+}} xmm13 = xmm12[0],xmm5[1],xmm12[2,3]
6290; AVX-NEXT:    vmovdqa 96(%r9), %xmm5
6291; AVX-NEXT:    vpshufd {{.*#+}} xmm14 = xmm5[2,2,3,3]
6292; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3],xmm13[4,5,6,7]
6293; AVX-NEXT:    vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6294; AVX-NEXT:    vextractf128 $1, %ymm12, %xmm12
6295; AVX-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm4[0,1,2,3,6,5,7,7]
6296; AVX-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3]
6297; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3,4,5],xmm13[6,7]
6298; AVX-NEXT:    vpshufhw {{.*#+}} xmm13 = xmm5[0,1,2,3,4,6,6,7]
6299; AVX-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3]
6300; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6],xmm13[7]
6301; AVX-NEXT:    vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6302; AVX-NEXT:    vpshufd {{.*#+}} xmm12 = xmm0[0,0,1,1]
6303; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
6304; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm12, %ymm0
6305; AVX-NEXT:    vpshufd {{.*#+}} xmm12 = xmm1[0,1,0,1]
6306; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm12, %ymm1
6307; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
6308; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm0[0,1],xmm2[0],xmm0[3]
6309; AVX-NEXT:    vpslldq {{.*#+}} xmm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5]
6310; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm12[5],xmm1[6,7]
6311; AVX-NEXT:    vmovdqa %xmm1, (%rsp) # 16-byte Spill
6312; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
6313; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
6314; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
6315; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm3[0,0,1,1]
6316; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
6317; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6318; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
6319; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
6320; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,2]
6321; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
6322; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
6323; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
6324; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm3
6325; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7]
6326; AVX-NEXT:    vinsertps {{.*#+}} xmm3 = xmm1[0,1],xmm4[0],xmm1[3]
6327; AVX-NEXT:    vpslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5]
6328; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm8[5],xmm3[6,7]
6329; AVX-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6330; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
6331; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
6332; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7]
6333; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm5[0,0,1,1]
6334; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3],xmm1[4,5,6,7]
6335; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6336; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
6337; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm6[0,0,1,1]
6338; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
6339; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
6340; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm7[0,1,0,1]
6341; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
6342; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
6343; AVX-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm4[2,1,3,3,4,5,6,7]
6344; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
6345; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3]
6346; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm5[0,2,2,3,4,5,6,7]
6347; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
6348; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6],xmm2[7]
6349; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6350; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
6351; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5],xmm0[6,7]
6352; AVX-NEXT:    vpslld $16, %xmm5, %xmm1
6353; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
6354; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6355; AVX-NEXT:    vmovdqa 16(%rcx), %xmm0
6356; AVX-NEXT:    vmovdqa 16(%rdx), %xmm1
6357; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6358; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
6359; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm11[2,2,3,3]
6360; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
6361; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
6362; AVX-NEXT:    vmovdqa 16(%rsi), %xmm2
6363; AVX-NEXT:    vmovdqa 16(%rdi), %xmm3
6364; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
6365; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
6366; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm10[2,3,2,3]
6367; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm4[0,1,0,1]
6368; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
6369; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
6370; AVX-NEXT:    vmovdqa 16(%r8), %xmm3
6371; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm3[2,1,3,3,4,5,6,7]
6372; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
6373; AVX-NEXT:    vblendps {{.*#+}} xmm5 = xmm2[0],xmm1[1,2],xmm2[3]
6374; AVX-NEXT:    vmovdqa 16(%r9), %xmm2
6375; AVX-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm2[0,2,2,3,4,5,6,7]
6376; AVX-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1]
6377; AVX-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3,4,5,6],xmm6[7]
6378; AVX-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6379; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
6380; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7]
6381; AVX-NEXT:    vpslld $16, %xmm2, %xmm5
6382; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5],xmm1[6,7]
6383; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6384; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm4[2,3,2,3]
6385; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm4, %ymm1
6386; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,1,2,2]
6387; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
6388; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm4, %ymm0
6389; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
6390; AVX-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
6391; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3]
6392; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[2,2,3,3]
6393; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3],xmm1[4,5,6,7]
6394; AVX-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6395; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
6396; AVX-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,5,7,7]
6397; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
6398; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
6399; AVX-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,6,6,7]
6400; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
6401; AVX-NEXT:    vpblendw {{.*#+}} xmm14 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6],xmm1[7]
6402; AVX-NEXT:    vmovdqa (%rcx), %xmm9
6403; AVX-NEXT:    vmovdqa (%rdx), %xmm8
6404; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
6405; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm5[1,1,2,2]
6406; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm5[2,2,3,3]
6407; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
6408; AVX-NEXT:    vmovdqa (%rsi), %xmm7
6409; AVX-NEXT:    vmovdqa (%rdi), %xmm6
6410; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
6411; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm4[2,3,2,3]
6412; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm4, %ymm1
6413; AVX-NEXT:    vblendps {{.*#+}} ymm12 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
6414; AVX-NEXT:    vmovdqa (%r8), %xmm1
6415; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
6416; AVX-NEXT:    vblendps {{.*#+}} xmm13 = xmm12[0],xmm0[1],xmm12[2,3]
6417; AVX-NEXT:    vmovdqa (%r9), %xmm0
6418; AVX-NEXT:    vpshufd {{.*#+}} xmm15 = xmm0[2,2,3,3]
6419; AVX-NEXT:    vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm15[3],xmm13[4,5,6,7]
6420; AVX-NEXT:    vextractf128 $1, %ymm12, %xmm12
6421; AVX-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm1[0,1,2,3,6,5,7,7]
6422; AVX-NEXT:    vpshufd {{.*#+}} xmm15 = xmm15[2,1,2,3]
6423; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3,4,5],xmm15[6,7]
6424; AVX-NEXT:    vpshufhw {{.*#+}} xmm15 = xmm0[0,1,2,3,4,6,6,7]
6425; AVX-NEXT:    vpshufd {{.*#+}} xmm15 = xmm15[2,1,2,3]
6426; AVX-NEXT:    vpblendw {{.*#+}} xmm12 = xmm12[0],xmm15[1],xmm12[2,3,4,5,6],xmm15[7]
6427; AVX-NEXT:    vpshufd {{.*#+}} xmm15 = xmm11[0,0,1,1]
6428; AVX-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,2]
6429; AVX-NEXT:    vinsertf128 $1, %xmm11, %ymm15, %ymm11
6430; AVX-NEXT:    vpshufd {{.*#+}} xmm15 = xmm10[0,1,0,1]
6431; AVX-NEXT:    vinsertf128 $1, %xmm10, %ymm15, %ymm10
6432; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5,6],ymm11[7]
6433; AVX-NEXT:    vinsertps {{.*#+}} xmm10 = xmm11[0,1],xmm3[0],xmm11[3]
6434; AVX-NEXT:    vpslldq {{.*#+}} xmm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5]
6435; AVX-NEXT:    vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm15[5],xmm10[6,7]
6436; AVX-NEXT:    vextractf128 $1, %ymm11, %xmm11
6437; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
6438; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm11[0,1],xmm3[2,3],xmm11[4,5,6,7]
6439; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
6440; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7]
6441; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
6442; AVX-NEXT:    vpshufd {{.*#+}} xmm8 = xmm3[0,0,1,1]
6443; AVX-NEXT:    vpshufd {{.*#+}} xmm9 = xmm3[1,1,2,2]
6444; AVX-NEXT:    vinsertf128 $1, %xmm9, %ymm8, %ymm8
6445; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
6446; AVX-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[0,1,0,1]
6447; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm7, %ymm7
6448; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6],ymm8[7]
6449; AVX-NEXT:    vinsertps {{.*#+}} xmm8 = xmm7[0,1],xmm1[0],xmm7[3]
6450; AVX-NEXT:    vpslldq {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
6451; AVX-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm9[5],xmm8[6,7]
6452; AVX-NEXT:    vextractf128 $1, %ymm7, %xmm7
6453; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
6454; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3],xmm7[4,5,6,7]
6455; AVX-NEXT:    vpshufd {{.*#+}} xmm9 = xmm0[0,0,1,1]
6456; AVX-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm9[3],xmm7[4,5,6,7]
6457; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
6458; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[0,0,1,1]
6459; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm3, %ymm3
6460; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3]
6461; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
6462; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm5, %ymm4
6463; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
6464; AVX-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm1[2,1,3,3,4,5,6,7]
6465; AVX-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
6466; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm4[0],xmm3[1,2],xmm4[3]
6467; AVX-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7]
6468; AVX-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1]
6469; AVX-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5,6],xmm5[7]
6470; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm3
6471; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5],xmm3[6,7]
6472; AVX-NEXT:    vpslld $16, %xmm0, %xmm0
6473; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7]
6474; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
6475; AVX-NEXT:    vmovdqa %xmm0, 48(%rax)
6476; AVX-NEXT:    vmovdqa %xmm4, 32(%rax)
6477; AVX-NEXT:    vmovdqa %xmm7, 16(%rax)
6478; AVX-NEXT:    vmovdqa %xmm8, (%rax)
6479; AVX-NEXT:    vmovdqa %xmm2, 112(%rax)
6480; AVX-NEXT:    vmovdqa %xmm10, 96(%rax)
6481; AVX-NEXT:    vmovdqa %xmm12, 80(%rax)
6482; AVX-NEXT:    vmovdqa %xmm13, 64(%rax)
6483; AVX-NEXT:    vmovdqa %xmm14, 176(%rax)
6484; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6485; AVX-NEXT:    vmovaps %xmm0, 160(%rax)
6486; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6487; AVX-NEXT:    vmovaps %xmm0, 144(%rax)
6488; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6489; AVX-NEXT:    vmovaps %xmm0, 128(%rax)
6490; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6491; AVX-NEXT:    vmovaps %xmm0, 624(%rax)
6492; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6493; AVX-NEXT:    vmovaps %xmm0, 608(%rax)
6494; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6495; AVX-NEXT:    vmovaps %xmm0, 592(%rax)
6496; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6497; AVX-NEXT:    vmovaps %xmm0, 576(%rax)
6498; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6499; AVX-NEXT:    vmovaps %xmm0, 688(%rax)
6500; AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
6501; AVX-NEXT:    vmovaps %xmm0, 672(%rax)
6502; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6503; AVX-NEXT:    vmovaps %xmm0, 656(%rax)
6504; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6505; AVX-NEXT:    vmovaps %xmm0, 640(%rax)
6506; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6507; AVX-NEXT:    vmovaps %xmm0, 752(%rax)
6508; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6509; AVX-NEXT:    vmovaps %xmm0, 736(%rax)
6510; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6511; AVX-NEXT:    vmovaps %xmm0, 720(%rax)
6512; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6513; AVX-NEXT:    vmovaps %xmm0, 704(%rax)
6514; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6515; AVX-NEXT:    vmovaps %xmm0, 240(%rax)
6516; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6517; AVX-NEXT:    vmovaps %xmm0, 224(%rax)
6518; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6519; AVX-NEXT:    vmovaps %xmm0, 208(%rax)
6520; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6521; AVX-NEXT:    vmovaps %xmm0, 192(%rax)
6522; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6523; AVX-NEXT:    vmovaps %xmm0, 304(%rax)
6524; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6525; AVX-NEXT:    vmovaps %xmm0, 288(%rax)
6526; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6527; AVX-NEXT:    vmovaps %xmm0, 272(%rax)
6528; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6529; AVX-NEXT:    vmovaps %xmm0, 256(%rax)
6530; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6531; AVX-NEXT:    vmovaps %xmm0, 368(%rax)
6532; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6533; AVX-NEXT:    vmovaps %xmm0, 352(%rax)
6534; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6535; AVX-NEXT:    vmovaps %xmm0, 336(%rax)
6536; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6537; AVX-NEXT:    vmovaps %xmm0, 320(%rax)
6538; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6539; AVX-NEXT:    vmovaps %xmm0, 432(%rax)
6540; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6541; AVX-NEXT:    vmovaps %xmm0, 416(%rax)
6542; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6543; AVX-NEXT:    vmovaps %xmm0, 400(%rax)
6544; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6545; AVX-NEXT:    vmovaps %xmm0, 384(%rax)
6546; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6547; AVX-NEXT:    vmovaps %xmm0, 496(%rax)
6548; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6549; AVX-NEXT:    vmovaps %xmm0, 480(%rax)
6550; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6551; AVX-NEXT:    vmovaps %xmm0, 464(%rax)
6552; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6553; AVX-NEXT:    vmovaps %xmm0, 448(%rax)
6554; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6555; AVX-NEXT:    vmovaps %xmm0, 560(%rax)
6556; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6557; AVX-NEXT:    vmovaps %xmm0, 544(%rax)
6558; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6559; AVX-NEXT:    vmovaps %xmm0, 528(%rax)
6560; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6561; AVX-NEXT:    vmovaps %xmm0, 512(%rax)
6562; AVX-NEXT:    addq $504, %rsp # imm = 0x1F8
6563; AVX-NEXT:    vzeroupper
6564; AVX-NEXT:    retq
6565;
6566; AVX2-LABEL: store_i16_stride6_vf64:
6567; AVX2:       # %bb.0:
6568; AVX2-NEXT:    subq $1544, %rsp # imm = 0x608
6569; AVX2-NEXT:    vmovdqa (%rcx), %xmm12
6570; AVX2-NEXT:    vmovdqa 32(%rcx), %xmm5
6571; AVX2-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6572; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
6573; AVX2-NEXT:    vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6574; AVX2-NEXT:    vmovdqa (%rdx), %xmm11
6575; AVX2-NEXT:    vmovdqa 32(%rdx), %xmm6
6576; AVX2-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6577; AVX2-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
6578; AVX2-NEXT:    vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6579; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6580; AVX2-NEXT:    vpbroadcastq %xmm0, %ymm0
6581; AVX2-NEXT:    vmovdqa (%rsi), %xmm14
6582; AVX2-NEXT:    vmovdqa 32(%rsi), %xmm13
6583; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm14[0,1,2,1]
6584; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
6585; AVX2-NEXT:    vmovdqa (%rdi), %xmm10
6586; AVX2-NEXT:    vmovdqa 32(%rdi), %xmm8
6587; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm10[0,1,2,1]
6588; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5]
6589; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
6590; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
6591; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
6592; AVX2-NEXT:    vmovdqa (%r8), %xmm1
6593; AVX2-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6594; AVX2-NEXT:    vmovdqa 32(%r8), %xmm4
6595; AVX2-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6596; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,1,3,3,4,5,6,7]
6597; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
6598; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7]
6599; AVX2-NEXT:    vmovdqa (%r9), %xmm0
6600; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6601; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
6602; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
6603; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[0,0,2,1]
6604; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
6605; AVX2-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm1
6606; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6607; AVX2-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
6608; AVX2-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
6609; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
6610; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm13[0,1,2,1]
6611; AVX2-NEXT:    vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6612; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5]
6613; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm8[0,1,2,1]
6614; AVX2-NEXT:    vmovdqa %xmm8, %xmm5
6615; AVX2-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6616; AVX2-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5]
6617; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
6618; AVX2-NEXT:    vmovdqa 32(%r9), %xmm3
6619; AVX2-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6620; AVX2-NEXT:    vpbroadcastq %xmm1, %ymm1
6621; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
6622; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
6623; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm4[2,1,3,3,4,5,6,7]
6624; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
6625; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7]
6626; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm3[0,2,2,3,4,5,6,7]
6627; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4]
6628; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
6629; AVX2-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm1
6630; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6631; AVX2-NEXT:    vmovdqa 64(%rcx), %xmm1
6632; AVX2-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6633; AVX2-NEXT:    vmovdqa 64(%rdx), %xmm2
6634; AVX2-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6635; AVX2-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
6636; AVX2-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
6637; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
6638; AVX2-NEXT:    vmovdqa 64(%rsi), %xmm9
6639; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm9[0,1,2,1]
6640; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5]
6641; AVX2-NEXT:    vmovdqa 64(%rdi), %xmm3
6642; AVX2-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6643; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
6644; AVX2-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5]
6645; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
6646; AVX2-NEXT:    vpbroadcastq %xmm1, %ymm1
6647; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
6648; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
6649; AVX2-NEXT:    vmovdqa 64(%r8), %xmm2
6650; AVX2-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6651; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7]
6652; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
6653; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7]
6654; AVX2-NEXT:    vmovdqa 64(%r9), %xmm2
6655; AVX2-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6656; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
6657; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4]
6658; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
6659; AVX2-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm1
6660; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6661; AVX2-NEXT:    vmovdqa 96(%rcx), %xmm1
6662; AVX2-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6663; AVX2-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
6664; AVX2-NEXT:    vmovdqa 96(%rdx), %xmm2
6665; AVX2-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6666; AVX2-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
6667; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
6668; AVX2-NEXT:    vmovdqa 96(%rsi), %xmm2
6669; AVX2-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6670; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
6671; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5]
6672; AVX2-NEXT:    vmovdqa 96(%rdi), %xmm3
6673; AVX2-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6674; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
6675; AVX2-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5]
6676; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
6677; AVX2-NEXT:    vpbroadcastq %xmm1, %ymm1
6678; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
6679; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
6680; AVX2-NEXT:    vmovdqa 96(%r8), %xmm2
6681; AVX2-NEXT:    vmovdqa %xmm2, (%rsp) # 16-byte Spill
6682; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7]
6683; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
6684; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7]
6685; AVX2-NEXT:    vmovdqa 96(%r9), %xmm2
6686; AVX2-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6687; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
6688; AVX2-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4]
6689; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
6690; AVX2-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm1
6691; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6692; AVX2-NEXT:    vmovdqa (%rdx), %ymm2
6693; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6694; AVX2-NEXT:    vmovdqa (%rcx), %ymm1
6695; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6696; AVX2-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
6697; AVX2-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
6698; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
6699; AVX2-NEXT:    vmovdqa (%rsi), %ymm2
6700; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6701; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,1,2,3,6,5,6,7]
6702; AVX2-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
6703; AVX2-NEXT:    vmovdqa (%rdi), %ymm3
6704; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6705; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[2,1,2,3,6,5,6,7]
6706; AVX2-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
6707; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
6708; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
6709; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3]
6710; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
6711; AVX2-NEXT:    vmovdqa (%r8), %ymm2
6712; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6713; AVX2-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
6714; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
6715; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7]
6716; AVX2-NEXT:    vmovdqa (%r9), %ymm2
6717; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6718; AVX2-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
6719; AVX2-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
6720; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
6721; AVX2-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm1
6722; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6723; AVX2-NEXT:    vmovdqa 32(%rdx), %ymm2
6724; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6725; AVX2-NEXT:    vmovdqa 32(%rcx), %ymm1
6726; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6727; AVX2-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
6728; AVX2-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
6729; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
6730; AVX2-NEXT:    vmovdqa 32(%rsi), %ymm7
6731; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm7[2,1,2,3,6,5,6,7]
6732; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6733; AVX2-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
6734; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm15
6735; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm15[2,1,2,3,6,5,6,7]
6736; AVX2-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6737; AVX2-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
6738; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
6739; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
6740; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3]
6741; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
6742; AVX2-NEXT:    vmovdqa 32(%r8), %ymm2
6743; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6744; AVX2-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
6745; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
6746; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7]
6747; AVX2-NEXT:    vmovdqa 32(%r9), %ymm2
6748; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6749; AVX2-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
6750; AVX2-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
6751; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
6752; AVX2-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm1
6753; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6754; AVX2-NEXT:    vmovdqa 64(%rdx), %ymm2
6755; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6756; AVX2-NEXT:    vmovdqa 64(%rcx), %ymm1
6757; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6758; AVX2-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
6759; AVX2-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
6760; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
6761; AVX2-NEXT:    vmovdqa 64(%rsi), %ymm2
6762; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6763; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,1,2,3,6,5,6,7]
6764; AVX2-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
6765; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm3
6766; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6767; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[2,1,2,3,6,5,6,7]
6768; AVX2-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
6769; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
6770; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
6771; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3]
6772; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
6773; AVX2-NEXT:    vmovdqa 64(%r8), %ymm2
6774; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6775; AVX2-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
6776; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
6777; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7]
6778; AVX2-NEXT:    vmovdqa 64(%r9), %ymm2
6779; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6780; AVX2-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
6781; AVX2-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
6782; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
6783; AVX2-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm1
6784; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6785; AVX2-NEXT:    vmovdqa 96(%rdx), %ymm3
6786; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6787; AVX2-NEXT:    vmovdqa 96(%rcx), %ymm1
6788; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6789; AVX2-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
6790; AVX2-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
6791; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
6792; AVX2-NEXT:    vmovdqa 96(%rsi), %ymm6
6793; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm6[2,1,2,3,6,5,6,7]
6794; AVX2-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6795; AVX2-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
6796; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm3
6797; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6798; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[2,1,2,3,6,5,6,7]
6799; AVX2-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
6800; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
6801; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
6802; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3]
6803; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
6804; AVX2-NEXT:    vmovdqa 96(%r8), %ymm8
6805; AVX2-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm8[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
6806; AVX2-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6807; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
6808; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7]
6809; AVX2-NEXT:    vmovdqa 96(%r9), %ymm2
6810; AVX2-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6811; AVX2-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
6812; AVX2-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
6813; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
6814; AVX2-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
6815; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6816; AVX2-NEXT:    vmovdqa %xmm14, %xmm4
6817; AVX2-NEXT:    vmovdqa %xmm10, %xmm3
6818; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3]
6819; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
6820; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
6821; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,2]
6822; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
6823; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
6824; AVX2-NEXT:    vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
6825; AVX2-NEXT:    # xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
6826; AVX2-NEXT:    vpbroadcastq %xmm1, %ymm1
6827; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
6828; AVX2-NEXT:    vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
6829; AVX2-NEXT:    # xmm0 = mem[0,0,2,1,4,5,6,7]
6830; AVX2-NEXT:    vpbroadcastq %xmm0, %ymm2
6831; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
6832; AVX2-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm1
6833; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6834; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3]
6835; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
6836; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
6837; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm2 # 16-byte Folded Reload
6838; AVX2-NEXT:    # xmm2 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3]
6839; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,0,2,2]
6840; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
6841; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
6842; AVX2-NEXT:    vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
6843; AVX2-NEXT:    # xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
6844; AVX2-NEXT:    vpbroadcastq %xmm2, %ymm2
6845; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
6846; AVX2-NEXT:    vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
6847; AVX2-NEXT:    # xmm2 = mem[0,0,2,1,4,5,6,7]
6848; AVX2-NEXT:    vpbroadcastq %xmm2, %ymm2
6849; AVX2-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm1
6850; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6851; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6852; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3]
6853; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
6854; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
6855; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
6856; AVX2-NEXT:    # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
6857; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,0,2,2]
6858; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
6859; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
6860; AVX2-NEXT:    vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
6861; AVX2-NEXT:    # xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
6862; AVX2-NEXT:    vpbroadcastq %xmm2, %ymm2
6863; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
6864; AVX2-NEXT:    vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
6865; AVX2-NEXT:    # xmm2 = mem[0,0,2,1,4,5,6,7]
6866; AVX2-NEXT:    vpbroadcastq %xmm2, %ymm2
6867; AVX2-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm1
6868; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6869; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6870; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
6871; AVX2-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
6872; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
6873; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
6874; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
6875; AVX2-NEXT:    # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
6876; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,0,2,2]
6877; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
6878; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
6879; AVX2-NEXT:    vpmovzxwd (%rsp), %xmm2 # 16-byte Folded Reload
6880; AVX2-NEXT:    # xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
6881; AVX2-NEXT:    vpbroadcastq %xmm2, %ymm2
6882; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
6883; AVX2-NEXT:    vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
6884; AVX2-NEXT:    # xmm2 = mem[0,0,2,1,4,5,6,7]
6885; AVX2-NEXT:    vpbroadcastq %xmm2, %ymm2
6886; AVX2-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm1
6887; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6888; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6889; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
6890; AVX2-NEXT:    # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[8],mem[8],ymm1[9],mem[9],ymm1[10],mem[10],ymm1[11],mem[11]
6891; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
6892; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
6893; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
6894; AVX2-NEXT:    # ymm2 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[2],mem[2],ymm2[3],mem[3],ymm2[8],mem[8],ymm2[9],mem[9],ymm2[10],mem[10],ymm2[11],mem[11]
6895; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[1,0,2,2,5,4,6,6]
6896; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3]
6897; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
6898; AVX2-NEXT:    vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
6899; AVX2-NEXT:    # ymm2 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
6900; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
6901; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
6902; AVX2-NEXT:    vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
6903; AVX2-NEXT:    # ymm2 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
6904; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
6905; AVX2-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm1
6906; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6907; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm15[0],ymm7[0],ymm15[1],ymm7[1],ymm15[2],ymm7[2],ymm15[3],ymm7[3],ymm15[8],ymm7[8],ymm15[9],ymm7[9],ymm15[10],ymm7[10],ymm15[11],ymm7[11]
6908; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
6909; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
6910; AVX2-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
6911; AVX2-NEXT:    # ymm2 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[2],mem[2],ymm2[3],mem[3],ymm2[8],mem[8],ymm2[9],mem[9],ymm2[10],mem[10],ymm2[11],mem[11]
6912; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[1,0,2,2,5,4,6,6]
6913; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3]
6914; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
6915; AVX2-NEXT:    vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
6916; AVX2-NEXT:    # ymm2 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
6917; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
6918; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
6919; AVX2-NEXT:    vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
6920; AVX2-NEXT:    # ymm2 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
6921; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
6922; AVX2-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm1
6923; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6924; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
6925; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
6926; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11]
6927; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
6928; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
6929; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
6930; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[8],ymm11[8],ymm10[9],ymm11[9],ymm10[10],ymm11[10],ymm10[11],ymm11[11]
6931; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[1,0,2,2,5,4,6,6]
6932; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3]
6933; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
6934; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
6935; AVX2-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
6936; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
6937; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
6938; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
6939; AVX2-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm15[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
6940; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
6941; AVX2-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm1
6942; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6943; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
6944; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11]
6945; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
6946; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6947; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
6948; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[8],ymm6[8],ymm2[9],ymm6[9],ymm2[10],ymm6[10],ymm2[11],ymm6[11]
6949; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[1,0,2,2,5,4,6,6]
6950; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3]
6951; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
6952; AVX2-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
6953; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
6954; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
6955; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
6956; AVX2-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm8[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
6957; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
6958; AVX2-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
6959; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6960; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
6961; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6962; AVX2-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
6963; AVX2-NEXT:    # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
6964; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1]
6965; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,2,3,3]
6966; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
6967; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
6968; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
6969; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
6970; AVX2-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
6971; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
6972; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7]
6973; AVX2-NEXT:    vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
6974; AVX2-NEXT:    # xmm2 = mem[2,3,2,3]
6975; AVX2-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,1,4,5,6,7]
6976; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm2[0,1,0,1]
6977; AVX2-NEXT:    vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
6978; AVX2-NEXT:    vpblendvb %ymm4, %ymm0, %ymm3, %ymm0
6979; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6980; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6981; AVX2-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
6982; AVX2-NEXT:    # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
6983; AVX2-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm3 # 16-byte Folded Reload
6984; AVX2-NEXT:    # xmm3 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
6985; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1]
6986; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,2,3,3]
6987; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
6988; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7]
6989; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
6990; AVX2-NEXT:    vpshufb %xmm1, %xmm2, %xmm3
6991; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1]
6992; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7]
6993; AVX2-NEXT:    vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
6994; AVX2-NEXT:    # xmm3 = mem[2,3,2,3]
6995; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,1,4,5,6,7]
6996; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1]
6997; AVX2-NEXT:    vpblendvb %ymm4, %ymm0, %ymm3, %ymm0
6998; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6999; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7000; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
7001; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7002; AVX2-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload
7003; AVX2-NEXT:    # xmm3 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
7004; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1]
7005; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,2,3,3]
7006; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
7007; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7]
7008; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7009; AVX2-NEXT:    vpshufb %xmm1, %xmm2, %xmm3
7010; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1]
7011; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7]
7012; AVX2-NEXT:    vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
7013; AVX2-NEXT:    # xmm3 = mem[2,3,2,3]
7014; AVX2-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,1,4,5,6,7]
7015; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1]
7016; AVX2-NEXT:    vpblendvb %ymm4, %ymm0, %ymm3, %ymm0
7017; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7018; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7019; AVX2-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
7020; AVX2-NEXT:    # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
7021; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7022; AVX2-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload
7023; AVX2-NEXT:    # xmm3 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
7024; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1]
7025; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,2,3,3]
7026; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
7027; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7]
7028; AVX2-NEXT:    vmovdqa (%rsp), %xmm2 # 16-byte Reload
7029; AVX2-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
7030; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
7031; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
7032; AVX2-NEXT:    vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7033; AVX2-NEXT:    # xmm1 = mem[2,3,2,3]
7034; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,1,4,5,6,7]
7035; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
7036; AVX2-NEXT:    vpblendvb %ymm4, %ymm0, %ymm1, %ymm9
7037; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7038; AVX2-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
7039; AVX2-NEXT:    # ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15]
7040; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7041; AVX2-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
7042; AVX2-NEXT:    # ymm1 = ymm1[4],mem[4],ymm1[5],mem[5],ymm1[6],mem[6],ymm1[7],mem[7],ymm1[12],mem[12],ymm1[13],mem[13],ymm1[14],mem[14],ymm1[15],mem[15]
7043; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3]
7044; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7]
7045; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
7046; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
7047; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
7048; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7049; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm3
7050; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3]
7051; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7]
7052; AVX2-NEXT:    vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
7053; AVX2-NEXT:    # ymm3 = mem[2,3,2,3,6,7,6,7]
7054; AVX2-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
7055; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3]
7056; AVX2-NEXT:    vpblendvb %ymm4, %ymm1, %ymm3, %ymm5
7057; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7058; AVX2-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
7059; AVX2-NEXT:    # ymm1 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15]
7060; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7061; AVX2-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
7062; AVX2-NEXT:    # ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15]
7063; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3]
7064; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,2,3,3,5,6,7,7]
7065; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
7066; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
7067; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7068; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
7069; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
7070; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
7071; AVX2-NEXT:    vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
7072; AVX2-NEXT:    # ymm1 = mem[2,3,2,3,6,7,6,7]
7073; AVX2-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
7074; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
7075; AVX2-NEXT:    vpblendvb %ymm4, %ymm0, %ymm1, %ymm3
7076; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15]
7077; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm10[4],ymm11[4],ymm10[5],ymm11[5],ymm10[6],ymm11[6],ymm10[7],ymm11[7],ymm10[12],ymm11[12],ymm10[13],ymm11[13],ymm10[14],ymm11[14],ymm10[15],ymm11[15]
7078; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3]
7079; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7]
7080; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
7081; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
7082; AVX2-NEXT:    vpshufb %ymm2, %ymm14, %ymm1
7083; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
7084; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
7085; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm15[2,3,2,3,6,7,6,7]
7086; AVX2-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
7087; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
7088; AVX2-NEXT:    vpblendvb %ymm4, %ymm0, %ymm1, %ymm0
7089; AVX2-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm1 # 32-byte Folded Reload
7090; AVX2-NEXT:    # ymm1 = ymm7[4],mem[4],ymm7[5],mem[5],ymm7[6],mem[6],ymm7[7],mem[7],ymm7[12],mem[12],ymm7[13],mem[13],ymm7[14],mem[14],ymm7[15],mem[15]
7091; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
7092; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm11 = ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[12],ymm6[12],ymm7[13],ymm6[13],ymm7[14],ymm6[14],ymm7[15],ymm6[15]
7093; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3]
7094; AVX2-NEXT:    vpshufd {{.*#+}} ymm11 = ymm11[1,2,3,3,5,6,7,7]
7095; AVX2-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3]
7096; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2],ymm11[3,4],ymm1[5],ymm11[6,7]
7097; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
7098; AVX2-NEXT:    vpshufb %ymm2, %ymm6, %ymm2
7099; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3]
7100; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
7101; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm8[2,3,2,3,6,7,6,7]
7102; AVX2-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
7103; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3]
7104; AVX2-NEXT:    vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
7105; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
7106; AVX2-NEXT:    vmovdqa %ymm1, 736(%rax)
7107; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7108; AVX2-NEXT:    vmovaps %ymm1, 672(%rax)
7109; AVX2-NEXT:    vmovdqa %ymm0, 544(%rax)
7110; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7111; AVX2-NEXT:    vmovaps %ymm0, 480(%rax)
7112; AVX2-NEXT:    vmovdqa %ymm3, 352(%rax)
7113; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7114; AVX2-NEXT:    vmovaps %ymm0, 288(%rax)
7115; AVX2-NEXT:    vmovdqa %ymm5, 160(%rax)
7116; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7117; AVX2-NEXT:    vmovaps %ymm0, 96(%rax)
7118; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7119; AVX2-NEXT:    vmovaps %ymm0, 704(%rax)
7120; AVX2-NEXT:    vmovdqa %ymm9, 640(%rax)
7121; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7122; AVX2-NEXT:    vmovaps %ymm0, 576(%rax)
7123; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7124; AVX2-NEXT:    vmovaps %ymm0, 512(%rax)
7125; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7126; AVX2-NEXT:    vmovaps %ymm0, 448(%rax)
7127; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7128; AVX2-NEXT:    vmovaps %ymm0, 384(%rax)
7129; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7130; AVX2-NEXT:    vmovaps %ymm0, 320(%rax)
7131; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7132; AVX2-NEXT:    vmovaps %ymm0, 256(%rax)
7133; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7134; AVX2-NEXT:    vmovaps %ymm0, 192(%rax)
7135; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7136; AVX2-NEXT:    vmovaps %ymm0, 128(%rax)
7137; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7138; AVX2-NEXT:    vmovaps %ymm0, 64(%rax)
7139; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7140; AVX2-NEXT:    vmovaps %ymm0, (%rax)
7141; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7142; AVX2-NEXT:    vmovaps %ymm0, 608(%rax)
7143; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7144; AVX2-NEXT:    vmovaps %ymm0, 416(%rax)
7145; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7146; AVX2-NEXT:    vmovaps %ymm0, 224(%rax)
7147; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7148; AVX2-NEXT:    vmovaps %ymm0, 32(%rax)
7149; AVX2-NEXT:    addq $1544, %rsp # imm = 0x608
7150; AVX2-NEXT:    vzeroupper
7151; AVX2-NEXT:    retq
7152;
7153; AVX2-FP-LABEL: store_i16_stride6_vf64:
7154; AVX2-FP:       # %bb.0:
7155; AVX2-FP-NEXT:    subq $1544, %rsp # imm = 0x608
7156; AVX2-FP-NEXT:    vmovdqa (%rsi), %xmm0
7157; AVX2-FP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7158; AVX2-FP-NEXT:    vmovdqa 32(%rsi), %xmm5
7159; AVX2-FP-NEXT:    vmovdqa (%rdi), %xmm1
7160; AVX2-FP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7161; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %xmm4
7162; AVX2-FP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7163; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
7164; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
7165; AVX2-FP-NEXT:    vmovdqa (%rcx), %xmm1
7166; AVX2-FP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7167; AVX2-FP-NEXT:    vmovdqa 32(%rcx), %xmm11
7168; AVX2-FP-NEXT:    vmovdqa (%rdx), %xmm2
7169; AVX2-FP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7170; AVX2-FP-NEXT:    vmovdqa 32(%rdx), %xmm6
7171; AVX2-FP-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7172; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
7173; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,2]
7174; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
7175; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
7176; AVX2-FP-NEXT:    vmovdqa (%r8), %xmm1
7177; AVX2-FP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7178; AVX2-FP-NEXT:    vmovdqa 32(%r8), %xmm2
7179; AVX2-FP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7180; AVX2-FP-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
7181; AVX2-FP-NEXT:    vpbroadcastq %xmm1, %ymm1
7182; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
7183; AVX2-FP-NEXT:    vmovdqa (%r9), %xmm1
7184; AVX2-FP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7185; AVX2-FP-NEXT:    vmovdqa 32(%r9), %xmm3
7186; AVX2-FP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7187; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7]
7188; AVX2-FP-NEXT:    vpbroadcastq %xmm1, %ymm1
7189; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm10 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
7190; AVX2-FP-NEXT:    vpblendvb %ymm10, %ymm0, %ymm1, %ymm0
7191; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7192; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
7193; AVX2-FP-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7194; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
7195; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3]
7196; AVX2-FP-NEXT:    vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7197; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,2]
7198; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
7199; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
7200; AVX2-FP-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
7201; AVX2-FP-NEXT:    vpbroadcastq %xmm1, %ymm1
7202; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
7203; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm3[0,0,2,1,4,5,6,7]
7204; AVX2-FP-NEXT:    vpbroadcastq %xmm1, %ymm1
7205; AVX2-FP-NEXT:    vpblendvb %ymm10, %ymm0, %ymm1, %ymm0
7206; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7207; AVX2-FP-NEXT:    vmovdqa 64(%rsi), %xmm12
7208; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %xmm0
7209; AVX2-FP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7210; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3]
7211; AVX2-FP-NEXT:    vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7212; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
7213; AVX2-FP-NEXT:    vmovdqa 64(%rcx), %xmm1
7214; AVX2-FP-NEXT:    vmovdqa %xmm1, (%rsp) # 16-byte Spill
7215; AVX2-FP-NEXT:    vmovdqa 64(%rdx), %xmm2
7216; AVX2-FP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7217; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
7218; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,2]
7219; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
7220; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
7221; AVX2-FP-NEXT:    vmovdqa 64(%r8), %xmm1
7222; AVX2-FP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7223; AVX2-FP-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
7224; AVX2-FP-NEXT:    vpbroadcastq %xmm1, %ymm1
7225; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
7226; AVX2-FP-NEXT:    vmovdqa 64(%r9), %xmm1
7227; AVX2-FP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7228; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7]
7229; AVX2-FP-NEXT:    vpbroadcastq %xmm1, %ymm1
7230; AVX2-FP-NEXT:    vpblendvb %ymm10, %ymm0, %ymm1, %ymm0
7231; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7232; AVX2-FP-NEXT:    vmovdqa 96(%rsi), %xmm6
7233; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %xmm0
7234; AVX2-FP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7235; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
7236; AVX2-FP-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7237; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
7238; AVX2-FP-NEXT:    vmovdqa 96(%rcx), %xmm1
7239; AVX2-FP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7240; AVX2-FP-NEXT:    vmovdqa 96(%rdx), %xmm2
7241; AVX2-FP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7242; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
7243; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,2]
7244; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
7245; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
7246; AVX2-FP-NEXT:    vmovdqa 96(%r8), %xmm1
7247; AVX2-FP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7248; AVX2-FP-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
7249; AVX2-FP-NEXT:    vpbroadcastq %xmm1, %ymm1
7250; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
7251; AVX2-FP-NEXT:    vmovdqa 96(%r9), %xmm1
7252; AVX2-FP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7253; AVX2-FP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7]
7254; AVX2-FP-NEXT:    vpbroadcastq %xmm1, %ymm1
7255; AVX2-FP-NEXT:    vpblendvb %ymm10, %ymm0, %ymm1, %ymm0
7256; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7257; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm14
7258; AVX2-FP-NEXT:    vmovdqa (%rsi), %ymm0
7259; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7260; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm14[0],ymm0[0],ymm14[1],ymm0[1],ymm14[2],ymm0[2],ymm14[3],ymm0[3],ymm14[8],ymm0[8],ymm14[9],ymm0[9],ymm14[10],ymm0[10],ymm14[11],ymm0[11]
7261; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
7262; AVX2-FP-NEXT:    vmovdqa (%rdx), %ymm13
7263; AVX2-FP-NEXT:    vmovdqa (%rcx), %ymm9
7264; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm13[0],ymm9[0],ymm13[1],ymm9[1],ymm13[2],ymm9[2],ymm13[3],ymm9[3],ymm13[8],ymm9[8],ymm13[9],ymm9[9],ymm13[10],ymm9[10],ymm13[11],ymm9[11]
7265; AVX2-FP-NEXT:    vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7266; AVX2-FP-NEXT:    vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7267; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,0,2,2,5,4,6,6]
7268; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
7269; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
7270; AVX2-FP-NEXT:    vmovdqa (%r8), %ymm1
7271; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7272; AVX2-FP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
7273; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
7274; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
7275; AVX2-FP-NEXT:    vmovdqa (%r9), %ymm1
7276; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7277; AVX2-FP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
7278; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
7279; AVX2-FP-NEXT:    vpblendvb %ymm10, %ymm0, %ymm1, %ymm0
7280; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7281; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm1
7282; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7283; AVX2-FP-NEXT:    vmovdqa 32(%rsi), %ymm0
7284; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7285; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
7286; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
7287; AVX2-FP-NEXT:    vmovdqa 32(%rdx), %ymm15
7288; AVX2-FP-NEXT:    vmovdqa 32(%rcx), %ymm1
7289; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7290; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm15[0],ymm1[0],ymm15[1],ymm1[1],ymm15[2],ymm1[2],ymm15[3],ymm1[3],ymm15[8],ymm1[8],ymm15[9],ymm1[9],ymm15[10],ymm1[10],ymm15[11],ymm1[11]
7291; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,0,2,2,5,4,6,6]
7292; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
7293; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
7294; AVX2-FP-NEXT:    vmovdqa 32(%r8), %ymm1
7295; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7296; AVX2-FP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
7297; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
7298; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
7299; AVX2-FP-NEXT:    vmovdqa 32(%r9), %ymm1
7300; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7301; AVX2-FP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
7302; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
7303; AVX2-FP-NEXT:    vpblendvb %ymm10, %ymm0, %ymm1, %ymm0
7304; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7305; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %ymm1
7306; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7307; AVX2-FP-NEXT:    vmovdqa 64(%rsi), %ymm0
7308; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7309; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
7310; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
7311; AVX2-FP-NEXT:    vmovdqa 64(%rdx), %ymm8
7312; AVX2-FP-NEXT:    vmovdqa 64(%rcx), %ymm1
7313; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7314; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm8[0],ymm1[0],ymm8[1],ymm1[1],ymm8[2],ymm1[2],ymm8[3],ymm1[3],ymm8[8],ymm1[8],ymm8[9],ymm1[9],ymm8[10],ymm1[10],ymm8[11],ymm1[11]
7315; AVX2-FP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7316; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,0,2,2,5,4,6,6]
7317; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
7318; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
7319; AVX2-FP-NEXT:    vmovdqa 64(%r8), %ymm1
7320; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7321; AVX2-FP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
7322; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
7323; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
7324; AVX2-FP-NEXT:    vmovdqa 64(%r9), %ymm1
7325; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7326; AVX2-FP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
7327; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
7328; AVX2-FP-NEXT:    vpblendvb %ymm10, %ymm0, %ymm1, %ymm0
7329; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7330; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm0
7331; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7332; AVX2-FP-NEXT:    vmovdqa 96(%rsi), %ymm7
7333; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[1],ymm7[1],ymm0[2],ymm7[2],ymm0[3],ymm7[3],ymm0[8],ymm7[8],ymm0[9],ymm7[9],ymm0[10],ymm7[10],ymm0[11],ymm7[11]
7334; AVX2-FP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7335; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
7336; AVX2-FP-NEXT:    vmovdqa 96(%rdx), %ymm2
7337; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7338; AVX2-FP-NEXT:    vmovdqa 96(%rcx), %ymm1
7339; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7340; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
7341; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,0,2,2,5,4,6,6]
7342; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
7343; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
7344; AVX2-FP-NEXT:    vmovdqa 96(%r8), %ymm1
7345; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7346; AVX2-FP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
7347; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
7348; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
7349; AVX2-FP-NEXT:    vmovdqa 96(%r9), %ymm1
7350; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7351; AVX2-FP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
7352; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
7353; AVX2-FP-NEXT:    vpblendvb %ymm10, %ymm0, %ymm1, %ymm0
7354; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7355; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
7356; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7357; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
7358; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7359; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
7360; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
7361; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7362; AVX2-FP-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
7363; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
7364; AVX2-FP-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
7365; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
7366; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
7367; AVX2-FP-NEXT:    vpbroadcastq %xmm2, %ymm2
7368; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
7369; AVX2-FP-NEXT:    vpshuflw $246, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7370; AVX2-FP-NEXT:    # xmm2 = mem[2,1,3,3,4,5,6,7]
7371; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
7372; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7]
7373; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
7374; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
7375; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
7376; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
7377; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm4 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
7378; AVX2-FP-NEXT:    vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
7379; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7380; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm5, %xmm2
7381; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
7382; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
7383; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
7384; AVX2-FP-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
7385; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
7386; AVX2-FP-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
7387; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
7388; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
7389; AVX2-FP-NEXT:    vpbroadcastq %xmm3, %ymm3
7390; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
7391; AVX2-FP-NEXT:    vpshuflw $246, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
7392; AVX2-FP-NEXT:    # xmm3 = mem[2,1,3,3,4,5,6,7]
7393; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
7394; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7]
7395; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
7396; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
7397; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
7398; AVX2-FP-NEXT:    vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
7399; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7400; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm12, %xmm2
7401; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
7402; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
7403; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
7404; AVX2-FP-NEXT:    vmovdqa (%rsp), %xmm3 # 16-byte Reload
7405; AVX2-FP-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
7406; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
7407; AVX2-FP-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
7408; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
7409; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
7410; AVX2-FP-NEXT:    vpbroadcastq %xmm3, %ymm3
7411; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
7412; AVX2-FP-NEXT:    vpshuflw $246, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
7413; AVX2-FP-NEXT:    # xmm3 = mem[2,1,3,3,4,5,6,7]
7414; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
7415; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7]
7416; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
7417; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
7418; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
7419; AVX2-FP-NEXT:    vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
7420; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7421; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm6, %xmm2
7422; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
7423; AVX2-FP-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
7424; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
7425; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7426; AVX2-FP-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
7427; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
7428; AVX2-FP-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
7429; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
7430; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
7431; AVX2-FP-NEXT:    vpbroadcastq %xmm2, %ymm2
7432; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7]
7433; AVX2-FP-NEXT:    vpshuflw $246, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7434; AVX2-FP-NEXT:    # xmm2 = mem[2,1,3,3,4,5,6,7]
7435; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
7436; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7]
7437; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7438; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
7439; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
7440; AVX2-FP-NEXT:    vpblendvb %ymm4, %ymm0, %ymm1, %ymm0
7441; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7442; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm0 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u]
7443; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7444; AVX2-FP-NEXT:    vpshufb %ymm0, %ymm1, %ymm1
7445; AVX2-FP-NEXT:    vmovdqa %ymm14, %ymm6
7446; AVX2-FP-NEXT:    vpshufb %ymm0, %ymm14, %ymm2
7447; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
7448; AVX2-FP-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
7449; AVX2-FP-NEXT:    vpsrldq {{.*#+}} ymm3 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
7450; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
7451; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
7452; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
7453; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
7454; AVX2-FP-NEXT:    vpshuflw $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
7455; AVX2-FP-NEXT:    # ymm2 = mem[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
7456; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
7457; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7]
7458; AVX2-FP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
7459; AVX2-FP-NEXT:    # ymm1 = mem[0,1,0,1]
7460; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7461; AVX2-FP-NEXT:    vpshufb %ymm1, %ymm3, %ymm3
7462; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
7463; AVX2-FP-NEXT:    vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
7464; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7465; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
7466; AVX2-FP-NEXT:    vpshufb %ymm0, %ymm2, %ymm2
7467; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7468; AVX2-FP-NEXT:    vpshufb %ymm0, %ymm3, %ymm3
7469; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
7470; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7471; AVX2-FP-NEXT:    vpsrldq {{.*#+}} ymm3 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
7472; AVX2-FP-NEXT:    vmovdqa %ymm15, %ymm13
7473; AVX2-FP-NEXT:    vpsrldq {{.*#+}} ymm5 = ymm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm15[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
7474; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[8],ymm3[8],ymm5[9],ymm3[9],ymm5[10],ymm3[10],ymm5[11],ymm3[11]
7475; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3]
7476; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2]
7477; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
7478; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
7479; AVX2-FP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm14[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
7480; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
7481; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7]
7482; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
7483; AVX2-FP-NEXT:    vpshufb %ymm1, %ymm15, %ymm3
7484; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
7485; AVX2-FP-NEXT:    vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
7486; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7487; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
7488; AVX2-FP-NEXT:    vpshufb %ymm0, %ymm2, %ymm2
7489; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7490; AVX2-FP-NEXT:    vpshufb %ymm0, %ymm3, %ymm3
7491; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
7492; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
7493; AVX2-FP-NEXT:    vpsrldq {{.*#+}} ymm3 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
7494; AVX2-FP-NEXT:    vpsrldq {{.*#+}} ymm5 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
7495; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[8],ymm3[8],ymm5[9],ymm3[9],ymm5[10],ymm3[10],ymm5[11],ymm3[11]
7496; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3]
7497; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2]
7498; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
7499; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
7500; AVX2-FP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm11[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
7501; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
7502; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7]
7503; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7504; AVX2-FP-NEXT:    vpshufb %ymm1, %ymm3, %ymm3
7505; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
7506; AVX2-FP-NEXT:    vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
7507; AVX2-FP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7508; AVX2-FP-NEXT:    vpshufb %ymm0, %ymm7, %ymm2
7509; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
7510; AVX2-FP-NEXT:    vpshufb %ymm0, %ymm10, %ymm0
7511; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
7512; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
7513; AVX2-FP-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
7514; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
7515; AVX2-FP-NEXT:    vpsrldq {{.*#+}} ymm3 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
7516; AVX2-FP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
7517; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
7518; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
7519; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7]
7520; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
7521; AVX2-FP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm8[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
7522; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
7523; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7]
7524; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
7525; AVX2-FP-NEXT:    vpshufb %ymm1, %ymm9, %ymm1
7526; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
7527; AVX2-FP-NEXT:    vpblendvb %ymm4, %ymm0, %ymm1, %ymm0
7528; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7529; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7530; AVX2-FP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
7531; AVX2-FP-NEXT:    # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
7532; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7533; AVX2-FP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
7534; AVX2-FP-NEXT:    # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
7535; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1]
7536; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,2,3,3]
7537; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
7538; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
7539; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm1 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
7540; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7541; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
7542; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
7543; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7]
7544; AVX2-FP-NEXT:    vmovdqa {{.*#+}} xmm2 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
7545; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7546; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
7547; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm0[0,1,0,1]
7548; AVX2-FP-NEXT:    vpmovsxbw {{.*#+}} ymm5 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
7549; AVX2-FP-NEXT:    vpblendvb %ymm5, %ymm3, %ymm4, %ymm0
7550; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7551; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7552; AVX2-FP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
7553; AVX2-FP-NEXT:    # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
7554; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7555; AVX2-FP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
7556; AVX2-FP-NEXT:    # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
7557; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[1,1,1,1]
7558; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[1,2,3,3]
7559; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1]
7560; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
7561; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7562; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm0, %xmm4
7563; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1]
7564; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
7565; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7566; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm0, %xmm4
7567; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1]
7568; AVX2-FP-NEXT:    vpblendvb %ymm5, %ymm3, %ymm4, %ymm0
7569; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7570; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7571; AVX2-FP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
7572; AVX2-FP-NEXT:    # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
7573; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7574; AVX2-FP-NEXT:    vpunpckhwd (%rsp), %xmm0, %xmm4 # 16-byte Folded Reload
7575; AVX2-FP-NEXT:    # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
7576; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[1,1,1,1]
7577; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[1,2,3,3]
7578; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1]
7579; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
7580; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7581; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm0, %xmm4
7582; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1]
7583; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
7584; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7585; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm0, %xmm4
7586; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1]
7587; AVX2-FP-NEXT:    vpblendvb %ymm5, %ymm3, %ymm4, %ymm0
7588; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7589; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7590; AVX2-FP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
7591; AVX2-FP-NEXT:    # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
7592; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7593; AVX2-FP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
7594; AVX2-FP-NEXT:    # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
7595; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[1,1,1,1]
7596; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[1,2,3,3]
7597; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1]
7598; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
7599; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7600; AVX2-FP-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
7601; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
7602; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7]
7603; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7604; AVX2-FP-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
7605; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
7606; AVX2-FP-NEXT:    vpblendvb %ymm5, %ymm1, %ymm2, %ymm0
7607; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7608; AVX2-FP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload
7609; AVX2-FP-NEXT:    # ymm1 = ymm6[4],mem[4],ymm6[5],mem[5],ymm6[6],mem[6],ymm6[7],mem[7],ymm6[12],mem[12],ymm6[13],mem[13],ymm6[14],mem[14],ymm6[15],mem[15]
7610; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7611; AVX2-FP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
7612; AVX2-FP-NEXT:    # ymm2 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15]
7613; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3]
7614; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[1,2,3,3,5,6,7,7]
7615; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
7616; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
7617; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
7618; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7619; AVX2-FP-NEXT:    vpshufb %ymm6, %ymm0, %ymm3
7620; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3]
7621; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7]
7622; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31]
7623; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7624; AVX2-FP-NEXT:    vpshufb %ymm2, %ymm0, %ymm4
7625; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3]
7626; AVX2-FP-NEXT:    vpblendvb %ymm5, %ymm3, %ymm4, %ymm4
7627; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7628; AVX2-FP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
7629; AVX2-FP-NEXT:    # ymm3 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15]
7630; AVX2-FP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload
7631; AVX2-FP-NEXT:    # ymm0 = ymm13[4],mem[4],ymm13[5],mem[5],ymm13[6],mem[6],ymm13[7],mem[7],ymm13[12],mem[12],ymm13[13],mem[13],ymm13[14],mem[14],ymm13[15],mem[15]
7632; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[3,3,3,3]
7633; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,2,3,3,5,6,7,7]
7634; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
7635; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7]
7636; AVX2-FP-NEXT:    vpshufb %ymm6, %ymm14, %ymm3
7637; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3]
7638; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7]
7639; AVX2-FP-NEXT:    vpshufb %ymm2, %ymm15, %ymm3
7640; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3]
7641; AVX2-FP-NEXT:    vpblendvb %ymm5, %ymm0, %ymm3, %ymm3
7642; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7643; AVX2-FP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
7644; AVX2-FP-NEXT:    # ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15]
7645; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7646; AVX2-FP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm12[4],ymm1[5],ymm12[5],ymm1[6],ymm12[6],ymm1[7],ymm12[7],ymm1[12],ymm12[12],ymm1[13],ymm12[13],ymm1[14],ymm12[14],ymm1[15],ymm12[15]
7647; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3]
7648; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7]
7649; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
7650; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
7651; AVX2-FP-NEXT:    vpshufb %ymm6, %ymm11, %ymm1
7652; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
7653; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
7654; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7655; AVX2-FP-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
7656; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
7657; AVX2-FP-NEXT:    vpblendvb %ymm5, %ymm0, %ymm1, %ymm0
7658; AVX2-FP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload
7659; AVX2-FP-NEXT:    # ymm1 = ymm10[4],mem[4],ymm10[5],mem[5],ymm10[6],mem[6],ymm10[7],mem[7],ymm10[12],mem[12],ymm10[13],mem[13],ymm10[14],mem[14],ymm10[15],mem[15]
7660; AVX2-FP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm11 # 32-byte Folded Reload
7661; AVX2-FP-NEXT:    # ymm11 = ymm7[4],mem[4],ymm7[5],mem[5],ymm7[6],mem[6],ymm7[7],mem[7],ymm7[12],mem[12],ymm7[13],mem[13],ymm7[14],mem[14],ymm7[15],mem[15]
7662; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3]
7663; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm11 = ymm11[1,2,3,3,5,6,7,7]
7664; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3]
7665; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2],ymm11[3,4],ymm1[5],ymm11[6,7]
7666; AVX2-FP-NEXT:    vpshufb %ymm6, %ymm8, %ymm6
7667; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3]
7668; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3],ymm6[4],ymm1[5,6],ymm6[7]
7669; AVX2-FP-NEXT:    vpshufb %ymm2, %ymm9, %ymm2
7670; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3]
7671; AVX2-FP-NEXT:    vpblendvb %ymm5, %ymm1, %ymm2, %ymm1
7672; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
7673; AVX2-FP-NEXT:    vmovdqa %ymm1, 736(%rax)
7674; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7675; AVX2-FP-NEXT:    vmovaps %ymm1, 704(%rax)
7676; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7677; AVX2-FP-NEXT:    vmovaps %ymm1, 672(%rax)
7678; AVX2-FP-NEXT:    vmovdqa %ymm0, 544(%rax)
7679; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7680; AVX2-FP-NEXT:    vmovaps %ymm0, 512(%rax)
7681; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7682; AVX2-FP-NEXT:    vmovaps %ymm0, 480(%rax)
7683; AVX2-FP-NEXT:    vmovdqa %ymm3, 352(%rax)
7684; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7685; AVX2-FP-NEXT:    vmovaps %ymm0, 320(%rax)
7686; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7687; AVX2-FP-NEXT:    vmovaps %ymm0, 288(%rax)
7688; AVX2-FP-NEXT:    vmovdqa %ymm4, 160(%rax)
7689; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7690; AVX2-FP-NEXT:    vmovaps %ymm0, 128(%rax)
7691; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7692; AVX2-FP-NEXT:    vmovaps %ymm0, 96(%rax)
7693; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7694; AVX2-FP-NEXT:    vmovaps %ymm0, 640(%rax)
7695; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7696; AVX2-FP-NEXT:    vmovaps %ymm0, 608(%rax)
7697; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7698; AVX2-FP-NEXT:    vmovaps %ymm0, 576(%rax)
7699; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7700; AVX2-FP-NEXT:    vmovaps %ymm0, 448(%rax)
7701; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7702; AVX2-FP-NEXT:    vmovaps %ymm0, 416(%rax)
7703; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7704; AVX2-FP-NEXT:    vmovaps %ymm0, 384(%rax)
7705; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7706; AVX2-FP-NEXT:    vmovaps %ymm0, 256(%rax)
7707; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7708; AVX2-FP-NEXT:    vmovaps %ymm0, 224(%rax)
7709; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7710; AVX2-FP-NEXT:    vmovaps %ymm0, 192(%rax)
7711; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7712; AVX2-FP-NEXT:    vmovaps %ymm0, 64(%rax)
7713; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7714; AVX2-FP-NEXT:    vmovaps %ymm0, 32(%rax)
7715; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7716; AVX2-FP-NEXT:    vmovaps %ymm0, (%rax)
7717; AVX2-FP-NEXT:    addq $1544, %rsp # imm = 0x608
7718; AVX2-FP-NEXT:    vzeroupper
7719; AVX2-FP-NEXT:    retq
7720;
7721; AVX2-FCP-LABEL: store_i16_stride6_vf64:
7722; AVX2-FCP:       # %bb.0:
7723; AVX2-FCP-NEXT:    subq $1560, %rsp # imm = 0x618
7724; AVX2-FCP-NEXT:    vmovdqa (%rsi), %xmm1
7725; AVX2-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7726; AVX2-FCP-NEXT:    vmovdqa 32(%rsi), %xmm5
7727; AVX2-FCP-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7728; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
7729; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
7730; AVX2-FCP-NEXT:    vmovdqa (%rdi), %xmm2
7731; AVX2-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7732; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %xmm7
7733; AVX2-FCP-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7734; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
7735; AVX2-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
7736; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
7737; AVX2-FCP-NEXT:    vmovdqa (%rcx), %xmm2
7738; AVX2-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7739; AVX2-FCP-NEXT:    vmovdqa 32(%rcx), %xmm8
7740; AVX2-FCP-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7741; AVX2-FCP-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
7742; AVX2-FCP-NEXT:    vmovdqa (%rdx), %xmm3
7743; AVX2-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7744; AVX2-FCP-NEXT:    vmovdqa 32(%rdx), %xmm14
7745; AVX2-FCP-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
7746; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
7747; AVX2-FCP-NEXT:    vpbroadcastq %xmm2, %ymm2
7748; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
7749; AVX2-FCP-NEXT:    vmovdqa (%r8), %xmm2
7750; AVX2-FCP-NEXT:    vmovdqa %xmm2, (%rsp) # 16-byte Spill
7751; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7]
7752; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
7753; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7]
7754; AVX2-FCP-NEXT:    vmovdqa (%r9), %xmm1
7755; AVX2-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7756; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
7757; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
7758; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm1[0,0,2,1]
7759; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
7760; AVX2-FCP-NEXT:    vpblendvb %ymm1, %ymm3, %ymm4, %ymm3
7761; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7762; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm5, %xmm3
7763; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm7, %xmm4
7764; AVX2-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
7765; AVX2-FCP-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
7766; AVX2-FCP-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
7767; AVX2-FCP-NEXT:    vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7768; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
7769; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1]
7770; AVX2-FCP-NEXT:    vpbroadcastq %xmm4, %ymm4
7771; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
7772; AVX2-FCP-NEXT:    vmovdqa 32(%r8), %xmm4
7773; AVX2-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7774; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[2,1,3,3,4,5,6,7]
7775; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1]
7776; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7]
7777; AVX2-FCP-NEXT:    vmovdqa 32(%r9), %xmm4
7778; AVX2-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7779; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm4, %xmm4
7780; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1]
7781; AVX2-FCP-NEXT:    vpblendvb %ymm1, %ymm3, %ymm4, %ymm3
7782; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7783; AVX2-FCP-NEXT:    vmovdqa 64(%rsi), %xmm3
7784; AVX2-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7785; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %xmm4
7786; AVX2-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7787; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
7788; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm4, %xmm4
7789; AVX2-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
7790; AVX2-FCP-NEXT:    vmovdqa 64(%rcx), %xmm4
7791; AVX2-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7792; AVX2-FCP-NEXT:    vmovdqa 64(%rdx), %xmm5
7793; AVX2-FCP-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7794; AVX2-FCP-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
7795; AVX2-FCP-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
7796; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
7797; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1]
7798; AVX2-FCP-NEXT:    vpbroadcastq %xmm4, %ymm4
7799; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
7800; AVX2-FCP-NEXT:    vmovdqa 64(%r8), %xmm4
7801; AVX2-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7802; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm4[2,1,3,3,4,5,6,7]
7803; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1]
7804; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7]
7805; AVX2-FCP-NEXT:    vmovdqa 64(%r9), %xmm4
7806; AVX2-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7807; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm4, %xmm4
7808; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1]
7809; AVX2-FCP-NEXT:    vpblendvb %ymm1, %ymm3, %ymm4, %ymm3
7810; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7811; AVX2-FCP-NEXT:    vmovdqa 96(%rsi), %xmm3
7812; AVX2-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7813; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
7814; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %xmm4
7815; AVX2-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7816; AVX2-FCP-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
7817; AVX2-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
7818; AVX2-FCP-NEXT:    vmovdqa 96(%rcx), %xmm13
7819; AVX2-FCP-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
7820; AVX2-FCP-NEXT:    vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7821; AVX2-FCP-NEXT:    vmovdqa 96(%rdx), %xmm12
7822; AVX2-FCP-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
7823; AVX2-FCP-NEXT:    vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7824; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
7825; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
7826; AVX2-FCP-NEXT:    vpbroadcastq %xmm3, %ymm3
7827; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7]
7828; AVX2-FCP-NEXT:    vmovdqa 96(%r8), %xmm3
7829; AVX2-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7830; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7]
7831; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
7832; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7]
7833; AVX2-FCP-NEXT:    vmovdqa 96(%r9), %xmm3
7834; AVX2-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7835; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
7836; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
7837; AVX2-FCP-NEXT:    vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
7838; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7839; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm3
7840; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7841; AVX2-FCP-NEXT:    vmovdqa (%rsi), %ymm2
7842; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7843; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u]
7844; AVX2-FCP-NEXT:    vpshufb %ymm0, %ymm2, %ymm2
7845; AVX2-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm3
7846; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
7847; AVX2-FCP-NEXT:    vmovdqa (%rdx), %ymm4
7848; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7849; AVX2-FCP-NEXT:    vmovdqa (%rcx), %ymm3
7850; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7851; AVX2-FCP-NEXT:    vpsrldq {{.*#+}} ymm3 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
7852; AVX2-FCP-NEXT:    vpsrldq {{.*#+}} ymm4 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
7853; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
7854; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3]
7855; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2]
7856; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
7857; AVX2-FCP-NEXT:    vmovdqa (%r8), %ymm3
7858; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7859; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
7860; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
7861; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7]
7862; AVX2-FCP-NEXT:    vmovdqa (%r9), %ymm4
7863; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7864; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
7865; AVX2-FCP-NEXT:    # ymm2 = mem[0,1,0,1]
7866; AVX2-FCP-NEXT:    vpshufb %ymm2, %ymm4, %ymm4
7867; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
7868; AVX2-FCP-NEXT:    vpblendvb %ymm1, %ymm3, %ymm4, %ymm3
7869; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7870; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm4
7871; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7872; AVX2-FCP-NEXT:    vmovdqa 32(%rsi), %ymm3
7873; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7874; AVX2-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm3
7875; AVX2-FCP-NEXT:    vpshufb %ymm0, %ymm4, %ymm4
7876; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
7877; AVX2-FCP-NEXT:    vmovdqa 32(%rdx), %ymm10
7878; AVX2-FCP-NEXT:    vmovdqa 32(%rcx), %ymm11
7879; AVX2-FCP-NEXT:    vpsrldq {{.*#+}} ymm4 = ymm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm11[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
7880; AVX2-FCP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7881; AVX2-FCP-NEXT:    vpsrldq {{.*#+}} ymm5 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
7882; AVX2-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7883; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11]
7884; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3]
7885; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2]
7886; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
7887; AVX2-FCP-NEXT:    vmovdqa 32(%r8), %ymm4
7888; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7889; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm4[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
7890; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
7891; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7]
7892; AVX2-FCP-NEXT:    vmovdqa 32(%r9), %ymm4
7893; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7894; AVX2-FCP-NEXT:    vpshufb %ymm2, %ymm4, %ymm4
7895; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
7896; AVX2-FCP-NEXT:    vpblendvb %ymm1, %ymm3, %ymm4, %ymm3
7897; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7898; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %ymm5
7899; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7900; AVX2-FCP-NEXT:    vmovdqa 64(%rsi), %ymm9
7901; AVX2-FCP-NEXT:    vpshufb %ymm0, %ymm9, %ymm3
7902; AVX2-FCP-NEXT:    vpshufb %ymm0, %ymm5, %ymm4
7903; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
7904; AVX2-FCP-NEXT:    vmovdqa 64(%rdx), %ymm15
7905; AVX2-FCP-NEXT:    vmovdqa 64(%rcx), %ymm4
7906; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7907; AVX2-FCP-NEXT:    vpsrldq {{.*#+}} ymm4 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
7908; AVX2-FCP-NEXT:    vpsrldq {{.*#+}} ymm5 = ymm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm15[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
7909; AVX2-FCP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7910; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11]
7911; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3]
7912; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2]
7913; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
7914; AVX2-FCP-NEXT:    vmovdqa 64(%r8), %ymm4
7915; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7916; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm4[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
7917; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
7918; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7]
7919; AVX2-FCP-NEXT:    vmovdqa 64(%r9), %ymm4
7920; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7921; AVX2-FCP-NEXT:    vpshufb %ymm2, %ymm4, %ymm4
7922; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
7923; AVX2-FCP-NEXT:    vpblendvb %ymm1, %ymm3, %ymm4, %ymm3
7924; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7925; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm4
7926; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7927; AVX2-FCP-NEXT:    vmovdqa 96(%rsi), %ymm3
7928; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7929; AVX2-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm3
7930; AVX2-FCP-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
7931; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11]
7932; AVX2-FCP-NEXT:    vmovdqa 96(%rdx), %ymm8
7933; AVX2-FCP-NEXT:    vmovdqa 96(%rcx), %ymm7
7934; AVX2-FCP-NEXT:    vpsrldq {{.*#+}} ymm3 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
7935; AVX2-FCP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7936; AVX2-FCP-NEXT:    vpsrldq {{.*#+}} ymm4 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
7937; AVX2-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7938; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
7939; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
7940; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2]
7941; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7]
7942; AVX2-FCP-NEXT:    vmovdqa 96(%r8), %ymm6
7943; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
7944; AVX2-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7945; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
7946; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7]
7947; AVX2-FCP-NEXT:    vmovdqa 96(%r9), %ymm3
7948; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7949; AVX2-FCP-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
7950; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
7951; AVX2-FCP-NEXT:    vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
7952; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7953; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
7954; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
7955; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
7956; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [1,0,2,2,1,0,2,2]
7957; AVX2-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
7958; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm1, %ymm0
7959; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7960; AVX2-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
7961; AVX2-FCP-NEXT:    # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
7962; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
7963; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7]
7964; AVX2-FCP-NEXT:    vpmovzxwd (%rsp), %xmm2 # 16-byte Folded Reload
7965; AVX2-FCP-NEXT:    # xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
7966; AVX2-FCP-NEXT:    vpbroadcastq %xmm2, %ymm2
7967; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7]
7968; AVX2-FCP-NEXT:    vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
7969; AVX2-FCP-NEXT:    # xmm0 = mem[0,0,2,1,4,5,6,7]
7970; AVX2-FCP-NEXT:    vpbroadcastq %xmm0, %ymm3
7971; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
7972; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm2
7973; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7974; AVX2-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm2 # 16-byte Folded Reload
7975; AVX2-FCP-NEXT:    # xmm2 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3]
7976; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm1, %ymm2
7977; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
7978; AVX2-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm3 # 16-byte Folded Reload
7979; AVX2-FCP-NEXT:    # xmm3 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3]
7980; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
7981; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7]
7982; AVX2-FCP-NEXT:    vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
7983; AVX2-FCP-NEXT:    # xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
7984; AVX2-FCP-NEXT:    vpbroadcastq %xmm3, %ymm3
7985; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
7986; AVX2-FCP-NEXT:    vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
7987; AVX2-FCP-NEXT:    # xmm3 = mem[0,0,2,1,4,5,6,7]
7988; AVX2-FCP-NEXT:    vpbroadcastq %xmm3, %ymm3
7989; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm2
7990; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7991; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7992; AVX2-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
7993; AVX2-FCP-NEXT:    # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
7994; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm1, %ymm2
7995; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
7996; AVX2-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
7997; AVX2-FCP-NEXT:    # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3]
7998; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
7999; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7]
8000; AVX2-FCP-NEXT:    vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
8001; AVX2-FCP-NEXT:    # xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
8002; AVX2-FCP-NEXT:    vpbroadcastq %xmm3, %ymm3
8003; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
8004; AVX2-FCP-NEXT:    vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
8005; AVX2-FCP-NEXT:    # xmm3 = mem[0,0,2,1,4,5,6,7]
8006; AVX2-FCP-NEXT:    vpbroadcastq %xmm3, %ymm3
8007; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm2
8008; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8009; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
8010; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm1, %ymm1
8011; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8012; AVX2-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
8013; AVX2-FCP-NEXT:    # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
8014; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
8015; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
8016; AVX2-FCP-NEXT:    vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8017; AVX2-FCP-NEXT:    # xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
8018; AVX2-FCP-NEXT:    vpbroadcastq %xmm2, %ymm2
8019; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
8020; AVX2-FCP-NEXT:    vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8021; AVX2-FCP-NEXT:    # xmm2 = mem[0,0,2,1,4,5,6,7]
8022; AVX2-FCP-NEXT:    vpbroadcastq %xmm2, %ymm2
8023; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm1
8024; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8025; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8026; AVX2-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
8027; AVX2-FCP-NEXT:    # ymm2 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[8],mem[8],ymm1[9],mem[9],ymm1[10],mem[10],ymm1[11],mem[11]
8028; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [5,4,2,2,5,4,6,6]
8029; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm1, %ymm2
8030; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
8031; AVX2-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
8032; AVX2-FCP-NEXT:    # ymm3 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[2],mem[2],ymm3[3],mem[3],ymm3[8],mem[8],ymm3[9],mem[9],ymm3[10],mem[10],ymm3[11],mem[11]
8033; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
8034; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7]
8035; AVX2-FCP-NEXT:    vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
8036; AVX2-FCP-NEXT:    # ymm3 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
8037; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2]
8038; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
8039; AVX2-FCP-NEXT:    vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
8040; AVX2-FCP-NEXT:    # ymm3 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
8041; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2]
8042; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm2
8043; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8044; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[8],ymm11[8],ymm10[9],ymm11[9],ymm10[10],ymm11[10],ymm10[11],ymm11[11]
8045; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm1, %ymm2
8046; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
8047; AVX2-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
8048; AVX2-FCP-NEXT:    # ymm3 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[2],mem[2],ymm3[3],mem[3],ymm3[8],mem[8],ymm3[9],mem[9],ymm3[10],mem[10],ymm3[11],mem[11]
8049; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
8050; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7]
8051; AVX2-FCP-NEXT:    vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
8052; AVX2-FCP-NEXT:    # ymm3 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
8053; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2]
8054; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
8055; AVX2-FCP-NEXT:    vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
8056; AVX2-FCP-NEXT:    # ymm3 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
8057; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2]
8058; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm2
8059; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8060; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
8061; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm15[0],ymm13[0],ymm15[1],ymm13[1],ymm15[2],ymm13[2],ymm15[3],ymm13[3],ymm15[8],ymm13[8],ymm15[9],ymm13[9],ymm15[10],ymm13[10],ymm15[11],ymm13[11]
8062; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm1, %ymm2
8063; AVX2-FCP-NEXT:    vmovdqa %ymm9, %ymm12
8064; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
8065; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm11[0],ymm9[0],ymm11[1],ymm9[1],ymm11[2],ymm9[2],ymm11[3],ymm9[3],ymm11[8],ymm9[8],ymm11[9],ymm9[9],ymm11[10],ymm9[10],ymm11[11],ymm9[11]
8066; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
8067; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7]
8068; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
8069; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
8070; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2]
8071; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
8072; AVX2-FCP-NEXT:    vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
8073; AVX2-FCP-NEXT:    # ymm3 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
8074; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2]
8075; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm2
8076; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8077; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11]
8078; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm1, %ymm1
8079; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
8080; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
8081; AVX2-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11]
8082; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
8083; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
8084; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
8085; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
8086; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
8087; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
8088; AVX2-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm9[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
8089; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
8090; AVX2-FCP-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
8091; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8092; AVX2-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
8093; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8094; AVX2-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
8095; AVX2-FCP-NEXT:    # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
8096; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,2,1,2,0,0,3,3]
8097; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm3, %ymm0
8098; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[1,1,1,1]
8099; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
8100; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
8101; AVX2-FCP-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
8102; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
8103; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
8104; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
8105; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} xmm1 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
8106; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8107; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
8108; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm0[0,1,0,1]
8109; AVX2-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm6 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
8110; AVX2-FCP-NEXT:    vpblendvb %ymm6, %ymm4, %ymm5, %ymm0
8111; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8112; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8113; AVX2-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
8114; AVX2-FCP-NEXT:    # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
8115; AVX2-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm5 # 16-byte Folded Reload
8116; AVX2-FCP-NEXT:    # xmm5 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7]
8117; AVX2-FCP-NEXT:    vpermd %ymm4, %ymm3, %ymm4
8118; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[1,1,1,1]
8119; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
8120; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8121; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm5
8122; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1]
8123; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7]
8124; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8125; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm5
8126; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1]
8127; AVX2-FCP-NEXT:    vpblendvb %ymm6, %ymm4, %ymm5, %ymm0
8128; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8129; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8130; AVX2-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
8131; AVX2-FCP-NEXT:    # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
8132; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8133; AVX2-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
8134; AVX2-FCP-NEXT:    # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
8135; AVX2-FCP-NEXT:    vpermd %ymm4, %ymm3, %ymm4
8136; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[1,1,1,1]
8137; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
8138; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8139; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm5
8140; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1]
8141; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7]
8142; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8143; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm5
8144; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1]
8145; AVX2-FCP-NEXT:    vpblendvb %ymm6, %ymm4, %ymm5, %ymm0
8146; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8147; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8148; AVX2-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
8149; AVX2-FCP-NEXT:    # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
8150; AVX2-FCP-NEXT:    vpermd %ymm4, %ymm3, %ymm3
8151; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8152; AVX2-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
8153; AVX2-FCP-NEXT:    # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
8154; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[1,1,1,1]
8155; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
8156; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8157; AVX2-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
8158; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
8159; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7]
8160; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8161; AVX2-FCP-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
8162; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
8163; AVX2-FCP-NEXT:    vpblendvb %ymm6, %ymm2, %ymm1, %ymm10
8164; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8165; AVX2-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
8166; AVX2-FCP-NEXT:    # ymm1 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15]
8167; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8168; AVX2-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
8169; AVX2-FCP-NEXT:    # ymm2 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15]
8170; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [5,6,5,6,5,6,7,7]
8171; AVX2-FCP-NEXT:    vpermd %ymm1, %ymm3, %ymm1
8172; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[3,3,3,3]
8173; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
8174; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
8175; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8176; AVX2-FCP-NEXT:    vpshufb %ymm2, %ymm0, %ymm4
8177; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3]
8178; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm1[0],ymm4[1],ymm1[2,3],ymm4[4],ymm1[5,6],ymm4[7]
8179; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31]
8180; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8181; AVX2-FCP-NEXT:    vpshufb %ymm1, %ymm0, %ymm14
8182; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
8183; AVX2-FCP-NEXT:    vpblendvb %ymm6, %ymm4, %ymm14, %ymm5
8184; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8185; AVX2-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
8186; AVX2-FCP-NEXT:    # ymm14 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15]
8187; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8188; AVX2-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
8189; AVX2-FCP-NEXT:    # ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15]
8190; AVX2-FCP-NEXT:    vpermd %ymm14, %ymm3, %ymm14
8191; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3]
8192; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7]
8193; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
8194; AVX2-FCP-NEXT:    vpshufb %ymm2, %ymm4, %ymm14
8195; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
8196; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3],ymm14[4],ymm0[5,6],ymm14[7]
8197; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
8198; AVX2-FCP-NEXT:    vpshufb %ymm1, %ymm4, %ymm14
8199; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
8200; AVX2-FCP-NEXT:    vpblendvb %ymm6, %ymm0, %ymm14, %ymm4
8201; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8202; AVX2-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm14 = ymm0[4],ymm13[4],ymm0[5],ymm13[5],ymm0[6],ymm13[6],ymm0[7],ymm13[7],ymm0[12],ymm13[12],ymm0[13],ymm13[13],ymm0[14],ymm13[14],ymm0[15],ymm13[15]
8203; AVX2-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15]
8204; AVX2-FCP-NEXT:    vpermd %ymm14, %ymm3, %ymm14
8205; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3]
8206; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7]
8207; AVX2-FCP-NEXT:    vpshufb %ymm2, %ymm15, %ymm14
8208; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
8209; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3],ymm14[4],ymm0[5,6],ymm14[7]
8210; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
8211; AVX2-FCP-NEXT:    vpshufb %ymm1, %ymm11, %ymm14
8212; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
8213; AVX2-FCP-NEXT:    vpblendvb %ymm6, %ymm0, %ymm14, %ymm0
8214; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
8215; AVX2-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm14 # 32-byte Folded Reload
8216; AVX2-FCP-NEXT:    # ymm14 = ymm11[4],mem[4],ymm11[5],mem[5],ymm11[6],mem[6],ymm11[7],mem[7],ymm11[12],mem[12],ymm11[13],mem[13],ymm11[14],mem[14],ymm11[15],mem[15]
8217; AVX2-FCP-NEXT:    vpermd %ymm14, %ymm3, %ymm3
8218; AVX2-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm14 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15]
8219; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[3,3,3,3]
8220; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm14[2],ymm3[3,4],ymm14[5],ymm3[6,7]
8221; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
8222; AVX2-FCP-NEXT:    vpshufb %ymm2, %ymm7, %ymm2
8223; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3]
8224; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7]
8225; AVX2-FCP-NEXT:    vpshufb %ymm1, %ymm9, %ymm1
8226; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
8227; AVX2-FCP-NEXT:    vpblendvb %ymm6, %ymm2, %ymm1, %ymm1
8228; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
8229; AVX2-FCP-NEXT:    vmovdqa %ymm1, 736(%rax)
8230; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8231; AVX2-FCP-NEXT:    vmovaps %ymm1, 672(%rax)
8232; AVX2-FCP-NEXT:    vmovdqa %ymm0, 544(%rax)
8233; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8234; AVX2-FCP-NEXT:    vmovaps %ymm0, 480(%rax)
8235; AVX2-FCP-NEXT:    vmovdqa %ymm4, 352(%rax)
8236; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8237; AVX2-FCP-NEXT:    vmovaps %ymm0, 288(%rax)
8238; AVX2-FCP-NEXT:    vmovdqa %ymm5, 160(%rax)
8239; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8240; AVX2-FCP-NEXT:    vmovaps %ymm0, 96(%rax)
8241; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8242; AVX2-FCP-NEXT:    vmovaps %ymm0, 704(%rax)
8243; AVX2-FCP-NEXT:    vmovdqa %ymm10, 640(%rax)
8244; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8245; AVX2-FCP-NEXT:    vmovaps %ymm0, 576(%rax)
8246; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8247; AVX2-FCP-NEXT:    vmovaps %ymm0, 512(%rax)
8248; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8249; AVX2-FCP-NEXT:    vmovaps %ymm0, 448(%rax)
8250; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8251; AVX2-FCP-NEXT:    vmovaps %ymm0, 384(%rax)
8252; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8253; AVX2-FCP-NEXT:    vmovaps %ymm0, 320(%rax)
8254; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8255; AVX2-FCP-NEXT:    vmovaps %ymm0, 256(%rax)
8256; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8257; AVX2-FCP-NEXT:    vmovaps %ymm0, 192(%rax)
8258; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8259; AVX2-FCP-NEXT:    vmovaps %ymm0, 128(%rax)
8260; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8261; AVX2-FCP-NEXT:    vmovaps %ymm0, 64(%rax)
8262; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8263; AVX2-FCP-NEXT:    vmovaps %ymm0, (%rax)
8264; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8265; AVX2-FCP-NEXT:    vmovaps %ymm0, 608(%rax)
8266; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8267; AVX2-FCP-NEXT:    vmovaps %ymm0, 416(%rax)
8268; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8269; AVX2-FCP-NEXT:    vmovaps %ymm0, 224(%rax)
8270; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8271; AVX2-FCP-NEXT:    vmovaps %ymm0, 32(%rax)
8272; AVX2-FCP-NEXT:    addq $1560, %rsp # imm = 0x618
8273; AVX2-FCP-NEXT:    vzeroupper
8274; AVX2-FCP-NEXT:    retq
8275;
8276; AVX512-LABEL: store_i16_stride6_vf64:
8277; AVX512:       # %bb.0:
8278; AVX512-NEXT:    subq $392, %rsp # imm = 0x188
8279; AVX512-NEXT:    vmovdqa 96(%rcx), %ymm9
8280; AVX512-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
8281; AVX512-NEXT:    vmovdqa 96(%rdx), %ymm4
8282; AVX512-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
8283; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
8284; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
8285; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm9[4],ymm4[5],ymm9[5],ymm4[6],ymm9[6],ymm4[7],ymm9[7],ymm4[12],ymm9[12],ymm4[13],ymm9[13],ymm4[14],ymm9[14],ymm4[15],ymm9[15]
8286; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7]
8287; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
8288; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
8289; AVX512-NEXT:    vmovdqa 96(%rsi), %ymm10
8290; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm10[2,1,2,3,6,5,6,7]
8291; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
8292; AVX512-NEXT:    vmovdqa 96(%rdi), %ymm6
8293; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm6[2,1,2,3,6,5,6,7]
8294; AVX512-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
8295; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11]
8296; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
8297; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm10[4],ymm6[5],ymm10[5],ymm6[6],ymm10[6],ymm6[7],ymm10[7],ymm6[12],ymm10[12],ymm6[13],ymm10[13],ymm6[14],ymm10[14],ymm6[15],ymm10[15]
8298; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[3,3,3,3]
8299; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
8300; AVX512-NEXT:    movw $18724, %ax # imm = 0x4924
8301; AVX512-NEXT:    kmovw %eax, %k1
8302; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
8303; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
8304; AVX512-NEXT:    vmovdqa 96(%r8), %ymm3
8305; AVX512-NEXT:    vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
8306; AVX512-NEXT:    vpshufb %ymm14, %ymm3, %ymm5
8307; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3]
8308; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5,6],ymm5[7]
8309; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
8310; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
8311; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0],ymm1[1,2],ymm5[3],ymm1[4,5],ymm5[6],ymm1[7]
8312; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
8313; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8314; AVX512-NEXT:    vmovdqa 64(%rcx), %ymm5
8315; AVX512-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8316; AVX512-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
8317; AVX512-NEXT:    vmovdqa 64(%rdx), %ymm7
8318; AVX512-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8319; AVX512-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
8320; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
8321; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
8322; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm7[4],ymm5[4],ymm7[5],ymm5[5],ymm7[6],ymm5[6],ymm7[7],ymm5[7],ymm7[12],ymm5[12],ymm7[13],ymm5[13],ymm7[14],ymm5[14],ymm7[15],ymm5[15]
8323; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7]
8324; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
8325; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
8326; AVX512-NEXT:    vmovdqa 64(%rsi), %ymm7
8327; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm7[2,1,2,3,6,5,6,7]
8328; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
8329; AVX512-NEXT:    vmovdqa 64(%rdi), %ymm2
8330; AVX512-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8331; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm2[2,1,2,3,6,5,6,7]
8332; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm5[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
8333; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[1],ymm1[1],ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[8],ymm1[8],ymm5[9],ymm1[9],ymm5[10],ymm1[10],ymm5[11],ymm1[11]
8334; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
8335; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm7[4],ymm2[5],ymm7[5],ymm2[6],ymm7[6],ymm2[7],ymm7[7],ymm2[12],ymm7[12],ymm2[13],ymm7[13],ymm2[14],ymm7[14],ymm2[15],ymm7[15]
8336; AVX512-NEXT:    vmovdqa64 %ymm7, %ymm20
8337; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[3,3,3,3]
8338; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm1, %zmm1
8339; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
8340; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
8341; AVX512-NEXT:    vmovdqa 64(%r8), %ymm12
8342; AVX512-NEXT:    vpshufb %ymm14, %ymm12, %ymm5
8343; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3]
8344; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5,6],ymm5[7]
8345; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm12[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
8346; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
8347; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0],ymm1[1,2],ymm5[3],ymm1[4,5],ymm5[6],ymm1[7]
8348; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
8349; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8350; AVX512-NEXT:    vmovdqa 32(%rcx), %ymm5
8351; AVX512-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
8352; AVX512-NEXT:    vmovdqa 32(%rdx), %ymm7
8353; AVX512-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
8354; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
8355; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
8356; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm7[4],ymm5[4],ymm7[5],ymm5[5],ymm7[6],ymm5[6],ymm7[7],ymm5[7],ymm7[12],ymm5[12],ymm7[13],ymm5[13],ymm7[14],ymm5[14],ymm7[15],ymm5[15]
8357; AVX512-NEXT:    vmovdqa64 %ymm7, %ymm29
8358; AVX512-NEXT:    vmovdqa64 %ymm5, %ymm18
8359; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7]
8360; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm8
8361; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm8[2,1,2,3,6,5,6,7]
8362; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm5[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
8363; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm2
8364; AVX512-NEXT:    vpshufd {{.*#+}} ymm7 = ymm2[2,1,2,3,6,5,6,7]
8365; AVX512-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm7[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
8366; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[8],ymm5[8],ymm7[9],ymm5[9],ymm7[10],ymm5[10],ymm7[11],ymm5[11]
8367; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
8368; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
8369; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm5[2,1,2,3]
8370; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm8[4],ymm2[5],ymm8[5],ymm2[6],ymm8[6],ymm2[7],ymm8[7],ymm2[12],ymm8[12],ymm2[13],ymm8[13],ymm2[14],ymm8[14],ymm2[15],ymm8[15]
8371; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm17
8372; AVX512-NEXT:    vmovdqa64 %ymm8, %ymm16
8373; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[3,3,3,3]
8374; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm1, %zmm1
8375; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
8376; AVX512-NEXT:    vmovdqa 32(%r8), %ymm15
8377; AVX512-NEXT:    vpshufb %ymm14, %ymm15, %ymm0
8378; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
8379; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm5
8380; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2,3],ymm0[4],ymm5[5,6],ymm0[7]
8381; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm15[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
8382; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
8383; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0],ymm1[1,2],ymm5[3],ymm1[4,5],ymm5[6],ymm1[7]
8384; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
8385; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8386; AVX512-NEXT:    vmovdqa (%rcx), %ymm8
8387; AVX512-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
8388; AVX512-NEXT:    vmovdqa (%rdx), %ymm13
8389; AVX512-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
8390; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
8391; AVX512-NEXT:    vpermq {{.*#+}} ymm19 = ymm0[2,2,2,2]
8392; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm13[4],ymm8[4],ymm13[5],ymm8[5],ymm13[6],ymm8[6],ymm13[7],ymm8[7],ymm13[12],ymm8[12],ymm13[13],ymm8[13],ymm13[14],ymm8[14],ymm13[15],ymm8[15]
8393; AVX512-NEXT:    vpshufd {{.*#+}} ymm7 = ymm1[1,2,3,3,5,6,7,7]
8394; AVX512-NEXT:    vmovdqa (%rsi), %ymm2
8395; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm2[2,1,2,3,6,5,6,7]
8396; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm5[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
8397; AVX512-NEXT:    vmovdqa (%rdi), %ymm5
8398; AVX512-NEXT:    vpshufd {{.*#+}} ymm11 = ymm5[2,1,2,3,6,5,6,7]
8399; AVX512-NEXT:    vpshuflw {{.*#+}} ymm11 = ymm11[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
8400; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[1],ymm0[1],ymm11[2],ymm0[2],ymm11[3],ymm0[3],ymm11[8],ymm0[8],ymm11[9],ymm0[9],ymm11[10],ymm0[10],ymm11[11],ymm0[11]
8401; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
8402; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm11 = ymm5[4],ymm2[4],ymm5[5],ymm2[5],ymm5[6],ymm2[6],ymm5[7],ymm2[7],ymm5[12],ymm2[12],ymm5[13],ymm2[13],ymm5[14],ymm2[14],ymm5[15],ymm2[15]
8403; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[3,3,3,3]
8404; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3]
8405; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm19, %zmm7
8406; AVX512-NEXT:    vinserti64x4 $1, %ymm11, %zmm0, %zmm0
8407; AVX512-NEXT:    vmovdqa32 %zmm7, %zmm0 {%k1}
8408; AVX512-NEXT:    vmovdqa (%r8), %ymm7
8409; AVX512-NEXT:    vpshufb %ymm14, %ymm7, %ymm11
8410; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3]
8411; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm14
8412; AVX512-NEXT:    vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3],ymm11[4],ymm14[5,6],ymm11[7]
8413; AVX512-NEXT:    vpshuflw {{.*#+}} ymm14 = ymm7[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
8414; AVX512-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3]
8415; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0],ymm0[1,2],ymm14[3],ymm0[4,5],ymm14[6],ymm0[7]
8416; AVX512-NEXT:    vinserti64x4 $1, %ymm11, %zmm0, %zmm0
8417; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8418; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm13[0],ymm8[0],ymm13[1],ymm8[1],ymm13[2],ymm8[2],ymm13[3],ymm8[3],ymm13[8],ymm8[8],ymm13[9],ymm8[9],ymm13[10],ymm8[10],ymm13[11],ymm8[11]
8419; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm23 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6]
8420; AVX512-NEXT:    vmovdqa (%rcx), %xmm13
8421; AVX512-NEXT:    vmovdqa (%rdx), %xmm14
8422; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
8423; AVX512-NEXT:    vpermt2d %zmm8, %zmm23, %zmm0
8424; AVX512-NEXT:    vmovdqa (%rsi), %xmm1
8425; AVX512-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8426; AVX512-NEXT:    vmovdqa (%rdi), %xmm8
8427; AVX512-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8428; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7]
8429; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[1,1,1,1]
8430; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[8],ymm2[8],ymm5[9],ymm2[9],ymm5[10],ymm2[10],ymm5[11],ymm2[11]
8431; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
8432; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm8, %zmm1
8433; AVX512-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
8434; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
8435; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
8436; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm5
8437; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7]
8438; AVX512-NEXT:    vmovdqa (%r8), %xmm8
8439; AVX512-NEXT:    vmovdqa {{.*#+}} xmm11 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
8440; AVX512-NEXT:    vpshufb %xmm11, %xmm8, %xmm5
8441; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1]
8442; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5,6],ymm5[7]
8443; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
8444; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8445; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm4[0],ymm9[0],ymm4[1],ymm9[1],ymm4[2],ymm9[2],ymm4[3],ymm9[3],ymm4[8],ymm9[8],ymm4[9],ymm9[9],ymm4[10],ymm9[10],ymm4[11],ymm9[11]
8446; AVX512-NEXT:    vmovdqa 96(%rcx), %xmm4
8447; AVX512-NEXT:    vmovdqa 96(%rdx), %xmm5
8448; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
8449; AVX512-NEXT:    vpermt2d %zmm0, %zmm23, %zmm7
8450; AVX512-NEXT:    vmovdqa 96(%rsi), %xmm0
8451; AVX512-NEXT:    vmovdqa 96(%rdi), %xmm2
8452; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
8453; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[1,1,1,1]
8454; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm6[0],ymm10[0],ymm6[1],ymm10[1],ymm6[2],ymm10[2],ymm6[3],ymm10[3],ymm6[8],ymm10[8],ymm6[9],ymm10[9],ymm6[10],ymm10[10],ymm6[11],ymm10[11]
8455; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
8456; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm1, %zmm1
8457; AVX512-NEXT:    vmovdqa32 %zmm1, %zmm7 {%k1}
8458; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
8459; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
8460; AVX512-NEXT:    vextracti64x4 $1, %zmm7, %ymm3
8461; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7]
8462; AVX512-NEXT:    vmovdqa 96(%r8), %xmm1
8463; AVX512-NEXT:    vpshufb %xmm11, %xmm1, %xmm6
8464; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1]
8465; AVX512-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7]
8466; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm6, %zmm3
8467; AVX512-NEXT:    vmovdqu64 %zmm3, (%rsp) # 64-byte Spill
8468; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
8469; AVX512-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
8470; AVX512-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
8471; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
8472; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm31 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17]
8473; AVX512-NEXT:    vpermt2d %zmm4, %zmm31, %zmm3
8474; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1]
8475; AVX512-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5]
8476; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[0,1,2,1]
8477; AVX512-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,5]
8478; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
8479; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
8480; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
8481; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm4[0,1,0,1]
8482; AVX512-NEXT:    movw $9362, %ax # imm = 0x2492
8483; AVX512-NEXT:    kmovw %eax, %k2
8484; AVX512-NEXT:    vmovdqa32 %zmm3, %zmm0 {%k2}
8485; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm1[2,1,3,3,4,5,6,7]
8486; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
8487; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
8488; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7]
8489; AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
8490; AVX512-NEXT:    vpbroadcastq %xmm1, %ymm1
8491; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
8492; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm22
8493; AVX512-NEXT:    vmovdqa 64(%rcx), %xmm3
8494; AVX512-NEXT:    vmovdqa 64(%rdx), %xmm2
8495; AVX512-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
8496; AVX512-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
8497; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8498; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
8499; AVX512-NEXT:    vpermt2d %zmm0, %zmm31, %zmm1
8500; AVX512-NEXT:    vmovdqa 64(%rsi), %xmm6
8501; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm6[0,1,2,1]
8502; AVX512-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,5]
8503; AVX512-NEXT:    vmovdqa 64(%rdi), %xmm0
8504; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[0,1,2,1]
8505; AVX512-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,5]
8506; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
8507; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
8508; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1]
8509; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[0,1,0,1]
8510; AVX512-NEXT:    vmovdqa32 %zmm1, %zmm4 {%k2}
8511; AVX512-NEXT:    vmovdqa 64(%r8), %xmm1
8512; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm1[2,1,3,3,4,5,6,7]
8513; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1]
8514; AVX512-NEXT:    vextracti64x4 $1, %zmm4, %ymm7
8515; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0],ymm7[1,2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7]
8516; AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm7 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
8517; AVX512-NEXT:    vpbroadcastq %xmm7, %ymm7
8518; AVX512-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm7[2],ymm4[3,4],ymm7[5],ymm4[6,7]
8519; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm4, %zmm28
8520; AVX512-NEXT:    vmovdqa 96(%r9), %ymm7
8521; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
8522; AVX512-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
8523; AVX512-NEXT:    vpermq {{.*#+}} ymm24 = ymm5[2,2,2,3]
8524; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm7[2,3,2,3,6,7,6,7]
8525; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm5[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
8526; AVX512-NEXT:    vpermq {{.*#+}} ymm25 = ymm5[2,1,2,3]
8527; AVX512-NEXT:    vmovdqa 64(%r9), %ymm4
8528; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
8529; AVX512-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
8530; AVX512-NEXT:    vpermq {{.*#+}} ymm26 = ymm5[2,2,2,3]
8531; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm4[2,3,2,3,6,7,6,7]
8532; AVX512-NEXT:    vmovdqa64 %ymm4, %ymm21
8533; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm5[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
8534; AVX512-NEXT:    vpermq {{.*#+}} ymm27 = ymm5[2,1,2,3]
8535; AVX512-NEXT:    vmovdqa 32(%r9), %ymm5
8536; AVX512-NEXT:    vpshuflw {{.*#+}} ymm9 = ymm5[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
8537; AVX512-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
8538; AVX512-NEXT:    vpermq {{.*#+}} ymm30 = ymm9[2,2,2,3]
8539; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
8540; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
8541; AVX512-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
8542; AVX512-NEXT:    # ymm3 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[2],mem[2],ymm3[3],mem[3],ymm3[8],mem[8],ymm3[9],mem[9],ymm3[10],mem[10],ymm3[11],mem[11]
8543; AVX512-NEXT:    vpermt2d %zmm2, %zmm23, %zmm3
8544; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
8545; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1]
8546; AVX512-NEXT:    vmovdqa64 %ymm20, %ymm6
8547; AVX512-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
8548; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[8],ymm6[8],ymm2[9],ymm6[9],ymm2[10],ymm6[10],ymm2[11],ymm6[11]
8549; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
8550; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
8551; AVX512-NEXT:    vmovdqa32 %zmm0, %zmm3 {%k1}
8552; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm12[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
8553; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
8554; AVX512-NEXT:    vextracti64x4 $1, %zmm3, %ymm2
8555; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
8556; AVX512-NEXT:    vmovdqa %xmm11, %xmm4
8557; AVX512-NEXT:    vpshufb %xmm11, %xmm1, %xmm1
8558; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
8559; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7]
8560; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm20
8561; AVX512-NEXT:    vmovdqa 32(%rcx), %xmm0
8562; AVX512-NEXT:    vmovdqa 32(%rdx), %xmm1
8563; AVX512-NEXT:    vmovdqa64 %ymm29, %ymm2
8564; AVX512-NEXT:    vmovdqa64 %ymm18, %ymm3
8565; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11]
8566; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
8567; AVX512-NEXT:    vpermt2d %zmm2, %zmm23, %zmm9
8568; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm2
8569; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm3
8570; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm10 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
8571; AVX512-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[1,1,1,1]
8572; AVX512-NEXT:    vmovdqa64 %ymm16, %ymm6
8573; AVX512-NEXT:    vmovdqa64 %ymm17, %ymm11
8574; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm11 = ymm11[0],ymm6[0],ymm11[1],ymm6[1],ymm11[2],ymm6[2],ymm11[3],ymm6[3],ymm11[8],ymm6[8],ymm11[9],ymm6[9],ymm11[10],ymm6[10],ymm11[11],ymm6[11]
8575; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3]
8576; AVX512-NEXT:    vinserti64x4 $1, %ymm11, %zmm10, %zmm11
8577; AVX512-NEXT:    vpshufd {{.*#+}} ymm10 = ymm5[2,3,2,3,6,7,6,7]
8578; AVX512-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm10[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
8579; AVX512-NEXT:    vpermq {{.*#+}} ymm17 = ymm10[2,1,2,3]
8580; AVX512-NEXT:    vmovdqa32 %zmm11, %zmm9 {%k1}
8581; AVX512-NEXT:    vextracti64x4 $1, %zmm9, %ymm11
8582; AVX512-NEXT:    vpshuflw {{.*#+}} ymm12 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
8583; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2]
8584; AVX512-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7]
8585; AVX512-NEXT:    vmovdqa 32(%r8), %xmm15
8586; AVX512-NEXT:    vpshufb %xmm4, %xmm15, %xmm12
8587; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1]
8588; AVX512-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0],ymm12[1],ymm9[2,3],ymm12[4],ymm9[5,6],ymm12[7]
8589; AVX512-NEXT:    vinserti64x4 $1, %ymm11, %zmm9, %zmm19
8590; AVX512-NEXT:    vmovdqa (%r9), %ymm11
8591; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8592; AVX512-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
8593; AVX512-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
8594; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8595; AVX512-NEXT:    vpermt2d %zmm0, %zmm31, %zmm12
8596; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm2[0,1,2,1]
8597; AVX512-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
8598; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm3[0,1,2,1]
8599; AVX512-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
8600; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
8601; AVX512-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm11[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
8602; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
8603; AVX512-NEXT:    vpermq {{.*#+}} ymm18 = ymm0[2,2,2,3]
8604; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
8605; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm11[2,3,2,3,6,7,6,7]
8606; AVX512-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
8607; AVX512-NEXT:    vpermq {{.*#+}} ymm16 = ymm2[2,1,2,3]
8608; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
8609; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,2,3],zmm1[0,1,0,1]
8610; AVX512-NEXT:    vmovdqa32 %zmm12, %zmm1 {%k2}
8611; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
8612; AVX512-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm15[2,1,3,3,4,5,6,7]
8613; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1]
8614; AVX512-NEXT:    vpblendd {{.*#+}} ymm3 = ymm12[0],ymm3[1,2],ymm12[3],ymm3[4,5],ymm12[6],ymm3[7]
8615; AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm12 = xmm15[0],zero,xmm15[1],zero,xmm15[2],zero,xmm15[3],zero
8616; AVX512-NEXT:    vpbroadcastq %xmm12, %ymm12
8617; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm12[2],ymm1[3,4],ymm12[5],ymm1[6,7]
8618; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm15
8619; AVX512-NEXT:    vmovdqa (%r9), %xmm3
8620; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
8621; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,1,4,5,6,7]
8622; AVX512-NEXT:    vpermq {{.*#+}} ymm23 = ymm1[0,1,0,1]
8623; AVX512-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm11[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
8624; AVX512-NEXT:    vpermq {{.*#+}} ymm29 = ymm1[2,2,2,2]
8625; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
8626; AVX512-NEXT:    vpsrldq {{.*#+}} xmm11 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
8627; AVX512-NEXT:    vpsrldq {{.*#+}} xmm12 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
8628; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
8629; AVX512-NEXT:    vpermt2d %zmm11, %zmm31, %zmm1
8630; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8631; AVX512-NEXT:    vpshufd {{.*#+}} xmm11 = xmm0[0,1,2,1]
8632; AVX512-NEXT:    vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,5]
8633; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8634; AVX512-NEXT:    vpshufd {{.*#+}} xmm12 = xmm2[0,1,2,1]
8635; AVX512-NEXT:    vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,7,6,5]
8636; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm11 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
8637; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
8638; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1]
8639; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm11 = zmm12[0,1,2,3],zmm11[0,1,0,1]
8640; AVX512-NEXT:    vmovdqa32 %zmm1, %zmm11 {%k2}
8641; AVX512-NEXT:    vextracti64x4 $1, %zmm11, %ymm1
8642; AVX512-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm8[2,1,3,3,4,5,6,7]
8643; AVX512-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1]
8644; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm12[0],ymm1[1,2],ymm12[3],ymm1[4,5],ymm12[6],ymm1[7]
8645; AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero
8646; AVX512-NEXT:    vpbroadcastq %xmm8, %ymm8
8647; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm8[2],ymm11[3,4],ymm8[5],ymm11[6,7]
8648; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm8, %zmm1
8649; AVX512-NEXT:    vmovdqa 96(%r9), %xmm8
8650; AVX512-NEXT:    vpshufd {{.*#+}} xmm11 = xmm8[2,3,2,3]
8651; AVX512-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm11[0,2,2,1,4,5,6,7]
8652; AVX512-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1]
8653; AVX512-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm7[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
8654; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2]
8655; AVX512-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm8[0,0,2,1,4,5,6,7]
8656; AVX512-NEXT:    vpbroadcastq %xmm12, %ymm12
8657; AVX512-NEXT:    vmovdqa 64(%r9), %xmm13
8658; AVX512-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7]
8659; AVX512-NEXT:    vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4]
8660; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1]
8661; AVX512-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm13[0,0,2,1,4,5,6,7]
8662; AVX512-NEXT:    vpbroadcastq %xmm14, %ymm14
8663; AVX512-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm13[0,2,2,3,4,5,6,7]
8664; AVX512-NEXT:    vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4]
8665; AVX512-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1]
8666; AVX512-NEXT:    vinserti64x4 $1, %ymm25, %zmm24, %zmm24
8667; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm25 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
8668; AVX512-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm24 # 64-byte Folded Reload
8669; AVX512-NEXT:    # zmm24 = zmm24 ^ (zmm25 & (zmm24 ^ mem))
8670; AVX512-NEXT:    vinserti64x4 $1, %ymm27, %zmm26, %zmm26
8671; AVX512-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[2,3,2,3]
8672; AVX512-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,1,4,5,6,7]
8673; AVX512-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1]
8674; AVX512-NEXT:    vmovdqa 32(%r9), %xmm0
8675; AVX512-NEXT:    vmovdqa64 %ymm21, %ymm2
8676; AVX512-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm2[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
8677; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,2,2]
8678; AVX512-NEXT:    vpshufd {{.*#+}} xmm10 = xmm0[2,3,2,3]
8679; AVX512-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm10[0,2,2,1,4,5,6,7]
8680; AVX512-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1]
8681; AVX512-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm26 # 64-byte Folded Reload
8682; AVX512-NEXT:    # zmm26 = zmm26 ^ (zmm25 & (zmm26 ^ mem))
8683; AVX512-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm5[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
8684; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2]
8685; AVX512-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm0[0,0,2,1,4,5,6,7]
8686; AVX512-NEXT:    vpbroadcastq %xmm9, %ymm9
8687; AVX512-NEXT:    vinserti64x4 $1, %ymm17, %zmm30, %zmm17
8688; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
8689; AVX512-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
8690; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
8691; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm3[0,0,2,1,4,5,6,7]
8692; AVX512-NEXT:    vpbroadcastq %xmm2, %ymm2
8693; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
8694; AVX512-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4]
8695; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
8696; AVX512-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm17 # 64-byte Folded Reload
8697; AVX512-NEXT:    # zmm17 = zmm17 ^ (zmm25 & (zmm17 ^ mem))
8698; AVX512-NEXT:    vinserti64x4 $1, %ymm16, %zmm18, %zmm16
8699; AVX512-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm16 # 64-byte Folded Reload
8700; AVX512-NEXT:    # zmm16 = zmm16 ^ (zmm25 & (zmm16 ^ mem))
8701; AVX512-NEXT:    vinserti64x4 $1, %ymm29, %zmm23, %zmm18
8702; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm23 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
8703; AVX512-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm18 # 64-byte Folded Reload
8704; AVX512-NEXT:    # zmm18 = zmm18 ^ (zmm23 & (zmm18 ^ mem))
8705; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm11, %zmm4
8706; AVX512-NEXT:    vpternlogd $184, (%rsp), %zmm23, %zmm4 # 64-byte Folded Reload
8707; AVX512-NEXT:    # zmm4 = zmm4 ^ (zmm23 & (zmm4 ^ mem))
8708; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm13, %zmm7
8709; AVX512-NEXT:    vpternlogd {{.*#+}} zmm7 = zmm7 ^ (zmm23 & (zmm7 ^ zmm20))
8710; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm10, %zmm5
8711; AVX512-NEXT:    vpternlogd {{.*#+}} zmm5 = zmm5 ^ (zmm23 & (zmm5 ^ zmm19))
8712; AVX512-NEXT:    vinserti64x4 $1, %ymm8, %zmm12, %zmm8
8713; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
8714; AVX512-NEXT:    vpternlogd {{.*#+}} zmm8 = zmm8 ^ (zmm10 & (zmm8 ^ zmm22))
8715; AVX512-NEXT:    vinserti64x4 $1, %ymm6, %zmm14, %zmm6
8716; AVX512-NEXT:    vpternlogd {{.*#+}} zmm6 = zmm6 ^ (zmm10 & (zmm6 ^ zmm28))
8717; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm9, %zmm0
8718; AVX512-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (zmm10 & (zmm0 ^ zmm15))
8719; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
8720; AVX512-NEXT:    vpternlogd {{.*#+}} zmm2 = zmm2 ^ (zmm10 & (zmm2 ^ zmm1))
8721; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
8722; AVX512-NEXT:    vmovdqa64 %zmm2, (%rax)
8723; AVX512-NEXT:    vmovdqa64 %zmm0, 192(%rax)
8724; AVX512-NEXT:    vmovdqa64 %zmm5, 256(%rax)
8725; AVX512-NEXT:    vmovdqa64 %zmm7, 448(%rax)
8726; AVX512-NEXT:    vmovdqa64 %zmm6, 384(%rax)
8727; AVX512-NEXT:    vmovdqa64 %zmm8, 576(%rax)
8728; AVX512-NEXT:    vmovdqa64 %zmm4, 640(%rax)
8729; AVX512-NEXT:    vmovdqa64 %zmm18, 64(%rax)
8730; AVX512-NEXT:    vmovdqa64 %zmm16, 128(%rax)
8731; AVX512-NEXT:    vmovdqa64 %zmm17, 320(%rax)
8732; AVX512-NEXT:    vmovdqa64 %zmm26, 512(%rax)
8733; AVX512-NEXT:    vmovdqa64 %zmm24, 704(%rax)
8734; AVX512-NEXT:    addq $392, %rsp # imm = 0x188
8735; AVX512-NEXT:    vzeroupper
8736; AVX512-NEXT:    retq
8737;
8738; AVX512-FCP-LABEL: store_i16_stride6_vf64:
8739; AVX512-FCP:       # %bb.0:
8740; AVX512-FCP-NEXT:    subq $1240, %rsp # imm = 0x4D8
8741; AVX512-FCP-NEXT:    vmovdqa 96(%rcx), %ymm1
8742; AVX512-FCP-NEXT:    vmovdqa 96(%rdx), %ymm2
8743; AVX512-FCP-NEXT:    vmovdqa 64(%rcx), %ymm6
8744; AVX512-FCP-NEXT:    vmovdqa 64(%rdx), %ymm5
8745; AVX512-FCP-NEXT:    vmovdqa 96(%rcx), %xmm9
8746; AVX512-FCP-NEXT:    vmovdqa 96(%rdx), %xmm10
8747; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm7
8748; AVX512-FCP-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8749; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %xmm0
8750; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8751; AVX512-FCP-NEXT:    vmovdqa 64(%rcx), %xmm13
8752; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm8
8753; AVX512-FCP-NEXT:    vmovdqa %xmm8, (%rsp) # 16-byte Spill
8754; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %xmm12
8755; AVX512-FCP-NEXT:    vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8756; AVX512-FCP-NEXT:    vmovdqa 64(%rdx), %xmm11
8757; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm4
8758; AVX512-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8759; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm3
8760; AVX512-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8761; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11]
8762; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
8763; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
8764; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8765; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
8766; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
8767; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
8768; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8769; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11]
8770; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7]
8771; AVX512-FCP-NEXT:    vmovdqa64 %xmm13, %xmm18
8772; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
8773; AVX512-FCP-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8774; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %ymm3
8775; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm7
8776; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm7[0],ymm3[0],ymm7[1],ymm3[1],ymm7[2],ymm3[2],ymm7[3],ymm3[3],ymm7[8],ymm3[8],ymm7[9],ymm3[9],ymm7[10],ymm3[10],ymm7[11],ymm3[11]
8777; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
8778; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm8, %zmm4
8779; AVX512-FCP-NEXT:    vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8780; AVX512-FCP-NEXT:    vmovdqa 96(%rsi), %ymm12
8781; AVX512-FCP-NEXT:    vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8782; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u]
8783; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm12, %ymm4
8784; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %ymm0
8785; AVX512-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8786; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm0, %ymm8
8787; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm8[0],ymm4[0],ymm8[1],ymm4[1],ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[8],ymm4[8],ymm8[9],ymm4[9],ymm8[10],ymm4[10],ymm8[11],ymm4[11]
8788; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm27 = [2,1,2,3,11,11,11,11]
8789; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm0[4],ymm12[4],ymm0[5],ymm12[5],ymm0[6],ymm12[6],ymm0[7],ymm12[7],ymm0[12],ymm12[12],ymm0[13],ymm12[13],ymm0[14],ymm12[14],ymm0[15],ymm12[15]
8790; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm27, %zmm8
8791; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm4 = [5,6,5,6,5,6,7,7]
8792; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm12 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15]
8793; AVX512-FCP-NEXT:    vpermd %ymm12, %ymm4, %ymm12
8794; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
8795; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
8796; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
8797; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
8798; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm0, %zmm0
8799; AVX512-FCP-NEXT:    movw $18724, %ax # imm = 0x4924
8800; AVX512-FCP-NEXT:    kmovw %eax, %k1
8801; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm8 {%k1}
8802; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm25 = [12,1,2,13,4,5,14,7]
8803; AVX512-FCP-NEXT:    vmovdqa %ymm8, %ymm0
8804; AVX512-FCP-NEXT:    vmovdqa 96(%r8), %ymm2
8805; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8806; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
8807; AVX512-FCP-NEXT:    vpermt2d %ymm1, %ymm25, %ymm0
8808; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm28 = [8,21,10,11,20,13,14,23]
8809; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
8810; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm2, %ymm1
8811; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm28, %zmm8
8812; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm0
8813; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8814; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm29 = [2,2,0,3,10,0,10,11]
8815; AVX512-FCP-NEXT:    vmovdqa 96(%r9), %ymm1
8816; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8817; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm15 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31]
8818; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm1, %ymm0
8819; AVX512-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm14 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
8820; AVX512-FCP-NEXT:    # ymm14 = mem[0,1,0,1]
8821; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm1, %ymm1
8822; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm29, %zmm1
8823; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8824; AVX512-FCP-NEXT:    vmovdqa 64(%rsi), %ymm2
8825; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8826; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm2, %ymm0
8827; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm8
8828; AVX512-FCP-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8829; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm8, %ymm1
8830; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
8831; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm8[4],ymm2[4],ymm8[5],ymm2[5],ymm8[6],ymm2[6],ymm8[7],ymm2[7],ymm8[12],ymm2[12],ymm8[13],ymm2[13],ymm8[14],ymm2[14],ymm8[15],ymm2[15]
8832; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm27, %zmm0
8833; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15]
8834; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm4, %ymm1
8835; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm6 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
8836; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm5 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
8837; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11]
8838; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2]
8839; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm5, %zmm1
8840; AVX512-FCP-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
8841; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm1
8842; AVX512-FCP-NEXT:    vmovdqa 64(%r8), %ymm2
8843; AVX512-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8844; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
8845; AVX512-FCP-NEXT:    vpermt2d %ymm5, %ymm25, %ymm1
8846; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm2, %ymm5
8847; AVX512-FCP-NEXT:    vpermt2d %zmm5, %zmm28, %zmm0
8848; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
8849; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8850; AVX512-FCP-NEXT:    vmovdqa 64(%r9), %ymm1
8851; AVX512-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8852; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm1, %ymm0
8853; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm1, %ymm1
8854; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm29, %zmm1
8855; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8856; AVX512-FCP-NEXT:    vmovdqa 96(%rsi), %xmm2
8857; AVX512-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8858; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
8859; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm2, %xmm0
8860; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %xmm5
8861; AVX512-FCP-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8862; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm5, %xmm1
8863; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
8864; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm16 = [0,0,2,1,8,9,8,9]
8865; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
8866; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm16, %zmm1
8867; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} ymm30 = [1,0,2,2,1,0,2,2]
8868; AVX512-FCP-NEXT:    # ymm30 = mem[0,1,2,3,0,1,2,3]
8869; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
8870; AVX512-FCP-NEXT:    vpermd %ymm0, %ymm30, %ymm0
8871; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
8872; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
8873; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
8874; AVX512-FCP-NEXT:    vpbroadcastq %xmm2, %ymm2
8875; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
8876; AVX512-FCP-NEXT:    movw $9362, %ax # imm = 0x2492
8877; AVX512-FCP-NEXT:    kmovw %eax, %k2
8878; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k2}
8879; AVX512-FCP-NEXT:    vmovdqa %ymm1, %ymm0
8880; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [16,9,10,17,12,13,18,15]
8881; AVX512-FCP-NEXT:    vmovdqa 96(%r8), %xmm5
8882; AVX512-FCP-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8883; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm5[2,1,3,3,4,5,6,7]
8884; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm17, %zmm1
8885; AVX512-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
8886; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm26 = [0,1,8,3,4,9,6,7]
8887; AVX512-FCP-NEXT:    vpermt2d %ymm2, %ymm26, %ymm0
8888; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
8889; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8890; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm31 = [0,0,0,0,8,8,0,9]
8891; AVX512-FCP-NEXT:    vmovdqa 96(%r9), %xmm1
8892; AVX512-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8893; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
8894; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm1, %xmm0
8895; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7]
8896; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm31, %zmm1
8897; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8898; AVX512-FCP-NEXT:    vmovdqa 64(%rsi), %xmm2
8899; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm2, %xmm0
8900; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %xmm5
8901; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm5, %xmm1
8902; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
8903; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
8904; AVX512-FCP-NEXT:    vmovdqa64 %xmm5, %xmm23
8905; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm24
8906; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm16, %zmm1
8907; AVX512-FCP-NEXT:    vmovdqa64 %xmm18, %xmm2
8908; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3]
8909; AVX512-FCP-NEXT:    vpermd %ymm0, %ymm30, %ymm0
8910; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
8911; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
8912; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
8913; AVX512-FCP-NEXT:    vpbroadcastq %xmm2, %ymm2
8914; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
8915; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k2}
8916; AVX512-FCP-NEXT:    vmovdqa %ymm1, %ymm0
8917; AVX512-FCP-NEXT:    vmovdqa 64(%r8), %xmm5
8918; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm5[2,1,3,3,4,5,6,7]
8919; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm17, %zmm1
8920; AVX512-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
8921; AVX512-FCP-NEXT:    vmovdqa64 %xmm5, %xmm19
8922; AVX512-FCP-NEXT:    vpermt2d %ymm2, %ymm26, %ymm0
8923; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
8924; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8925; AVX512-FCP-NEXT:    vmovdqa 64(%r9), %xmm1
8926; AVX512-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8927; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm1, %xmm0
8928; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7]
8929; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm31, %zmm1
8930; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8931; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm2
8932; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm2, %ymm0
8933; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm5
8934; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm5, %ymm1
8935; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
8936; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm5[4],ymm2[4],ymm5[5],ymm2[5],ymm5[6],ymm2[6],ymm5[7],ymm2[7],ymm5[12],ymm2[12],ymm5[13],ymm2[13],ymm5[14],ymm2[14],ymm5[15],ymm2[15]
8937; AVX512-FCP-NEXT:    vmovdqa64 %ymm5, %ymm21
8938; AVX512-FCP-NEXT:    vmovdqa64 %ymm2, %ymm20
8939; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm27, %zmm0
8940; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm7[4],ymm3[4],ymm7[5],ymm3[5],ymm7[6],ymm3[6],ymm7[7],ymm3[7],ymm7[12],ymm3[12],ymm7[13],ymm3[13],ymm7[14],ymm3[14],ymm7[15],ymm3[15]
8941; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm4, %ymm1
8942; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
8943; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm3 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
8944; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
8945; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
8946; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
8947; AVX512-FCP-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
8948; AVX512-FCP-NEXT:    vmovdqa %ymm0, %ymm1
8949; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %ymm3
8950; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
8951; AVX512-FCP-NEXT:    vpermt2d %ymm2, %ymm25, %ymm1
8952; AVX512-FCP-NEXT:    vmovdqa %ymm12, %ymm5
8953; AVX512-FCP-NEXT:    vpshufb %ymm12, %ymm3, %ymm2
8954; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm18
8955; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm28, %zmm0
8956; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
8957; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8958; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm12
8959; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm12, %ymm0
8960; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm11
8961; AVX512-FCP-NEXT:    vpshufb %ymm13, %ymm11, %ymm1
8962; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
8963; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15]
8964; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm27, %zmm0
8965; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %ymm3
8966; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm3, %ymm1
8967; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm3, %ymm2
8968; AVX512-FCP-NEXT:    vmovdqa64 %ymm3, %ymm22
8969; AVX512-FCP-NEXT:    vpermt2q %zmm1, %zmm29, %zmm2
8970; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8971; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
8972; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
8973; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15]
8974; AVX512-FCP-NEXT:    vpermd %ymm1, %ymm4, %ymm1
8975; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm3 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
8976; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} ymm4 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
8977; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
8978; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2]
8979; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
8980; AVX512-FCP-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
8981; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm10
8982; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm10[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
8983; AVX512-FCP-NEXT:    vpermi2d %ymm1, %ymm0, %ymm25
8984; AVX512-FCP-NEXT:    vpshufb %ymm5, %ymm10, %ymm1
8985; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm28, %zmm0
8986; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm25, %zmm25
8987; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm7
8988; AVX512-FCP-NEXT:    vpshufb %ymm15, %ymm7, %ymm0
8989; AVX512-FCP-NEXT:    vpshufb %ymm14, %ymm7, %ymm13
8990; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm29, %zmm13
8991; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm15
8992; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm14
8993; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm15, %xmm0
8994; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm14, %xmm1
8995; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
8996; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
8997; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm16, %zmm4
8998; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8999; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9000; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
9001; AVX512-FCP-NEXT:    vpermd %ymm0, %ymm30, %ymm5
9002; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm6
9003; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm3
9004; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm6, %xmm9
9005; AVX512-FCP-NEXT:    vpshufb %xmm8, %xmm3, %xmm8
9006; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
9007; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3]
9008; AVX512-FCP-NEXT:    vpermt2q %zmm8, %zmm16, %zmm0
9009; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm8 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
9010; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm9 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
9011; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
9012; AVX512-FCP-NEXT:    vpbroadcastq %xmm8, %ymm8
9013; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm5, %zmm5
9014; AVX512-FCP-NEXT:    vmovdqa32 %zmm5, %zmm4 {%k2}
9015; AVX512-FCP-NEXT:    vmovdqa %ymm4, %ymm5
9016; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %xmm8
9017; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm8[2,1,3,3,4,5,6,7]
9018; AVX512-FCP-NEXT:    vpermt2d %zmm9, %zmm17, %zmm4
9019; AVX512-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm9 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero
9020; AVX512-FCP-NEXT:    vpermt2d %ymm9, %ymm26, %ymm5
9021; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm16
9022; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9023; AVX512-FCP-NEXT:    vmovdqa (%rsp), %xmm2 # 16-byte Reload
9024; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
9025; AVX512-FCP-NEXT:    vpermd %ymm4, %ymm30, %ymm4
9026; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
9027; AVX512-FCP-NEXT:    vpsrldq {{.*#+}} xmm9 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
9028; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3]
9029; AVX512-FCP-NEXT:    vpbroadcastq %xmm5, %ymm5
9030; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm4, %zmm4
9031; AVX512-FCP-NEXT:    vmovdqa32 %zmm4, %zmm0 {%k2}
9032; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm1
9033; AVX512-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
9034; AVX512-FCP-NEXT:    vpermi2d %ymm4, %ymm0, %ymm26
9035; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm1[2,1,3,3,4,5,6,7]
9036; AVX512-FCP-NEXT:    vpermt2d %zmm4, %zmm17, %zmm0
9037; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %xmm9
9038; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
9039; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm9, %xmm4
9040; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm9[0,0,2,1,4,5,6,7]
9041; AVX512-FCP-NEXT:    vpermt2q %zmm4, %zmm31, %zmm5
9042; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm26, %zmm17
9043; AVX512-FCP-NEXT:    vmovdqa (%r9), %xmm0
9044; AVX512-FCP-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
9045; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm0[0,0,2,1,4,5,6,7]
9046; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm31, %zmm4
9047; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11]
9048; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
9049; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm12 = [1,1,1,1,10,10,10,11]
9050; AVX512-FCP-NEXT:    vpermt2q %zmm2, %zmm12, %zmm3
9051; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm26 = [1,2,1,2,0,0,3,3,13,12,10,10,13,12,14,14]
9052; AVX512-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm2 # 64-byte Folded Reload
9053; AVX512-FCP-NEXT:    vmovdqa32 %zmm3, %zmm2 {%k1}
9054; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm11 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
9055; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm1, %xmm3
9056; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm27 = [0,9,2,3,8,5,6,11]
9057; AVX512-FCP-NEXT:    vmovdqa %ymm2, %ymm1
9058; AVX512-FCP-NEXT:    vpermt2d %ymm3, %ymm27, %ymm1
9059; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
9060; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm30 = [8,9,20,11,12,21,14,15]
9061; AVX512-FCP-NEXT:    vpermt2d %zmm3, %zmm30, %zmm2
9062; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm7[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
9063; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
9064; AVX512-FCP-NEXT:    vpshufb %xmm6, %xmm0, %xmm3
9065; AVX512-FCP-NEXT:    vmovdqa64 %xmm6, %xmm31
9066; AVX512-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm28 = [0,0,0,1,0,10,10,0]
9067; AVX512-FCP-NEXT:    vpermt2q %zmm7, %zmm28, %zmm3
9068; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9069; AVX512-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
9070; AVX512-FCP-NEXT:    # ymm7 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11]
9071; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9072; AVX512-FCP-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
9073; AVX512-FCP-NEXT:    # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9074; AVX512-FCP-NEXT:    vpermt2q %zmm7, %zmm12, %zmm0
9075; AVX512-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm7 # 64-byte Folded Reload
9076; AVX512-FCP-NEXT:    vmovdqa32 %zmm0, %zmm7 {%k1}
9077; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9078; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm0, %xmm0
9079; AVX512-FCP-NEXT:    vmovdqa64 %ymm7, %ymm29
9080; AVX512-FCP-NEXT:    vpermt2d %ymm0, %ymm27, %ymm29
9081; AVX512-FCP-NEXT:    vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
9082; AVX512-FCP-NEXT:    # ymm0 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
9083; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm30, %zmm7
9084; AVX512-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9085; AVX512-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
9086; AVX512-FCP-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11]
9087; AVX512-FCP-NEXT:    vmovdqa64 %xmm23, %xmm10
9088; AVX512-FCP-NEXT:    vmovdqa64 %xmm24, %xmm6
9089; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7]
9090; AVX512-FCP-NEXT:    vpermt2q %zmm0, %zmm12, %zmm10
9091; AVX512-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm0 # 64-byte Folded Reload
9092; AVX512-FCP-NEXT:    vmovdqa32 %zmm10, %zmm0 {%k1}
9093; AVX512-FCP-NEXT:    vmovdqa64 %xmm19, %xmm6
9094; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm6, %xmm10
9095; AVX512-FCP-NEXT:    vmovdqa64 %ymm0, %ymm19
9096; AVX512-FCP-NEXT:    vpermt2d %ymm10, %ymm27, %ymm19
9097; AVX512-FCP-NEXT:    vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
9098; AVX512-FCP-NEXT:    # ymm10 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
9099; AVX512-FCP-NEXT:    vpermt2d %zmm10, %zmm30, %zmm0
9100; AVX512-FCP-NEXT:    vmovdqa64 %ymm21, %ymm6
9101; AVX512-FCP-NEXT:    vmovdqa64 %ymm20, %ymm10
9102; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm6[0],ymm10[0],ymm6[1],ymm10[1],ymm6[2],ymm10[2],ymm6[3],ymm10[3],ymm6[8],ymm10[8],ymm6[9],ymm10[9],ymm6[10],ymm10[10],ymm6[11],ymm10[11]
9103; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm14 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7]
9104; AVX512-FCP-NEXT:    vpermt2q %zmm10, %zmm12, %zmm14
9105; AVX512-FCP-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm10 # 64-byte Folded Reload
9106; AVX512-FCP-NEXT:    vmovdqa32 %zmm14, %zmm10 {%k1}
9107; AVX512-FCP-NEXT:    vpshufb %xmm11, %xmm8, %xmm8
9108; AVX512-FCP-NEXT:    vpermi2d %ymm8, %ymm10, %ymm27
9109; AVX512-FCP-NEXT:    vmovdqa64 %ymm18, %ymm6
9110; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
9111; AVX512-FCP-NEXT:    vpermt2d %zmm8, %zmm30, %zmm10
9112; AVX512-FCP-NEXT:    vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
9113; AVX512-FCP-NEXT:    # ymm8 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
9114; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
9115; AVX512-FCP-NEXT:    vmovdqa64 %xmm31, %xmm14
9116; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm11, %xmm11
9117; AVX512-FCP-NEXT:    vpermt2q %zmm8, %zmm28, %zmm11
9118; AVX512-FCP-NEXT:    vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
9119; AVX512-FCP-NEXT:    # ymm8 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
9120; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
9121; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm6, %xmm12
9122; AVX512-FCP-NEXT:    vpermt2q %zmm8, %zmm28, %zmm12
9123; AVX512-FCP-NEXT:    vpshufb %xmm14, %xmm9, %xmm6
9124; AVX512-FCP-NEXT:    vmovdqa64 %ymm22, %ymm8
9125; AVX512-FCP-NEXT:    vpshuflw {{.*#+}} ymm8 = ymm8[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
9126; AVX512-FCP-NEXT:    vpermt2q %zmm8, %zmm28, %zmm6
9127; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm27, %zmm8
9128; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
9129; AVX512-FCP-NEXT:    vpternlogd {{.*#+}} zmm6 = zmm6 ^ (zmm9 & (zmm6 ^ zmm8))
9130; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
9131; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, 256(%rax)
9132; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm19, %zmm0
9133; AVX512-FCP-NEXT:    vpternlogd {{.*#+}} zmm12 = zmm12 ^ (zmm9 & (zmm12 ^ zmm0))
9134; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, 448(%rax)
9135; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm29, %zmm0
9136; AVX512-FCP-NEXT:    vpternlogd {{.*#+}} zmm11 = zmm11 ^ (zmm9 & (zmm11 ^ zmm0))
9137; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, 640(%rax)
9138; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm0
9139; AVX512-FCP-NEXT:    vpternlogd {{.*#+}} zmm3 = zmm3 ^ (zmm9 & (zmm3 ^ zmm0))
9140; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, 64(%rax)
9141; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
9142; AVX512-FCP-NEXT:    vpternlogd {{.*#+}} zmm4 = zmm4 ^ (zmm0 & (zmm4 ^ zmm17))
9143; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, (%rax)
9144; AVX512-FCP-NEXT:    vpternlogd {{.*#+}} zmm5 = zmm5 ^ (zmm0 & (zmm5 ^ zmm16))
9145; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, 192(%rax)
9146; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
9147; AVX512-FCP-NEXT:    vpternlogd {{.*#+}} zmm13 = zmm13 ^ (zmm1 & (zmm13 ^ zmm25))
9148; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, 128(%rax)
9149; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
9150; AVX512-FCP-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm2 # 64-byte Folded Reload
9151; AVX512-FCP-NEXT:    # zmm2 = zmm2 ^ (zmm1 & (zmm2 ^ mem))
9152; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, 320(%rax)
9153; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
9154; AVX512-FCP-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
9155; AVX512-FCP-NEXT:    # zmm2 = zmm2 ^ (zmm0 & (zmm2 ^ mem))
9156; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, 384(%rax)
9157; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
9158; AVX512-FCP-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
9159; AVX512-FCP-NEXT:    # zmm2 = zmm2 ^ (zmm0 & (zmm2 ^ mem))
9160; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, 576(%rax)
9161; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9162; AVX512-FCP-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
9163; AVX512-FCP-NEXT:    # zmm0 = zmm0 ^ (zmm1 & (zmm0 ^ mem))
9164; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 512(%rax)
9165; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9166; AVX512-FCP-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
9167; AVX512-FCP-NEXT:    # zmm0 = zmm0 ^ (zmm1 & (zmm0 ^ mem))
9168; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 704(%rax)
9169; AVX512-FCP-NEXT:    addq $1240, %rsp # imm = 0x4D8
9170; AVX512-FCP-NEXT:    vzeroupper
9171; AVX512-FCP-NEXT:    retq
9172;
9173; AVX512DQ-LABEL: store_i16_stride6_vf64:
9174; AVX512DQ:       # %bb.0:
9175; AVX512DQ-NEXT:    subq $584, %rsp # imm = 0x248
9176; AVX512DQ-NEXT:    vmovdqa 96(%rcx), %xmm0
9177; AVX512DQ-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
9178; AVX512DQ-NEXT:    vmovdqa 96(%rdx), %xmm2
9179; AVX512DQ-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9180; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
9181; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
9182; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
9183; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
9184; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17]
9185; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
9186; AVX512DQ-NEXT:    vmovdqa 96(%rsi), %xmm4
9187; AVX512DQ-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9188; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm4[0,1,2,1]
9189; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5]
9190; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %xmm5
9191; AVX512DQ-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9192; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm5[0,1,2,1]
9193; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5]
9194; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
9195; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
9196; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
9197; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[0,1,0,1]
9198; AVX512DQ-NEXT:    movw $9362, %ax # imm = 0x2492
9199; AVX512DQ-NEXT:    kmovw %eax, %k1
9200; AVX512DQ-NEXT:    vmovdqa32 %zmm1, %zmm2 {%k1}
9201; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm2, %ymm1
9202; AVX512DQ-NEXT:    vmovdqa 96(%r8), %xmm4
9203; AVX512DQ-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9204; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm4[2,1,3,3,4,5,6,7]
9205; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
9206; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7]
9207; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
9208; AVX512DQ-NEXT:    vpbroadcastq %xmm3, %ymm3
9209; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
9210; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
9211; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9212; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %xmm5
9213; AVX512DQ-NEXT:    vmovdqa 64(%rcx), %xmm2
9214; AVX512DQ-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9215; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %xmm6
9216; AVX512DQ-NEXT:    vmovdqa 64(%rdx), %xmm3
9217; AVX512DQ-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9218; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
9219; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
9220; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
9221; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
9222; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
9223; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %xmm4
9224; AVX512DQ-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9225; AVX512DQ-NEXT:    vmovdqa 64(%rsi), %xmm7
9226; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm7[0,1,2,1]
9227; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5]
9228; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %xmm8
9229; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm8[0,1,2,1]
9230; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5]
9231; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
9232; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
9233; AVX512DQ-NEXT:    vmovdqa64 %xmm8, %xmm29
9234; AVX512DQ-NEXT:    vmovdqa64 %xmm7, %xmm30
9235; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
9236; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[0,1,0,1]
9237; AVX512DQ-NEXT:    vmovdqa32 %zmm1, %zmm2 {%k1}
9238; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm2, %ymm1
9239; AVX512DQ-NEXT:    vmovdqa 64(%r8), %xmm7
9240; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm7[2,1,3,3,4,5,6,7]
9241; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
9242; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7]
9243; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero
9244; AVX512DQ-NEXT:    vmovdqa64 %xmm7, %xmm31
9245; AVX512DQ-NEXT:    vpbroadcastq %xmm3, %ymm3
9246; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
9247; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm7
9248; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
9249; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9250; AVX512DQ-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9251; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
9252; AVX512DQ-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9253; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
9254; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
9255; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
9256; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm0, %zmm2
9257; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm4[0,1,2,1]
9258; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
9259; AVX512DQ-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9260; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm3 = xmm7[0,1,2,1]
9261; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5]
9262; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
9263; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
9264; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
9265; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,2,3],zmm1[0,1,0,1]
9266; AVX512DQ-NEXT:    vmovdqa32 %zmm2, %zmm1 {%k1}
9267; AVX512DQ-NEXT:    vmovdqa 32(%r8), %xmm4
9268; AVX512DQ-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9269; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm4[2,1,3,3,4,5,6,7]
9270; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
9271; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
9272; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7]
9273; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
9274; AVX512DQ-NEXT:    vpbroadcastq %xmm3, %ymm3
9275; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7]
9276; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
9277; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9278; AVX512DQ-NEXT:    vmovdqa (%rcx), %xmm3
9279; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm4
9280; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
9281; AVX512DQ-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
9282; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
9283; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
9284; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm27
9285; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm28
9286; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm0, %zmm2
9287; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm3
9288; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm3[0,1,2,1]
9289; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
9290; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm4
9291; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm4[0,1,2,1]
9292; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
9293; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
9294; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
9295; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm20
9296; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm21
9297; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
9298; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[0,1,0,1]
9299; AVX512DQ-NEXT:    vmovdqa32 %zmm2, %zmm0 {%k1}
9300; AVX512DQ-NEXT:    vmovdqa (%r8), %xmm3
9301; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm3[2,1,3,3,4,5,6,7]
9302; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
9303; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
9304; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7]
9305; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
9306; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm18
9307; AVX512DQ-NEXT:    vpbroadcastq %xmm2, %ymm2
9308; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7]
9309; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
9310; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9311; AVX512DQ-NEXT:    vmovdqa 96(%rsi), %ymm2
9312; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm2[2,1,2,3,6,5,6,7]
9313; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
9314; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %ymm3
9315; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm3[2,1,2,3,6,5,6,7]
9316; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
9317; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
9318; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
9319; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
9320; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm22
9321; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm26
9322; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3]
9323; AVX512DQ-NEXT:    vmovdqa 96(%rcx), %ymm12
9324; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
9325; AVX512DQ-NEXT:    vmovdqa 96(%rdx), %ymm9
9326; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm3 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
9327; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
9328; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
9329; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm9[4],ymm12[4],ymm9[5],ymm12[5],ymm9[6],ymm12[6],ymm9[7],ymm12[7],ymm9[12],ymm12[12],ymm9[13],ymm12[13],ymm9[14],ymm12[14],ymm9[15],ymm12[15]
9330; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[1,2,3,3,5,6,7,7]
9331; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
9332; AVX512DQ-NEXT:    movw $18724, %ax # imm = 0x4924
9333; AVX512DQ-NEXT:    kmovw %eax, %k1
9334; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
9335; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm3, %zmm2, %zmm0 {%k1}
9336; AVX512DQ-NEXT:    vmovdqa 96(%r8), %ymm6
9337; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
9338; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm6, %ymm1
9339; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
9340; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
9341; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
9342; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
9343; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
9344; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7]
9345; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
9346; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9347; AVX512DQ-NEXT:    vmovdqa 64(%rsi), %ymm2
9348; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm2[2,1,2,3,6,5,6,7]
9349; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
9350; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %ymm3
9351; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm3[2,1,2,3,6,5,6,7]
9352; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
9353; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
9354; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
9355; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
9356; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm16
9357; AVX512DQ-NEXT:    vmovdqa64 %ymm2, %ymm17
9358; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3]
9359; AVX512DQ-NEXT:    vmovdqa 64(%rcx), %ymm7
9360; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
9361; AVX512DQ-NEXT:    vmovdqa 64(%rdx), %ymm5
9362; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm3 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
9363; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
9364; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
9365; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm5[4],ymm7[4],ymm5[5],ymm7[5],ymm5[6],ymm7[6],ymm5[7],ymm7[7],ymm5[12],ymm7[12],ymm5[13],ymm7[13],ymm5[14],ymm7[14],ymm5[15],ymm7[15]
9366; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[1,2,3,3,5,6,7,7]
9367; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
9368; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
9369; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm3, %zmm2, %zmm0 {%k1}
9370; AVX512DQ-NEXT:    vmovdqa 64(%r8), %ymm8
9371; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm8, %ymm1
9372; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
9373; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
9374; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
9375; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm8[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
9376; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
9377; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7]
9378; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
9379; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9380; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %ymm2
9381; AVX512DQ-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9382; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm2[2,1,2,3,6,5,6,7]
9383; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
9384; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm3
9385; AVX512DQ-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9386; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm3[2,1,2,3,6,5,6,7]
9387; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
9388; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
9389; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
9390; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
9391; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3]
9392; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %ymm11
9393; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm2 = ymm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm11[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
9394; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %ymm10
9395; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm3 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
9396; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
9397; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
9398; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm10[4],ymm11[4],ymm10[5],ymm11[5],ymm10[6],ymm11[6],ymm10[7],ymm11[7],ymm10[12],ymm11[12],ymm10[13],ymm11[13],ymm10[14],ymm11[14],ymm10[15],ymm11[15]
9399; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[1,2,3,3,5,6,7,7]
9400; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
9401; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
9402; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm3, %zmm2, %zmm0 {%k1}
9403; AVX512DQ-NEXT:    vmovdqa 32(%r8), %ymm3
9404; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm3, %ymm1
9405; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
9406; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
9407; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
9408; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
9409; AVX512DQ-NEXT:    vmovdqa64 %ymm3, %ymm19
9410; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
9411; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7]
9412; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
9413; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9414; AVX512DQ-NEXT:    vmovdqa (%rsi), %ymm4
9415; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm4[2,1,2,3,6,5,6,7]
9416; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
9417; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm3
9418; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm1 = ymm3[2,1,2,3,6,5,6,7]
9419; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
9420; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
9421; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm24 = ymm0[2,1,2,3]
9422; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15]
9423; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm23 = ymm0[3,3,3,3]
9424; AVX512DQ-NEXT:    vmovdqa (%rcx), %ymm2
9425; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
9426; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm0
9427; AVX512DQ-NEXT:    vpsrldq {{.*#+}} ymm15 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
9428; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm15[0],ymm1[0],ymm15[1],ymm1[1],ymm15[2],ymm1[2],ymm15[3],ymm1[3],ymm15[8],ymm1[8],ymm15[9],ymm1[9],ymm15[10],ymm1[10],ymm15[11],ymm1[11]
9429; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm25 = ymm1[2,2,2,2]
9430; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm15 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
9431; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm15 = ymm15[1,2,3,3,5,6,7,7]
9432; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3]
9433; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm23, %zmm24, %zmm1
9434; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm15, %zmm25, %zmm1 {%k1}
9435; AVX512DQ-NEXT:    vmovdqa (%r8), %ymm15
9436; AVX512DQ-NEXT:    vpshufb %ymm13, %ymm15, %ymm14
9437; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
9438; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm13
9439; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6],ymm14[7]
9440; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm14 = ymm15[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
9441; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3]
9442; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm14[0],ymm1[1,2],ymm14[3],ymm1[4,5],ymm14[6],ymm1[7]
9443; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm13, %zmm1, %zmm23
9444; AVX512DQ-NEXT:    vmovdqa64 %xmm27, %xmm1
9445; AVX512DQ-NEXT:    vmovdqa64 %xmm28, %xmm13
9446; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7]
9447; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
9448; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm24 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6]
9449; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm24, %zmm0
9450; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm1
9451; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm2
9452; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
9453; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[1,1,1,1]
9454; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11]
9455; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
9456; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm2, %zmm1, %zmm0 {%k1}
9457; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
9458; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
9459; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
9460; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
9461; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm4 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
9462; AVX512DQ-NEXT:    vmovdqa64 %xmm18, %xmm2
9463; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
9464; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
9465; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7]
9466; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm25
9467; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9468; AVX512DQ-NEXT:    vpunpckhwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
9469; AVX512DQ-NEXT:    # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9470; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm9[0],ymm12[0],ymm9[1],ymm12[1],ymm9[2],ymm12[2],ymm9[3],ymm12[3],ymm9[8],ymm12[8],ymm9[9],ymm12[9],ymm9[10],ymm12[10],ymm9[11],ymm12[11]
9471; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm24, %zmm1
9472; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9473; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
9474; AVX512DQ-NEXT:    # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9475; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1]
9476; AVX512DQ-NEXT:    vmovdqa64 %ymm22, %ymm2
9477; AVX512DQ-NEXT:    vmovdqa64 %ymm26, %ymm3
9478; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11]
9479; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
9480; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm2, %zmm0, %zmm1 {%k1}
9481; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
9482; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
9483; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
9484; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
9485; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9486; AVX512DQ-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
9487; AVX512DQ-NEXT:    vmovdqa %xmm4, %xmm9
9488; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
9489; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
9490; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm26
9491; AVX512DQ-NEXT:    vmovdqa 96(%r9), %xmm6
9492; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm6[0,0,2,1,4,5,6,7]
9493; AVX512DQ-NEXT:    vpbroadcastq %xmm0, %ymm0
9494; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm6[0,2,2,3,4,5,6,7]
9495; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
9496; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
9497; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9498; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
9499; AVX512DQ-NEXT:    # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
9500; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[2],ymm7[2],ymm5[3],ymm7[3],ymm5[8],ymm7[8],ymm5[9],ymm7[9],ymm5[10],ymm7[10],ymm5[11],ymm7[11]
9501; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm24, %zmm3
9502; AVX512DQ-NEXT:    vmovdqa64 %xmm29, %xmm2
9503; AVX512DQ-NEXT:    vmovdqa64 %xmm30, %xmm4
9504; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
9505; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1]
9506; AVX512DQ-NEXT:    vmovdqa64 %ymm16, %ymm4
9507; AVX512DQ-NEXT:    vmovdqa64 %ymm17, %ymm5
9508; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11]
9509; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm16
9510; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
9511; AVX512DQ-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm16 # 64-byte Folded Reload
9512; AVX512DQ-NEXT:    # zmm16 = zmm16 ^ (zmm28 & (zmm16 ^ mem))
9513; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm1 = ymm4[2,2,2,3]
9514; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm2, %zmm3 {%k1}
9515; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm3, %ymm1
9516; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
9517; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
9518; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
9519; AVX512DQ-NEXT:    vmovdqa (%r9), %xmm14
9520; AVX512DQ-NEXT:    vmovdqa64 %xmm31, %xmm0
9521; AVX512DQ-NEXT:    vpshufb %xmm9, %xmm0, %xmm2
9522; AVX512DQ-NEXT:    vmovdqa %xmm9, %xmm5
9523; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
9524; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7]
9525; AVX512DQ-NEXT:    vmovdqa 32(%r9), %xmm8
9526; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm27
9527; AVX512DQ-NEXT:    vmovdqa 64(%r9), %xmm9
9528; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm9[0,0,2,1,4,5,6,7]
9529; AVX512DQ-NEXT:    vpbroadcastq %xmm1, %ymm1
9530; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm9[0,2,2,3,4,5,6,7]
9531; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4]
9532; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
9533; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
9534; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm8[0,0,2,1,4,5,6,7]
9535; AVX512DQ-NEXT:    vpbroadcastq %xmm2, %ymm0
9536; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm8[0,2,2,3,4,5,6,7]
9537; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4]
9538; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm12 = ymm2[0,0,2,1]
9539; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm14[0,0,2,1,4,5,6,7]
9540; AVX512DQ-NEXT:    vpbroadcastq %xmm2, %ymm29
9541; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9542; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload
9543; AVX512DQ-NEXT:    # xmm3 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
9544; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm14[0,2,2,3,4,5,6,7]
9545; AVX512DQ-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4]
9546; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm30 = ymm2[0,0,2,1]
9547; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[8],ymm11[8],ymm10[9],ymm11[9],ymm10[10],ymm11[10],ymm10[11],ymm11[11]
9548; AVX512DQ-NEXT:    vmovdqa 96(%r9), %ymm11
9549; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm24, %zmm2
9550; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm11[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
9551; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
9552; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm24 = ymm3[2,2,2,3]
9553; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm4 = ymm11[2,3,2,3,6,7,6,7]
9554; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm4[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
9555; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm31 = ymm4[2,1,2,3]
9556; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9557; AVX512DQ-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm10 # 16-byte Folded Reload
9558; AVX512DQ-NEXT:    # xmm10 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
9559; AVX512DQ-NEXT:    vmovdqa 64(%r9), %ymm4
9560; AVX512DQ-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9561; AVX512DQ-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload
9562; AVX512DQ-NEXT:    # ymm7 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[2],mem[2],ymm3[3],mem[3],ymm3[8],mem[8],ymm3[9],mem[9],ymm3[10],mem[10],ymm3[11],mem[11]
9563; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm13 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
9564; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
9565; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm18 = ymm13[2,2,2,3]
9566; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[1,1,1,1]
9567; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3]
9568; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm7, %zmm10, %zmm2 {%k1}
9569; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm7 = ymm4[2,3,2,3,6,7,6,7]
9570; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm7[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
9571; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3]
9572; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm2, %ymm10
9573; AVX512DQ-NEXT:    vmovdqa64 %ymm19, %ymm3
9574; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
9575; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2]
9576; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm15 = ymm10[0,1],ymm15[2],ymm10[3,4],ymm15[5],ymm10[6,7]
9577; AVX512DQ-NEXT:    vmovdqa 32(%r9), %ymm10
9578; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9579; AVX512DQ-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
9580; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm10[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
9581; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
9582; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
9583; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1]
9584; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7]
9585; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm3 = ymm10[2,3,2,3,6,7,6,7]
9586; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
9587; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3]
9588; AVX512DQ-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm1 # 64-byte Folded Reload
9589; AVX512DQ-NEXT:    # zmm1 = zmm1 ^ (zmm28 & (zmm1 ^ mem))
9590; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm12, %zmm0, %zmm12
9591; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm15, %zmm2, %zmm2
9592; AVX512DQ-NEXT:    vmovdqa (%r9), %ymm15
9593; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm15[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
9594; AVX512DQ-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
9595; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
9596; AVX512DQ-NEXT:    vpshufd {{.*#+}} ymm13 = ymm15[2,3,2,3,6,7,6,7]
9597; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm13 = ymm13[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15]
9598; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3]
9599; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm14 = xmm14[2,3,2,3]
9600; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm14 = xmm14[0,2,2,1,4,5,6,7]
9601; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1]
9602; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm15 = ymm15[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
9603; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2]
9604; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
9605; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,1,4,5,6,7]
9606; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1]
9607; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm11 = ymm11[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
9608; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2]
9609; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[2,3,2,3]
9610; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm9 = xmm9[0,2,2,1,4,5,6,7]
9611; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1]
9612; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm4[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
9613; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2]
9614; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3]
9615; AVX512DQ-NEXT:    vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,1,4,5,6,7]
9616; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1]
9617; AVX512DQ-NEXT:    vpshuflw {{.*#+}} ymm10 = ymm10[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
9618; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2]
9619; AVX512DQ-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm12 # 64-byte Folded Reload
9620; AVX512DQ-NEXT:    # zmm12 = zmm12 ^ (zmm28 & (zmm12 ^ mem))
9621; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm30, %zmm29, %zmm17
9622; AVX512DQ-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm17 # 64-byte Folded Reload
9623; AVX512DQ-NEXT:    # zmm17 = zmm17 ^ (zmm28 & (zmm17 ^ mem))
9624; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm31, %zmm24, %zmm22
9625; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm24 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
9626; AVX512DQ-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm22 # 64-byte Folded Reload
9627; AVX512DQ-NEXT:    # zmm22 = zmm22 ^ (zmm24 & (zmm22 ^ mem))
9628; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm7, %zmm18, %zmm7
9629; AVX512DQ-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm7 # 64-byte Folded Reload
9630; AVX512DQ-NEXT:    # zmm7 = zmm7 ^ (zmm24 & (zmm7 ^ mem))
9631; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm5, %zmm3
9632; AVX512DQ-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm3 # 64-byte Folded Reload
9633; AVX512DQ-NEXT:    # zmm3 = zmm3 ^ (zmm24 & (zmm3 ^ mem))
9634; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm13, %zmm0, %zmm0
9635; AVX512DQ-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (zmm24 & (zmm0 ^ zmm23))
9636; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm15, %zmm14, %zmm5
9637; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
9638; AVX512DQ-NEXT:    vpternlogd {{.*#+}} zmm5 = zmm5 ^ (zmm13 & (zmm5 ^ zmm25))
9639; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm11, %zmm6, %zmm6
9640; AVX512DQ-NEXT:    vpternlogd {{.*#+}} zmm6 = zmm6 ^ (zmm13 & (zmm6 ^ zmm26))
9641; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm9, %zmm4
9642; AVX512DQ-NEXT:    vpternlogd {{.*#+}} zmm4 = zmm4 ^ (zmm13 & (zmm4 ^ zmm27))
9643; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm10, %zmm8, %zmm8
9644; AVX512DQ-NEXT:    vpternlogd {{.*#+}} zmm8 = zmm8 ^ (zmm13 & (zmm8 ^ zmm2))
9645; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
9646; AVX512DQ-NEXT:    vmovdqa64 %zmm8, 256(%rax)
9647; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 448(%rax)
9648; AVX512DQ-NEXT:    vmovdqa64 %zmm6, 640(%rax)
9649; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 64(%rax)
9650; AVX512DQ-NEXT:    vmovdqa64 %zmm17, (%rax)
9651; AVX512DQ-NEXT:    vmovdqa64 %zmm12, 192(%rax)
9652; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 384(%rax)
9653; AVX512DQ-NEXT:    vmovdqa64 %zmm16, 576(%rax)
9654; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 128(%rax)
9655; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 320(%rax)
9656; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 512(%rax)
9657; AVX512DQ-NEXT:    vmovdqa64 %zmm22, 704(%rax)
9658; AVX512DQ-NEXT:    addq $584, %rsp # imm = 0x248
9659; AVX512DQ-NEXT:    vzeroupper
9660; AVX512DQ-NEXT:    retq
9661;
9662; AVX512DQ-FCP-LABEL: store_i16_stride6_vf64:
9663; AVX512DQ-FCP:       # %bb.0:
9664; AVX512DQ-FCP-NEXT:    subq $1176, %rsp # imm = 0x498
9665; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm1
9666; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9667; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm0
9668; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9669; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
9670; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm1
9671; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9672; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %xmm4
9673; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9674; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rcx), %xmm6
9675; AVX512DQ-FCP-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9676; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm2
9677; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9678; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %xmm5
9679; AVX512DQ-FCP-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9680; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdx), %xmm3
9681; AVX512DQ-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9682; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
9683; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
9684; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rcx), %ymm2
9685; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdx), %ymm7
9686; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm7[0],ymm2[0],ymm7[1],ymm2[1],ymm7[2],ymm2[2],ymm7[3],ymm2[3],ymm7[8],ymm2[8],ymm7[9],ymm2[9],ymm7[10],ymm2[10],ymm7[11],ymm2[11]
9687; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm7, %ymm29
9688; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm2, %ymm31
9689; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rcx), %xmm2
9690; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9691; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdx), %xmm7
9692; AVX512DQ-FCP-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9693; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7]
9694; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm2
9695; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rcx), %ymm1
9696; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9697; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdx), %ymm8
9698; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm8[0],ymm1[0],ymm8[1],ymm1[1],ymm8[2],ymm1[2],ymm8[3],ymm1[3],ymm8[8],ymm1[8],ymm8[9],ymm1[9],ymm8[10],ymm1[10],ymm8[11],ymm1[11]
9699; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm8, %ymm30
9700; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
9701; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm20
9702; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %ymm3
9703; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9704; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %ymm1
9705; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9706; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11]
9707; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
9708; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm16
9709; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm17 = [1,1,1,1,10,10,10,11]
9710; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm3
9711; AVX512DQ-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9712; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm1
9713; AVX512DQ-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9714; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11]
9715; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm4
9716; AVX512DQ-FCP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9717; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm3
9718; AVX512DQ-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9719; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
9720; AVX512DQ-FCP-NEXT:    vpermt2q %zmm1, %zmm17, %zmm3
9721; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm19 = [1,2,1,2,0,0,3,3,13,12,10,10,13,12,14,14]
9722; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm19, %zmm21
9723; AVX512DQ-FCP-NEXT:    movw $18724, %ax # imm = 0x4924
9724; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
9725; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm3, %zmm21 {%k1}
9726; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm24 = [0,9,2,3,8,5,6,11]
9727; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm21, %ymm1
9728; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm0
9729; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9730; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
9731; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm0, %xmm0
9732; AVX512DQ-FCP-NEXT:    vpermt2d %ymm0, %ymm24, %ymm1
9733; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9734; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm18 = [8,9,20,11,12,21,14,15]
9735; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm0
9736; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9737; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
9738; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm18, %zmm21
9739; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm22 = [0,0,0,1,0,10,10,0]
9740; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm0
9741; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9742; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
9743; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %xmm1
9744; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9745; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm14 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
9746; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm1, %xmm1
9747; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm22, %zmm1
9748; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9749; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rsi), %ymm9
9750; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %ymm5
9751; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm5[0],ymm9[0],ymm5[1],ymm9[1],ymm5[2],ymm9[2],ymm5[3],ymm9[3],ymm5[8],ymm9[8],ymm5[9],ymm9[9],ymm5[10],ymm9[10],ymm5[11],ymm9[11]
9752; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rsi), %xmm0
9753; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %xmm15
9754; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
9755; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm0, %xmm23
9756; AVX512DQ-FCP-NEXT:    vpermt2q %zmm3, %zmm17, %zmm4
9757; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm19, %zmm25
9758; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm4, %zmm25 {%k1}
9759; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm25, %ymm0
9760; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r8), %xmm1
9761; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9762; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm1, %xmm2
9763; AVX512DQ-FCP-NEXT:    vpermt2d %ymm2, %ymm24, %ymm0
9764; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9765; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r8), %ymm10
9766; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
9767; AVX512DQ-FCP-NEXT:    vpermt2d %zmm3, %zmm18, %zmm25
9768; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r9), %ymm11
9769; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm4 = ymm11[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
9770; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r9), %xmm0
9771; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9772; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm0, %xmm0
9773; AVX512DQ-FCP-NEXT:    vpermt2q %zmm4, %zmm22, %zmm0
9774; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9775; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rsi), %xmm0
9776; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9777; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %xmm1
9778; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9779; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rsi), %ymm13
9780; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm12
9781; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11]
9782; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
9783; AVX512DQ-FCP-NEXT:    vpermt2q %zmm7, %zmm17, %zmm6
9784; AVX512DQ-FCP-NEXT:    vpermd %zmm20, %zmm19, %zmm28
9785; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm6, %zmm28 {%k1}
9786; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm28, %ymm20
9787; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r8), %xmm0
9788; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9789; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm0, %xmm6
9790; AVX512DQ-FCP-NEXT:    vpermt2d %ymm6, %ymm24, %ymm20
9791; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r8), %ymm6
9792; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
9793; AVX512DQ-FCP-NEXT:    vpermt2d %zmm7, %zmm18, %zmm28
9794; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r9), %xmm1
9795; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9796; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r9), %ymm7
9797; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm7[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
9798; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm1, %xmm1
9799; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm22, %zmm1
9800; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9801; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm3
9802; AVX512DQ-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9803; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm1
9804; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, (%rsp) # 16-byte Spill
9805; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %ymm2
9806; AVX512DQ-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9807; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm0
9808; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9809; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
9810; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
9811; AVX512DQ-FCP-NEXT:    vpermt2q %zmm4, %zmm17, %zmm0
9812; AVX512DQ-FCP-NEXT:    vpermd %zmm16, %zmm19, %zmm19
9813; AVX512DQ-FCP-NEXT:    vmovdqa32 %zmm0, %zmm19 {%k1}
9814; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %xmm0
9815; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9816; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm0, %xmm0
9817; AVX512DQ-FCP-NEXT:    vpermi2d %ymm0, %ymm19, %ymm24
9818; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %ymm0
9819; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9820; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
9821; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm18, %zmm19
9822; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %xmm0
9823; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9824; AVX512DQ-FCP-NEXT:    vpshufb %xmm14, %xmm0, %xmm1
9825; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %ymm0
9826; AVX512DQ-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9827; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
9828; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm22, %zmm1
9829; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9830; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u]
9831; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm9, %ymm0
9832; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm5, %ymm1
9833; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
9834; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm5[4],ymm9[4],ymm5[5],ymm9[5],ymm5[6],ymm9[6],ymm5[7],ymm9[7],ymm5[12],ymm9[12],ymm5[13],ymm9[13],ymm5[14],ymm9[14],ymm5[15],ymm9[15]
9835; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm27 = [2,1,2,3,11,11,11,11]
9836; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm27, %zmm8
9837; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm26 = [5,6,5,6,5,6,7,7]
9838; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm29, %ymm2
9839; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm31, %ymm1
9840; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15]
9841; AVX512DQ-FCP-NEXT:    vpermd %ymm0, %ymm26, %ymm0
9842; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
9843; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm5 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
9844; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[1],ymm1[1],ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[8],ymm1[8],ymm5[9],ymm1[9],ymm5[10],ymm1[10],ymm5[11],ymm1[11]
9845; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
9846; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm0, %zmm1, %zmm8 {%k1}
9847; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm22 = [12,1,2,13,4,5,14,7]
9848; AVX512DQ-FCP-NEXT:    vmovdqa %ymm8, %ymm0
9849; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm10[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
9850; AVX512DQ-FCP-NEXT:    vpermt2d %ymm1, %ymm22, %ymm0
9851; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
9852; AVX512DQ-FCP-NEXT:    vpshufb %ymm2, %ymm10, %ymm1
9853; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, %ymm10
9854; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm29 = [8,21,10,11,20,13,14,23]
9855; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm29, %zmm8
9856; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm0
9857; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9858; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31]
9859; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm11, %ymm0
9860; AVX512DQ-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
9861; AVX512DQ-FCP-NEXT:    # ymm8 = mem[0,1,0,1]
9862; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm11, %ymm1
9863; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm31 = [2,2,0,3,10,0,10,11]
9864; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm31, %zmm1
9865; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm13, %ymm0
9866; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm12, %ymm2
9867; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
9868; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15]
9869; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm27, %zmm0
9870; AVX512DQ-FCP-NEXT:    vmovdqa64 %ymm30, %ymm4
9871; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9872; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
9873; AVX512DQ-FCP-NEXT:    vpermd %ymm2, %ymm26, %ymm2
9874; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm3 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
9875; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm4 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
9876; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
9877; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2]
9878; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm2, %zmm3, %zmm0 {%k1}
9879; AVX512DQ-FCP-NEXT:    vmovdqa %ymm0, %ymm2
9880; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm3 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
9881; AVX512DQ-FCP-NEXT:    vpermt2d %ymm3, %ymm22, %ymm2
9882; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm6, %ymm3
9883; AVX512DQ-FCP-NEXT:    vpermt2d %zmm3, %zmm29, %zmm0
9884; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
9885; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9886; AVX512DQ-FCP-NEXT:    vpshufb %ymm5, %ymm7, %ymm0
9887; AVX512DQ-FCP-NEXT:    vpshufb %ymm8, %ymm7, %ymm3
9888; AVX512DQ-FCP-NEXT:    vmovdqa %ymm8, %ymm12
9889; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm31, %zmm3
9890; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
9891; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm23, %xmm4
9892; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm4, %xmm0
9893; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm15, %xmm2
9894; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
9895; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3]
9896; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm18 = [0,0,2,1,8,9,8,9]
9897; AVX512DQ-FCP-NEXT:    vpermt2q %zmm0, %zmm18, %zmm2
9898; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} ymm30 = [1,0,2,2,1,0,2,2]
9899; AVX512DQ-FCP-NEXT:    # ymm30 = mem[0,1,2,3,0,1,2,3]
9900; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
9901; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9902; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
9903; AVX512DQ-FCP-NEXT:    vpermd %ymm0, %ymm30, %ymm0
9904; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
9905; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
9906; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
9907; AVX512DQ-FCP-NEXT:    vpbroadcastq %xmm4, %ymm4
9908; AVX512DQ-FCP-NEXT:    movw $9362, %ax # imm = 0x2492
9909; AVX512DQ-FCP-NEXT:    kmovw %eax, %k2
9910; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm4, %zmm0, %zmm2 {%k2}
9911; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9912; AVX512DQ-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
9913; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,1,3,3,4,5,6,7]
9914; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, %ymm5
9915; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm17 = [16,9,10,17,12,13,18,15]
9916; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm17, %zmm2
9917; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [0,1,8,3,4,9,6,7]
9918; AVX512DQ-FCP-NEXT:    vpermt2d %ymm4, %ymm0, %ymm5
9919; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm5, %zmm2
9920; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9921; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
9922; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
9923; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm5, %xmm2
9924; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm5[0,0,2,1,4,5,6,7]
9925; AVX512DQ-FCP-NEXT:    vpmovsxbq {{.*#+}} zmm16 = [0,0,0,0,8,8,0,9]
9926; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm16, %zmm4
9927; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
9928; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm6, %xmm2
9929; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
9930; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm7, %xmm5
9931; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
9932; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
9933; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm18, %zmm5
9934; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
9935; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
9936; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
9937; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
9938; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm7 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
9939; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
9940; AVX512DQ-FCP-NEXT:    vpermd %ymm2, %ymm30, %ymm2
9941; AVX512DQ-FCP-NEXT:    vpbroadcastq %xmm6, %ymm6
9942; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm6, %zmm2, %zmm5 {%k2}
9943; AVX512DQ-FCP-NEXT:    vmovdqa %ymm5, %ymm2
9944; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
9945; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm6 = xmm7[2,1,3,3,4,5,6,7]
9946; AVX512DQ-FCP-NEXT:    vpermt2d %zmm6, %zmm17, %zmm5
9947; AVX512DQ-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm6 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero
9948; AVX512DQ-FCP-NEXT:    vpermt2d %ymm6, %ymm0, %ymm2
9949; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm2, %zmm23
9950; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
9951; AVX512DQ-FCP-NEXT:    vpshufb %xmm9, %xmm5, %xmm2
9952; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm5[0,0,2,1,4,5,6,7]
9953; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm16, %zmm5
9954; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
9955; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm9, %ymm2
9956; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
9957; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm7, %ymm6
9958; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[2],ymm2[2],ymm6[3],ymm2[3],ymm6[8],ymm2[8],ymm6[9],ymm2[9],ymm6[10],ymm2[10],ymm6[11],ymm2[11]
9959; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm7[4],ymm9[4],ymm7[5],ymm9[5],ymm7[6],ymm9[6],ymm7[7],ymm9[7],ymm7[12],ymm9[12],ymm7[13],ymm9[13],ymm7[14],ymm9[14],ymm7[15],ymm9[15]
9960; AVX512DQ-FCP-NEXT:    vpermt2q %zmm2, %zmm27, %zmm6
9961; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
9962; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
9963; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm9[4],ymm7[4],ymm9[5],ymm7[5],ymm9[6],ymm7[6],ymm9[7],ymm7[7],ymm9[12],ymm7[12],ymm9[13],ymm7[13],ymm9[14],ymm7[14],ymm9[15],ymm7[15]
9964; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm7 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
9965; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm9 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
9966; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm9[0],ymm7[0],ymm9[1],ymm7[1],ymm9[2],ymm7[2],ymm9[3],ymm7[3],ymm9[8],ymm7[8],ymm9[9],ymm7[9],ymm9[10],ymm7[10],ymm9[11],ymm7[11]
9967; AVX512DQ-FCP-NEXT:    vpermd %ymm2, %ymm26, %ymm2
9968; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,2,2]
9969; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm2, %zmm7, %zmm6 {%k1}
9970; AVX512DQ-FCP-NEXT:    vmovdqa %ymm6, %ymm7
9971; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
9972; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm9[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
9973; AVX512DQ-FCP-NEXT:    vpermt2d %ymm2, %ymm22, %ymm7
9974; AVX512DQ-FCP-NEXT:    vmovdqa %ymm10, %ymm13
9975; AVX512DQ-FCP-NEXT:    vpshufb %ymm10, %ymm9, %ymm2
9976; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm29, %zmm6
9977; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
9978; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm11, %ymm2
9979; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
9980; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm10, %ymm9
9981; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm9[0],ymm2[0],ymm9[1],ymm2[1],ymm9[2],ymm2[2],ymm9[3],ymm2[3],ymm9[8],ymm2[8],ymm9[9],ymm2[9],ymm9[10],ymm2[10],ymm9[11],ymm2[11]
9982; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm9 = ymm10[4],ymm11[4],ymm10[5],ymm11[5],ymm10[6],ymm11[6],ymm10[7],ymm11[7],ymm10[12],ymm11[12],ymm10[13],ymm11[13],ymm10[14],ymm11[14],ymm10[15],ymm11[15]
9983; AVX512DQ-FCP-NEXT:    vpermt2q %zmm9, %zmm27, %zmm2
9984; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm7, %zmm27
9985; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
9986; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31]
9987; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm6, %ymm7
9988; AVX512DQ-FCP-NEXT:    vmovdqa %ymm12, %ymm15
9989; AVX512DQ-FCP-NEXT:    vpshufb %ymm12, %ymm6, %ymm6
9990; AVX512DQ-FCP-NEXT:    vpermt2q %zmm7, %zmm31, %zmm6
9991; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
9992; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
9993; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm7 = ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15]
9994; AVX512DQ-FCP-NEXT:    vpermd %ymm7, %ymm26, %ymm7
9995; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm11 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
9996; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} ymm12 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
9997; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm11 = ymm12[0],ymm11[0],ymm12[1],ymm11[1],ymm12[2],ymm11[2],ymm12[3],ymm11[3],ymm12[8],ymm11[8],ymm12[9],ymm11[9],ymm12[10],ymm11[10],ymm12[11],ymm11[11]
9998; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2]
9999; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm7, %zmm11, %zmm2 {%k1}
10000; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
10001; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm9[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
10002; AVX512DQ-FCP-NEXT:    vpermi2d %ymm7, %ymm2, %ymm22
10003; AVX512DQ-FCP-NEXT:    vpshufb %ymm13, %ymm9, %ymm7
10004; AVX512DQ-FCP-NEXT:    vpermt2d %zmm7, %zmm29, %zmm2
10005; AVX512DQ-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10006; AVX512DQ-FCP-NEXT:    vpshufb %ymm14, %ymm7, %ymm11
10007; AVX512DQ-FCP-NEXT:    vpshufb %ymm15, %ymm7, %ymm7
10008; AVX512DQ-FCP-NEXT:    vpermt2q %zmm11, %zmm31, %zmm7
10009; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
10010; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm9, %xmm11
10011; AVX512DQ-FCP-NEXT:    vmovdqa (%rsp), %xmm10 # 16-byte Reload
10012; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm10, %xmm12
10013; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
10014; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
10015; AVX512DQ-FCP-NEXT:    vpermt2q %zmm12, %zmm18, %zmm11
10016; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
10017; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
10018; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
10019; AVX512DQ-FCP-NEXT:    vpermd %ymm12, %ymm30, %ymm12
10020; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm13 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
10021; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm14 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
10022; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
10023; AVX512DQ-FCP-NEXT:    vpbroadcastq %xmm13, %ymm13
10024; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
10025; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm10, %xmm14
10026; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
10027; AVX512DQ-FCP-NEXT:    vpshufb %xmm8, %xmm9, %xmm8
10028; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm14 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7]
10029; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
10030; AVX512DQ-FCP-NEXT:    vpermt2q %zmm14, %zmm18, %zmm8
10031; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm13, %zmm12, %zmm11 {%k2}
10032; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
10033; AVX512DQ-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm12 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero
10034; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm13 = xmm9[2,1,3,3,4,5,6,7]
10035; AVX512DQ-FCP-NEXT:    vmovdqa %ymm11, %ymm14
10036; AVX512DQ-FCP-NEXT:    vpermt2d %zmm13, %zmm17, %zmm11
10037; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
10038; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
10039; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm13 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
10040; AVX512DQ-FCP-NEXT:    vpermd %ymm13, %ymm30, %ymm13
10041; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm15 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
10042; AVX512DQ-FCP-NEXT:    vpsrldq {{.*#+}} xmm10 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
10043; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3]
10044; AVX512DQ-FCP-NEXT:    vpbroadcastq %xmm10, %ymm10
10045; AVX512DQ-FCP-NEXT:    vinserti32x8 $1, %ymm10, %zmm13, %zmm8 {%k2}
10046; AVX512DQ-FCP-NEXT:    vpermt2d %ymm12, %ymm0, %ymm14
10047; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
10048; AVX512DQ-FCP-NEXT:    vpmovzxwd {{.*#+}} xmm10 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero
10049; AVX512DQ-FCP-NEXT:    vpermi2d %ymm10, %ymm8, %ymm0
10050; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm10 = xmm9[2,1,3,3,4,5,6,7]
10051; AVX512DQ-FCP-NEXT:    vpermt2d %zmm10, %zmm17, %zmm8
10052; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm22, %zmm2
10053; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm14, %zmm10
10054; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
10055; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} xmm13 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
10056; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm9, %xmm11
10057; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm12 = xmm9[0,0,2,1,4,5,6,7]
10058; AVX512DQ-FCP-NEXT:    vpermt2q %zmm11, %zmm16, %zmm12
10059; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm0
10060; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
10061; AVX512DQ-FCP-NEXT:    vpshufb %xmm13, %xmm9, %xmm8
10062; AVX512DQ-FCP-NEXT:    vpshuflw {{.*#+}} xmm11 = xmm9[0,0,2,1,4,5,6,7]
10063; AVX512DQ-FCP-NEXT:    vpermt2q %zmm8, %zmm16, %zmm11
10064; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
10065; AVX512DQ-FCP-NEXT:    vpternlogd {{.*#+}} zmm11 = zmm11 ^ (zmm8 & (zmm11 ^ zmm0))
10066; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
10067; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, (%rax)
10068; AVX512DQ-FCP-NEXT:    vpternlogd {{.*#+}} zmm12 = zmm12 ^ (zmm8 & (zmm12 ^ zmm10))
10069; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, 192(%rax)
10070; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
10071; AVX512DQ-FCP-NEXT:    vpternlogd {{.*#+}} zmm7 = zmm7 ^ (zmm0 & (zmm7 ^ zmm2))
10072; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, 128(%rax)
10073; AVX512DQ-FCP-NEXT:    vpternlogd {{.*#+}} zmm6 = zmm6 ^ (zmm0 & (zmm6 ^ zmm27))
10074; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, 320(%rax)
10075; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm19, %zmm24, %zmm2
10076; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
10077; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
10078; AVX512DQ-FCP-NEXT:    vpternlogd {{.*#+}} zmm7 = zmm7 ^ (zmm6 & (zmm7 ^ zmm2))
10079; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, 256(%rax)
10080; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm28, %zmm20, %zmm2
10081; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
10082; AVX512DQ-FCP-NEXT:    vpternlogd {{.*#+}} zmm7 = zmm7 ^ (zmm6 & (zmm7 ^ zmm2))
10083; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, 448(%rax)
10084; AVX512DQ-FCP-NEXT:    vpternlogd {{.*#+}} zmm5 = zmm5 ^ (zmm8 & (zmm5 ^ zmm23))
10085; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, 384(%rax)
10086; AVX512DQ-FCP-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm4 # 64-byte Folded Reload
10087; AVX512DQ-FCP-NEXT:    # zmm4 = zmm4 ^ (zmm8 & (zmm4 ^ mem))
10088; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, 576(%rax)
10089; AVX512DQ-FCP-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
10090; AVX512DQ-FCP-NEXT:    # zmm3 = zmm3 ^ (zmm0 & (zmm3 ^ mem))
10091; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, 512(%rax)
10092; AVX512DQ-FCP-NEXT:    vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload
10093; AVX512DQ-FCP-NEXT:    # zmm1 = zmm1 ^ (zmm0 & (zmm1 ^ mem))
10094; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 704(%rax)
10095; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10096; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm25, %zmm0, %zmm0
10097; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
10098; AVX512DQ-FCP-NEXT:    vpternlogd {{.*#+}} zmm1 = zmm1 ^ (zmm6 & (zmm1 ^ zmm0))
10099; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 640(%rax)
10100; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10101; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm21, %zmm0, %zmm0
10102; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
10103; AVX512DQ-FCP-NEXT:    vpternlogd {{.*#+}} zmm1 = zmm1 ^ (zmm6 & (zmm1 ^ zmm0))
10104; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 64(%rax)
10105; AVX512DQ-FCP-NEXT:    addq $1176, %rsp # imm = 0x498
10106; AVX512DQ-FCP-NEXT:    vzeroupper
10107; AVX512DQ-FCP-NEXT:    retq
10108;
10109; AVX512BW-LABEL: store_i16_stride6_vf64:
10110; AVX512BW:       # %bb.0:
10111; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm3
10112; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm0
10113; AVX512BW-NEXT:    vmovdqa64 (%rsi), %zmm16
10114; AVX512BW-NEXT:    vmovdqa64 64(%rsi), %zmm24
10115; AVX512BW-NEXT:    vmovdqa64 (%rdx), %zmm1
10116; AVX512BW-NEXT:    vmovdqa64 64(%rdx), %zmm7
10117; AVX512BW-NEXT:    vmovdqa64 (%rcx), %zmm12
10118; AVX512BW-NEXT:    vmovdqa64 64(%rcx), %zmm15
10119; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34,0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34]
10120; AVX512BW-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
10121; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm13
10122; AVX512BW-NEXT:    vpermt2w %zmm12, %zmm19, %zmm13
10123; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm26 = [0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37,0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37]
10124; AVX512BW-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3]
10125; AVX512BW-NEXT:    vmovdqa64 %zmm3, %zmm2
10126; AVX512BW-NEXT:    vpermt2w %zmm16, %zmm26, %zmm2
10127; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0]
10128; AVX512BW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
10129; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm20
10130; AVX512BW-NEXT:    vpermt2w %zmm15, %zmm11, %zmm20
10131; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61]
10132; AVX512BW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
10133; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm4
10134; AVX512BW-NEXT:    vpermt2w %zmm24, %zmm5, %zmm4
10135; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0,24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0]
10136; AVX512BW-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
10137; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm22
10138; AVX512BW-NEXT:    vpermt2w %zmm24, %zmm18, %zmm22
10139; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58,21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58]
10140; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
10141; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm6
10142; AVX512BW-NEXT:    vpermt2w %zmm15, %zmm8, %zmm6
10143; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50,0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50]
10144; AVX512BW-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
10145; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm23
10146; AVX512BW-NEXT:    vpermt2w %zmm15, %zmm21, %zmm23
10147; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53,16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53]
10148; AVX512BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
10149; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm9
10150; AVX512BW-NEXT:    vpermt2w %zmm24, %zmm10, %zmm9
10151; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45]
10152; AVX512BW-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
10153; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm14
10154; AVX512BW-NEXT:    vpermt2w %zmm24, %zmm17, %zmm14
10155; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm25
10156; AVX512BW-NEXT:    vpermt2w %zmm24, %zmm26, %zmm0
10157; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm26 = [8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0,8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0]
10158; AVX512BW-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3]
10159; AVX512BW-NEXT:    vpermt2w %zmm24, %zmm26, %zmm25
10160; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm24 = [13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0]
10161; AVX512BW-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3]
10162; AVX512BW-NEXT:    vpermi2w %zmm16, %zmm3, %zmm5
10163; AVX512BW-NEXT:    vpermi2w %zmm16, %zmm3, %zmm18
10164; AVX512BW-NEXT:    vpermi2w %zmm16, %zmm3, %zmm10
10165; AVX512BW-NEXT:    vpermi2w %zmm16, %zmm3, %zmm17
10166; AVX512BW-NEXT:    vpermt2w %zmm16, %zmm26, %zmm3
10167; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm26
10168; AVX512BW-NEXT:    vpermt2w %zmm15, %zmm24, %zmm26
10169; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm16
10170; AVX512BW-NEXT:    vpermt2w %zmm15, %zmm19, %zmm7
10171; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42,5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42]
10172; AVX512BW-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
10173; AVX512BW-NEXT:    vpermt2w %zmm15, %zmm19, %zmm16
10174; AVX512BW-NEXT:    vpermi2w %zmm12, %zmm1, %zmm11
10175; AVX512BW-NEXT:    vpermi2w %zmm12, %zmm1, %zmm8
10176; AVX512BW-NEXT:    vpermi2w %zmm12, %zmm1, %zmm21
10177; AVX512BW-NEXT:    vpermi2w %zmm12, %zmm1, %zmm24
10178; AVX512BW-NEXT:    vpermt2w %zmm12, %zmm19, %zmm1
10179; AVX512BW-NEXT:    movw $9362, %ax # imm = 0x2492
10180; AVX512BW-NEXT:    kmovd %eax, %k2
10181; AVX512BW-NEXT:    vmovdqa32 %zmm13, %zmm2 {%k2}
10182; AVX512BW-NEXT:    vmovdqa64 (%r8), %zmm12
10183; AVX512BW-NEXT:    movw $18724, %ax # imm = 0x4924
10184; AVX512BW-NEXT:    kmovd %eax, %k1
10185; AVX512BW-NEXT:    vmovdqa32 %zmm20, %zmm4 {%k1}
10186; AVX512BW-NEXT:    vmovdqa64 64(%r8), %zmm13
10187; AVX512BW-NEXT:    vmovdqa32 %zmm22, %zmm6 {%k1}
10188; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm15 = [0,1,2,3,32,0,6,7,8,9,33,0,12,13,14,15,34,0,18,19,20,21,35,0,24,25,26,27,36,0,30,31]
10189; AVX512BW-NEXT:    vpermt2w %zmm12, %zmm15, %zmm2
10190; AVX512BW-NEXT:    vmovdqa32 %zmm23, %zmm9 {%k2}
10191; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm19 = [58,0,2,3,4,5,59,0,8,9,10,11,60,0,14,15,16,17,61,0,20,21,22,23,62,0,26,27,28,29,63,0]
10192; AVX512BW-NEXT:    vpermt2w %zmm13, %zmm19, %zmm4
10193; AVX512BW-NEXT:    vmovdqa32 %zmm26, %zmm14 {%k1}
10194; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm20 = [0,1,53,0,4,5,6,7,54,0,10,11,12,13,55,0,16,17,18,19,56,0,22,23,24,25,57,0,28,29,30,31]
10195; AVX512BW-NEXT:    vpermt2w %zmm13, %zmm20, %zmm6
10196; AVX512BW-NEXT:    vmovdqa32 %zmm25, %zmm16 {%k1}
10197; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm22 = [0,1,2,3,48,0,6,7,8,9,49,0,12,13,14,15,50,0,18,19,20,21,51,0,24,25,26,27,52,0,30,31]
10198; AVX512BW-NEXT:    vpermt2w %zmm13, %zmm22, %zmm9
10199; AVX512BW-NEXT:    vmovdqa32 %zmm7, %zmm0 {%k2}
10200; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm7 = [42,0,2,3,4,5,43,0,8,9,10,11,44,0,14,15,16,17,45,0,20,21,22,23,46,0,26,27,28,29,47,0]
10201; AVX512BW-NEXT:    vpermt2w %zmm13, %zmm7, %zmm14
10202; AVX512BW-NEXT:    vpermt2w %zmm13, %zmm15, %zmm0
10203; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm15 = [0,1,37,0,4,5,6,7,38,0,10,11,12,13,39,0,16,17,18,19,40,0,22,23,24,25,41,0,28,29,30,31]
10204; AVX512BW-NEXT:    vpermt2w %zmm13, %zmm15, %zmm16
10205; AVX512BW-NEXT:    vmovdqa32 %zmm11, %zmm5 {%k1}
10206; AVX512BW-NEXT:    vpermt2w %zmm12, %zmm19, %zmm5
10207; AVX512BW-NEXT:    vmovdqa32 %zmm18, %zmm8 {%k1}
10208; AVX512BW-NEXT:    vpermt2w %zmm12, %zmm20, %zmm8
10209; AVX512BW-NEXT:    vmovdqa32 %zmm21, %zmm10 {%k2}
10210; AVX512BW-NEXT:    vmovdqa64 (%r9), %zmm11
10211; AVX512BW-NEXT:    vpermt2w %zmm12, %zmm22, %zmm10
10212; AVX512BW-NEXT:    vmovdqa64 64(%r9), %zmm13
10213; AVX512BW-NEXT:    vmovdqa32 %zmm24, %zmm17 {%k1}
10214; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm18 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31]
10215; AVX512BW-NEXT:    vpermt2w %zmm11, %zmm18, %zmm2
10216; AVX512BW-NEXT:    vpermt2w %zmm12, %zmm7, %zmm17
10217; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm7 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63]
10218; AVX512BW-NEXT:    vpermt2w %zmm13, %zmm7, %zmm4
10219; AVX512BW-NEXT:    vmovdqa32 %zmm3, %zmm1 {%k1}
10220; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm3 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31]
10221; AVX512BW-NEXT:    vpermt2w %zmm13, %zmm3, %zmm6
10222; AVX512BW-NEXT:    vpermt2w %zmm12, %zmm15, %zmm1
10223; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm12 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31]
10224; AVX512BW-NEXT:    vpermt2w %zmm13, %zmm12, %zmm9
10225; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm15 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47]
10226; AVX512BW-NEXT:    vpermt2w %zmm13, %zmm15, %zmm14
10227; AVX512BW-NEXT:    vpermt2w %zmm13, %zmm18, %zmm0
10228; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm18 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31]
10229; AVX512BW-NEXT:    vpermt2w %zmm13, %zmm18, %zmm16
10230; AVX512BW-NEXT:    vpermt2w %zmm11, %zmm7, %zmm5
10231; AVX512BW-NEXT:    vpermt2w %zmm11, %zmm3, %zmm8
10232; AVX512BW-NEXT:    vpermt2w %zmm11, %zmm12, %zmm10
10233; AVX512BW-NEXT:    vpermt2w %zmm11, %zmm15, %zmm17
10234; AVX512BW-NEXT:    vpermt2w %zmm11, %zmm18, %zmm1
10235; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
10236; AVX512BW-NEXT:    vmovdqa64 %zmm1, 64(%rax)
10237; AVX512BW-NEXT:    vmovdqa64 %zmm17, 128(%rax)
10238; AVX512BW-NEXT:    vmovdqa64 %zmm10, 192(%rax)
10239; AVX512BW-NEXT:    vmovdqa64 %zmm8, 256(%rax)
10240; AVX512BW-NEXT:    vmovdqa64 %zmm5, 320(%rax)
10241; AVX512BW-NEXT:    vmovdqa64 %zmm0, 384(%rax)
10242; AVX512BW-NEXT:    vmovdqa64 %zmm16, 448(%rax)
10243; AVX512BW-NEXT:    vmovdqa64 %zmm14, 512(%rax)
10244; AVX512BW-NEXT:    vmovdqa64 %zmm9, 576(%rax)
10245; AVX512BW-NEXT:    vmovdqa64 %zmm6, 640(%rax)
10246; AVX512BW-NEXT:    vmovdqa64 %zmm4, 704(%rax)
10247; AVX512BW-NEXT:    vmovdqa64 %zmm2, (%rax)
10248; AVX512BW-NEXT:    vzeroupper
10249; AVX512BW-NEXT:    retq
10250;
10251; AVX512BW-FCP-LABEL: store_i16_stride6_vf64:
10252; AVX512BW-FCP:       # %bb.0:
10253; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm3
10254; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm0
10255; AVX512BW-FCP-NEXT:    vmovdqa64 (%rsi), %zmm16
10256; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rsi), %zmm24
10257; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdx), %zmm1
10258; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdx), %zmm7
10259; AVX512BW-FCP-NEXT:    vmovdqa64 (%rcx), %zmm12
10260; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rcx), %zmm15
10261; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34,0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34]
10262; AVX512BW-FCP-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
10263; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm13
10264; AVX512BW-FCP-NEXT:    vpermt2w %zmm12, %zmm19, %zmm13
10265; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm26 = [0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37,0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37]
10266; AVX512BW-FCP-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3]
10267; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm2
10268; AVX512BW-FCP-NEXT:    vpermt2w %zmm16, %zmm26, %zmm2
10269; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0]
10270; AVX512BW-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
10271; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm20
10272; AVX512BW-FCP-NEXT:    vpermt2w %zmm15, %zmm11, %zmm20
10273; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61]
10274; AVX512BW-FCP-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
10275; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm4
10276; AVX512BW-FCP-NEXT:    vpermt2w %zmm24, %zmm5, %zmm4
10277; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0,24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0]
10278; AVX512BW-FCP-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
10279; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm22
10280; AVX512BW-FCP-NEXT:    vpermt2w %zmm24, %zmm18, %zmm22
10281; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58,21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58]
10282; AVX512BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
10283; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm6
10284; AVX512BW-FCP-NEXT:    vpermt2w %zmm15, %zmm8, %zmm6
10285; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50,0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50]
10286; AVX512BW-FCP-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
10287; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm23
10288; AVX512BW-FCP-NEXT:    vpermt2w %zmm15, %zmm21, %zmm23
10289; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53,16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53]
10290; AVX512BW-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
10291; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm9
10292; AVX512BW-FCP-NEXT:    vpermt2w %zmm24, %zmm10, %zmm9
10293; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45]
10294; AVX512BW-FCP-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
10295; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm14
10296; AVX512BW-FCP-NEXT:    vpermt2w %zmm24, %zmm17, %zmm14
10297; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm25
10298; AVX512BW-FCP-NEXT:    vpermt2w %zmm24, %zmm26, %zmm0
10299; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm26 = [8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0,8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0]
10300; AVX512BW-FCP-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3]
10301; AVX512BW-FCP-NEXT:    vpermt2w %zmm24, %zmm26, %zmm25
10302; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm24 = [13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0]
10303; AVX512BW-FCP-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3]
10304; AVX512BW-FCP-NEXT:    vpermi2w %zmm16, %zmm3, %zmm5
10305; AVX512BW-FCP-NEXT:    vpermi2w %zmm16, %zmm3, %zmm18
10306; AVX512BW-FCP-NEXT:    vpermi2w %zmm16, %zmm3, %zmm10
10307; AVX512BW-FCP-NEXT:    vpermi2w %zmm16, %zmm3, %zmm17
10308; AVX512BW-FCP-NEXT:    vpermt2w %zmm16, %zmm26, %zmm3
10309; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm26
10310; AVX512BW-FCP-NEXT:    vpermt2w %zmm15, %zmm24, %zmm26
10311; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm16
10312; AVX512BW-FCP-NEXT:    vpermt2w %zmm15, %zmm19, %zmm7
10313; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42,5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42]
10314; AVX512BW-FCP-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
10315; AVX512BW-FCP-NEXT:    vpermt2w %zmm15, %zmm19, %zmm16
10316; AVX512BW-FCP-NEXT:    vpermi2w %zmm12, %zmm1, %zmm11
10317; AVX512BW-FCP-NEXT:    vpermi2w %zmm12, %zmm1, %zmm8
10318; AVX512BW-FCP-NEXT:    vpermi2w %zmm12, %zmm1, %zmm21
10319; AVX512BW-FCP-NEXT:    vpermi2w %zmm12, %zmm1, %zmm24
10320; AVX512BW-FCP-NEXT:    vpermt2w %zmm12, %zmm19, %zmm1
10321; AVX512BW-FCP-NEXT:    movw $9362, %ax # imm = 0x2492
10322; AVX512BW-FCP-NEXT:    kmovd %eax, %k2
10323; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm13, %zmm2 {%k2}
10324; AVX512BW-FCP-NEXT:    vmovdqa64 (%r8), %zmm12
10325; AVX512BW-FCP-NEXT:    movw $18724, %ax # imm = 0x4924
10326; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
10327; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm20, %zmm4 {%k1}
10328; AVX512BW-FCP-NEXT:    vmovdqa64 64(%r8), %zmm13
10329; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm22, %zmm6 {%k1}
10330; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm15 = [0,1,2,3,32,0,6,7,8,9,33,0,12,13,14,15,34,0,18,19,20,21,35,0,24,25,26,27,36,0,30,31]
10331; AVX512BW-FCP-NEXT:    vpermt2w %zmm12, %zmm15, %zmm2
10332; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm23, %zmm9 {%k2}
10333; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm19 = [58,0,2,3,4,5,59,0,8,9,10,11,60,0,14,15,16,17,61,0,20,21,22,23,62,0,26,27,28,29,63,0]
10334; AVX512BW-FCP-NEXT:    vpermt2w %zmm13, %zmm19, %zmm4
10335; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm26, %zmm14 {%k1}
10336; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm20 = [0,1,53,0,4,5,6,7,54,0,10,11,12,13,55,0,16,17,18,19,56,0,22,23,24,25,57,0,28,29,30,31]
10337; AVX512BW-FCP-NEXT:    vpermt2w %zmm13, %zmm20, %zmm6
10338; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm25, %zmm16 {%k1}
10339; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm22 = [0,1,2,3,48,0,6,7,8,9,49,0,12,13,14,15,50,0,18,19,20,21,51,0,24,25,26,27,52,0,30,31]
10340; AVX512BW-FCP-NEXT:    vpermt2w %zmm13, %zmm22, %zmm9
10341; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm7, %zmm0 {%k2}
10342; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm7 = [42,0,2,3,4,5,43,0,8,9,10,11,44,0,14,15,16,17,45,0,20,21,22,23,46,0,26,27,28,29,47,0]
10343; AVX512BW-FCP-NEXT:    vpermt2w %zmm13, %zmm7, %zmm14
10344; AVX512BW-FCP-NEXT:    vpermt2w %zmm13, %zmm15, %zmm0
10345; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm15 = [0,1,37,0,4,5,6,7,38,0,10,11,12,13,39,0,16,17,18,19,40,0,22,23,24,25,41,0,28,29,30,31]
10346; AVX512BW-FCP-NEXT:    vpermt2w %zmm13, %zmm15, %zmm16
10347; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm11, %zmm5 {%k1}
10348; AVX512BW-FCP-NEXT:    vpermt2w %zmm12, %zmm19, %zmm5
10349; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm18, %zmm8 {%k1}
10350; AVX512BW-FCP-NEXT:    vpermt2w %zmm12, %zmm20, %zmm8
10351; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm21, %zmm10 {%k2}
10352; AVX512BW-FCP-NEXT:    vmovdqa64 (%r9), %zmm11
10353; AVX512BW-FCP-NEXT:    vpermt2w %zmm12, %zmm22, %zmm10
10354; AVX512BW-FCP-NEXT:    vmovdqa64 64(%r9), %zmm13
10355; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm24, %zmm17 {%k1}
10356; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm18 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31]
10357; AVX512BW-FCP-NEXT:    vpermt2w %zmm11, %zmm18, %zmm2
10358; AVX512BW-FCP-NEXT:    vpermt2w %zmm12, %zmm7, %zmm17
10359; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm7 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63]
10360; AVX512BW-FCP-NEXT:    vpermt2w %zmm13, %zmm7, %zmm4
10361; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm3, %zmm1 {%k1}
10362; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm3 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31]
10363; AVX512BW-FCP-NEXT:    vpermt2w %zmm13, %zmm3, %zmm6
10364; AVX512BW-FCP-NEXT:    vpermt2w %zmm12, %zmm15, %zmm1
10365; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm12 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31]
10366; AVX512BW-FCP-NEXT:    vpermt2w %zmm13, %zmm12, %zmm9
10367; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm15 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47]
10368; AVX512BW-FCP-NEXT:    vpermt2w %zmm13, %zmm15, %zmm14
10369; AVX512BW-FCP-NEXT:    vpermt2w %zmm13, %zmm18, %zmm0
10370; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm18 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31]
10371; AVX512BW-FCP-NEXT:    vpermt2w %zmm13, %zmm18, %zmm16
10372; AVX512BW-FCP-NEXT:    vpermt2w %zmm11, %zmm7, %zmm5
10373; AVX512BW-FCP-NEXT:    vpermt2w %zmm11, %zmm3, %zmm8
10374; AVX512BW-FCP-NEXT:    vpermt2w %zmm11, %zmm12, %zmm10
10375; AVX512BW-FCP-NEXT:    vpermt2w %zmm11, %zmm15, %zmm17
10376; AVX512BW-FCP-NEXT:    vpermt2w %zmm11, %zmm18, %zmm1
10377; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
10378; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, 64(%rax)
10379; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, 128(%rax)
10380; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, 192(%rax)
10381; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, 256(%rax)
10382; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, 320(%rax)
10383; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, 384(%rax)
10384; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm16, 448(%rax)
10385; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, 512(%rax)
10386; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, 576(%rax)
10387; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, 640(%rax)
10388; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, 704(%rax)
10389; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, (%rax)
10390; AVX512BW-FCP-NEXT:    vzeroupper
10391; AVX512BW-FCP-NEXT:    retq
10392;
10393; AVX512DQ-BW-LABEL: store_i16_stride6_vf64:
10394; AVX512DQ-BW:       # %bb.0:
10395; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm3
10396; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm0
10397; AVX512DQ-BW-NEXT:    vmovdqa64 (%rsi), %zmm16
10398; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rsi), %zmm24
10399; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdx), %zmm1
10400; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdx), %zmm7
10401; AVX512DQ-BW-NEXT:    vmovdqa64 (%rcx), %zmm12
10402; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rcx), %zmm15
10403; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34,0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34]
10404; AVX512DQ-BW-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
10405; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm13
10406; AVX512DQ-BW-NEXT:    vpermt2w %zmm12, %zmm19, %zmm13
10407; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm26 = [0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37,0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37]
10408; AVX512DQ-BW-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3]
10409; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, %zmm2
10410; AVX512DQ-BW-NEXT:    vpermt2w %zmm16, %zmm26, %zmm2
10411; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0]
10412; AVX512DQ-BW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
10413; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, %zmm20
10414; AVX512DQ-BW-NEXT:    vpermt2w %zmm15, %zmm11, %zmm20
10415; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61]
10416; AVX512DQ-BW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
10417; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm4
10418; AVX512DQ-BW-NEXT:    vpermt2w %zmm24, %zmm5, %zmm4
10419; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0,24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0]
10420; AVX512DQ-BW-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
10421; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm22
10422; AVX512DQ-BW-NEXT:    vpermt2w %zmm24, %zmm18, %zmm22
10423; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58,21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58]
10424; AVX512DQ-BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
10425; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, %zmm6
10426; AVX512DQ-BW-NEXT:    vpermt2w %zmm15, %zmm8, %zmm6
10427; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50,0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50]
10428; AVX512DQ-BW-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
10429; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, %zmm23
10430; AVX512DQ-BW-NEXT:    vpermt2w %zmm15, %zmm21, %zmm23
10431; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53,16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53]
10432; AVX512DQ-BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
10433; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm9
10434; AVX512DQ-BW-NEXT:    vpermt2w %zmm24, %zmm10, %zmm9
10435; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45]
10436; AVX512DQ-BW-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
10437; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm14
10438; AVX512DQ-BW-NEXT:    vpermt2w %zmm24, %zmm17, %zmm14
10439; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm25
10440; AVX512DQ-BW-NEXT:    vpermt2w %zmm24, %zmm26, %zmm0
10441; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm26 = [8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0,8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0]
10442; AVX512DQ-BW-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3]
10443; AVX512DQ-BW-NEXT:    vpermt2w %zmm24, %zmm26, %zmm25
10444; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm24 = [13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0]
10445; AVX512DQ-BW-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3]
10446; AVX512DQ-BW-NEXT:    vpermi2w %zmm16, %zmm3, %zmm5
10447; AVX512DQ-BW-NEXT:    vpermi2w %zmm16, %zmm3, %zmm18
10448; AVX512DQ-BW-NEXT:    vpermi2w %zmm16, %zmm3, %zmm10
10449; AVX512DQ-BW-NEXT:    vpermi2w %zmm16, %zmm3, %zmm17
10450; AVX512DQ-BW-NEXT:    vpermt2w %zmm16, %zmm26, %zmm3
10451; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, %zmm26
10452; AVX512DQ-BW-NEXT:    vpermt2w %zmm15, %zmm24, %zmm26
10453; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, %zmm16
10454; AVX512DQ-BW-NEXT:    vpermt2w %zmm15, %zmm19, %zmm7
10455; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42,5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42]
10456; AVX512DQ-BW-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
10457; AVX512DQ-BW-NEXT:    vpermt2w %zmm15, %zmm19, %zmm16
10458; AVX512DQ-BW-NEXT:    vpermi2w %zmm12, %zmm1, %zmm11
10459; AVX512DQ-BW-NEXT:    vpermi2w %zmm12, %zmm1, %zmm8
10460; AVX512DQ-BW-NEXT:    vpermi2w %zmm12, %zmm1, %zmm21
10461; AVX512DQ-BW-NEXT:    vpermi2w %zmm12, %zmm1, %zmm24
10462; AVX512DQ-BW-NEXT:    vpermt2w %zmm12, %zmm19, %zmm1
10463; AVX512DQ-BW-NEXT:    movw $9362, %ax # imm = 0x2492
10464; AVX512DQ-BW-NEXT:    kmovd %eax, %k2
10465; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm13, %zmm2 {%k2}
10466; AVX512DQ-BW-NEXT:    vmovdqa64 (%r8), %zmm12
10467; AVX512DQ-BW-NEXT:    movw $18724, %ax # imm = 0x4924
10468; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
10469; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm20, %zmm4 {%k1}
10470; AVX512DQ-BW-NEXT:    vmovdqa64 64(%r8), %zmm13
10471; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm22, %zmm6 {%k1}
10472; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm15 = [0,1,2,3,32,0,6,7,8,9,33,0,12,13,14,15,34,0,18,19,20,21,35,0,24,25,26,27,36,0,30,31]
10473; AVX512DQ-BW-NEXT:    vpermt2w %zmm12, %zmm15, %zmm2
10474; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm23, %zmm9 {%k2}
10475; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm19 = [58,0,2,3,4,5,59,0,8,9,10,11,60,0,14,15,16,17,61,0,20,21,22,23,62,0,26,27,28,29,63,0]
10476; AVX512DQ-BW-NEXT:    vpermt2w %zmm13, %zmm19, %zmm4
10477; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm26, %zmm14 {%k1}
10478; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm20 = [0,1,53,0,4,5,6,7,54,0,10,11,12,13,55,0,16,17,18,19,56,0,22,23,24,25,57,0,28,29,30,31]
10479; AVX512DQ-BW-NEXT:    vpermt2w %zmm13, %zmm20, %zmm6
10480; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm25, %zmm16 {%k1}
10481; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm22 = [0,1,2,3,48,0,6,7,8,9,49,0,12,13,14,15,50,0,18,19,20,21,51,0,24,25,26,27,52,0,30,31]
10482; AVX512DQ-BW-NEXT:    vpermt2w %zmm13, %zmm22, %zmm9
10483; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm7, %zmm0 {%k2}
10484; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm7 = [42,0,2,3,4,5,43,0,8,9,10,11,44,0,14,15,16,17,45,0,20,21,22,23,46,0,26,27,28,29,47,0]
10485; AVX512DQ-BW-NEXT:    vpermt2w %zmm13, %zmm7, %zmm14
10486; AVX512DQ-BW-NEXT:    vpermt2w %zmm13, %zmm15, %zmm0
10487; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm15 = [0,1,37,0,4,5,6,7,38,0,10,11,12,13,39,0,16,17,18,19,40,0,22,23,24,25,41,0,28,29,30,31]
10488; AVX512DQ-BW-NEXT:    vpermt2w %zmm13, %zmm15, %zmm16
10489; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm11, %zmm5 {%k1}
10490; AVX512DQ-BW-NEXT:    vpermt2w %zmm12, %zmm19, %zmm5
10491; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm18, %zmm8 {%k1}
10492; AVX512DQ-BW-NEXT:    vpermt2w %zmm12, %zmm20, %zmm8
10493; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm21, %zmm10 {%k2}
10494; AVX512DQ-BW-NEXT:    vmovdqa64 (%r9), %zmm11
10495; AVX512DQ-BW-NEXT:    vpermt2w %zmm12, %zmm22, %zmm10
10496; AVX512DQ-BW-NEXT:    vmovdqa64 64(%r9), %zmm13
10497; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm24, %zmm17 {%k1}
10498; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm18 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31]
10499; AVX512DQ-BW-NEXT:    vpermt2w %zmm11, %zmm18, %zmm2
10500; AVX512DQ-BW-NEXT:    vpermt2w %zmm12, %zmm7, %zmm17
10501; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm7 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63]
10502; AVX512DQ-BW-NEXT:    vpermt2w %zmm13, %zmm7, %zmm4
10503; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm3, %zmm1 {%k1}
10504; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm3 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31]
10505; AVX512DQ-BW-NEXT:    vpermt2w %zmm13, %zmm3, %zmm6
10506; AVX512DQ-BW-NEXT:    vpermt2w %zmm12, %zmm15, %zmm1
10507; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm12 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31]
10508; AVX512DQ-BW-NEXT:    vpermt2w %zmm13, %zmm12, %zmm9
10509; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm15 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47]
10510; AVX512DQ-BW-NEXT:    vpermt2w %zmm13, %zmm15, %zmm14
10511; AVX512DQ-BW-NEXT:    vpermt2w %zmm13, %zmm18, %zmm0
10512; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm18 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31]
10513; AVX512DQ-BW-NEXT:    vpermt2w %zmm13, %zmm18, %zmm16
10514; AVX512DQ-BW-NEXT:    vpermt2w %zmm11, %zmm7, %zmm5
10515; AVX512DQ-BW-NEXT:    vpermt2w %zmm11, %zmm3, %zmm8
10516; AVX512DQ-BW-NEXT:    vpermt2w %zmm11, %zmm12, %zmm10
10517; AVX512DQ-BW-NEXT:    vpermt2w %zmm11, %zmm15, %zmm17
10518; AVX512DQ-BW-NEXT:    vpermt2w %zmm11, %zmm18, %zmm1
10519; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
10520; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, 64(%rax)
10521; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm17, 128(%rax)
10522; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm10, 192(%rax)
10523; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm8, 256(%rax)
10524; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, 320(%rax)
10525; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, 384(%rax)
10526; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm16, 448(%rax)
10527; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm14, 512(%rax)
10528; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, 576(%rax)
10529; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, 640(%rax)
10530; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm4, 704(%rax)
10531; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, (%rax)
10532; AVX512DQ-BW-NEXT:    vzeroupper
10533; AVX512DQ-BW-NEXT:    retq
10534;
10535; AVX512DQ-BW-FCP-LABEL: store_i16_stride6_vf64:
10536; AVX512DQ-BW-FCP:       # %bb.0:
10537; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm3
10538; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm0
10539; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rsi), %zmm16
10540; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rsi), %zmm24
10541; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdx), %zmm1
10542; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdx), %zmm7
10543; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rcx), %zmm12
10544; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rcx), %zmm15
10545; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34,0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34]
10546; AVX512DQ-BW-FCP-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
10547; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm13
10548; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm12, %zmm19, %zmm13
10549; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm26 = [0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37,0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37]
10550; AVX512DQ-BW-FCP-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3]
10551; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm2
10552; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm16, %zmm26, %zmm2
10553; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0]
10554; AVX512DQ-BW-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
10555; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm20
10556; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm15, %zmm11, %zmm20
10557; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61]
10558; AVX512DQ-BW-FCP-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
10559; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm4
10560; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm24, %zmm5, %zmm4
10561; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0,24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0]
10562; AVX512DQ-BW-FCP-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
10563; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm22
10564; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm24, %zmm18, %zmm22
10565; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58,21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58]
10566; AVX512DQ-BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
10567; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm6
10568; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm15, %zmm8, %zmm6
10569; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50,0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50]
10570; AVX512DQ-BW-FCP-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
10571; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm23
10572; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm15, %zmm21, %zmm23
10573; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53,16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53]
10574; AVX512DQ-BW-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
10575; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm9
10576; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm24, %zmm10, %zmm9
10577; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45]
10578; AVX512DQ-BW-FCP-NEXT:    # zmm17 = mem[0,1,2,3,0,1,2,3]
10579; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm14
10580; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm24, %zmm17, %zmm14
10581; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm25
10582; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm24, %zmm26, %zmm0
10583; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm26 = [8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0,8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0]
10584; AVX512DQ-BW-FCP-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3]
10585; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm24, %zmm26, %zmm25
10586; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm24 = [13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0]
10587; AVX512DQ-BW-FCP-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3]
10588; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm16, %zmm3, %zmm5
10589; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm16, %zmm3, %zmm18
10590; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm16, %zmm3, %zmm10
10591; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm16, %zmm3, %zmm17
10592; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm16, %zmm26, %zmm3
10593; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm26
10594; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm15, %zmm24, %zmm26
10595; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm16
10596; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm15, %zmm19, %zmm7
10597; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42,5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42]
10598; AVX512DQ-BW-FCP-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
10599; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm15, %zmm19, %zmm16
10600; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm12, %zmm1, %zmm11
10601; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm12, %zmm1, %zmm8
10602; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm12, %zmm1, %zmm21
10603; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm12, %zmm1, %zmm24
10604; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm12, %zmm19, %zmm1
10605; AVX512DQ-BW-FCP-NEXT:    movw $9362, %ax # imm = 0x2492
10606; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k2
10607; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm13, %zmm2 {%k2}
10608; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%r8), %zmm12
10609; AVX512DQ-BW-FCP-NEXT:    movw $18724, %ax # imm = 0x4924
10610; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
10611; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm20, %zmm4 {%k1}
10612; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%r8), %zmm13
10613; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm22, %zmm6 {%k1}
10614; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm15 = [0,1,2,3,32,0,6,7,8,9,33,0,12,13,14,15,34,0,18,19,20,21,35,0,24,25,26,27,36,0,30,31]
10615; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm12, %zmm15, %zmm2
10616; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm23, %zmm9 {%k2}
10617; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm19 = [58,0,2,3,4,5,59,0,8,9,10,11,60,0,14,15,16,17,61,0,20,21,22,23,62,0,26,27,28,29,63,0]
10618; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm13, %zmm19, %zmm4
10619; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm26, %zmm14 {%k1}
10620; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm20 = [0,1,53,0,4,5,6,7,54,0,10,11,12,13,55,0,16,17,18,19,56,0,22,23,24,25,57,0,28,29,30,31]
10621; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm13, %zmm20, %zmm6
10622; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm25, %zmm16 {%k1}
10623; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm22 = [0,1,2,3,48,0,6,7,8,9,49,0,12,13,14,15,50,0,18,19,20,21,51,0,24,25,26,27,52,0,30,31]
10624; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm13, %zmm22, %zmm9
10625; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm7, %zmm0 {%k2}
10626; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm7 = [42,0,2,3,4,5,43,0,8,9,10,11,44,0,14,15,16,17,45,0,20,21,22,23,46,0,26,27,28,29,47,0]
10627; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm13, %zmm7, %zmm14
10628; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm13, %zmm15, %zmm0
10629; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm15 = [0,1,37,0,4,5,6,7,38,0,10,11,12,13,39,0,16,17,18,19,40,0,22,23,24,25,41,0,28,29,30,31]
10630; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm13, %zmm15, %zmm16
10631; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm11, %zmm5 {%k1}
10632; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm12, %zmm19, %zmm5
10633; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm18, %zmm8 {%k1}
10634; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm12, %zmm20, %zmm8
10635; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm21, %zmm10 {%k2}
10636; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%r9), %zmm11
10637; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm12, %zmm22, %zmm10
10638; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%r9), %zmm13
10639; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm24, %zmm17 {%k1}
10640; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm18 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31]
10641; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm11, %zmm18, %zmm2
10642; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm12, %zmm7, %zmm17
10643; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm7 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63]
10644; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm13, %zmm7, %zmm4
10645; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm3, %zmm1 {%k1}
10646; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm3 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31]
10647; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm13, %zmm3, %zmm6
10648; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm12, %zmm15, %zmm1
10649; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm12 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31]
10650; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm13, %zmm12, %zmm9
10651; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm15 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47]
10652; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm13, %zmm15, %zmm14
10653; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm13, %zmm18, %zmm0
10654; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm18 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31]
10655; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm13, %zmm18, %zmm16
10656; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm11, %zmm7, %zmm5
10657; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm11, %zmm3, %zmm8
10658; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm11, %zmm12, %zmm10
10659; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm11, %zmm15, %zmm17
10660; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm11, %zmm18, %zmm1
10661; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
10662; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, 64(%rax)
10663; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, 128(%rax)
10664; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, 192(%rax)
10665; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, 256(%rax)
10666; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, 320(%rax)
10667; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, 384(%rax)
10668; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm16, 448(%rax)
10669; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, 512(%rax)
10670; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, 576(%rax)
10671; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, 640(%rax)
10672; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, 704(%rax)
10673; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, (%rax)
10674; AVX512DQ-BW-FCP-NEXT:    vzeroupper
10675; AVX512DQ-BW-FCP-NEXT:    retq
10676  %in.vec0 = load <64 x i16>, ptr %in.vecptr0, align 64
10677  %in.vec1 = load <64 x i16>, ptr %in.vecptr1, align 64
10678  %in.vec2 = load <64 x i16>, ptr %in.vecptr2, align 64
10679  %in.vec3 = load <64 x i16>, ptr %in.vecptr3, align 64
10680  %in.vec4 = load <64 x i16>, ptr %in.vecptr4, align 64
10681  %in.vec5 = load <64 x i16>, ptr %in.vecptr5, align 64
10682  %1 = shufflevector <64 x i16> %in.vec0, <64 x i16> %in.vec1, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
10683  %2 = shufflevector <64 x i16> %in.vec2, <64 x i16> %in.vec3, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
10684  %3 = shufflevector <64 x i16> %in.vec4, <64 x i16> %in.vec5, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
10685  %4 = shufflevector <128 x i16> %1, <128 x i16> %2, <256 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255>
10686  %5 = shufflevector <128 x i16> %3, <128 x i16> poison, <256 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
10687  %6 = shufflevector <256 x i16> %4, <256 x i16> %5, <384 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255, i32 256, i32 257, i32 258, i32 259, i32 260, i32 261, i32 262, i32 263, i32 264, i32 265, i32 266, i32 267, i32 268, i32 269, i32 270, i32 271, i32 272, i32 273, i32 274, i32 275, i32 276, i32 277, i32 278, i32 279, i32 280, i32 281, i32 282, i32 283, i32 284, i32 285, i32 286, i32 287, i32 288, i32 289, i32 290, i32 291, i32 292, i32 293, i32 294, i32 295, i32 296, i32 297, i32 298, i32 299, i32 300, i32 301, i32 302, i32 303, i32 304, i32 305, i32 306, i32 307, i32 308, i32 309, i32 310, i32 311, i32 312, i32 313, i32 314, i32 315, i32 316, i32 317, i32 318, i32 319, i32 320, i32 321, i32 322, i32 323, i32 324, i32 325, i32 326, i32 327, i32 328, i32 329, i32 330, i32 331, i32 332, i32 333, i32 334, i32 335, i32 336, i32 337, i32 338, i32 339, i32 340, i32 341, i32 342, i32 343, i32 344, i32 345, i32 346, i32 347, i32 348, i32 349, i32 350, i32 351, i32 352, i32 353, i32 354, i32 355, i32 356, i32 357, i32 358, i32 359, i32 360, i32 361, i32 362, i32 363, i32 364, i32 365, i32 366, i32 367, i32 368, i32 369, i32 370, i32 371, i32 372, i32 373, i32 374, i32 375, i32 376, i32 377, i32 378, i32 379, i32 380, i32 381, i32 382, i32 383>
10688  %interleaved.vec = shufflevector <384 x i16> %6, <384 x i16> poison, <384 x i32> <i32 0, i32 64, i32 128, i32 192, i32 256, i32 320, i32 1, i32 65, i32 129, i32 193, i32 257, i32 321, i32 2, i32 66, i32 130, i32 194, i32 258, i32 322, i32 3, i32 67, i32 131, i32 195, i32 259, i32 323, i32 4, i32 68, i32 132, i32 196, i32 260, i32 324, i32 5, i32 69, i32 133, i32 197, i32 261, i32 325, i32 6, i32 70, i32 134, i32 198, i32 262, i32 326, i32 7, i32 71, i32 135, i32 199, i32 263, i32 327, i32 8, i32 72, i32 136, i32 200, i32 264, i32 328, i32 9, i32 73, i32 137, i32 201, i32 265, i32 329, i32 10, i32 74, i32 138, i32 202, i32 266, i32 330, i32 11, i32 75, i32 139, i32 203, i32 267, i32 331, i32 12, i32 76, i32 140, i32 204, i32 268, i32 332, i32 13, i32 77, i32 141, i32 205, i32 269, i32 333, i32 14, i32 78, i32 142, i32 206, i32 270, i32 334, i32 15, i32 79, i32 143, i32 207, i32 271, i32 335, i32 16, i32 80, i32 144, i32 208, i32 272, i32 336, i32 17, i32 81, i32 145, i32 209, i32 273, i32 337, i32 18, i32 82, i32 146, i32 210, i32 274, i32 338, i32 19, i32 83, i32 147, i32 211, i32 275, i32 339, i32 20, i32 84, i32 148, i32 212, i32 276, i32 340, i32 21, i32 85, i32 149, i32 213, i32 277, i32 341, i32 22, i32 86, i32 150, i32 214, i32 278, i32 342, i32 23, i32 87, i32 151, i32 215, i32 279, i32 343, i32 24, i32 88, i32 152, i32 216, i32 280, i32 344, i32 25, i32 89, i32 153, i32 217, i32 281, i32 345, i32 26, i32 90, i32 154, i32 218, i32 282, i32 346, i32 27, i32 91, i32 155, i32 219, i32 283, i32 347, i32 28, i32 92, i32 156, i32 220, i32 284, i32 348, i32 29, i32 93, i32 157, i32 221, i32 285, i32 349, i32 30, i32 94, i32 158, i32 222, i32 286, i32 350, i32 31, i32 95, i32 159, i32 223, i32 287, i32 351, i32 32, i32 96, i32 160, i32 224, i32 288, i32 352, i32 33, i32 97, i32 161, i32 225, i32 289, i32 353, i32 34, i32 98, i32 162, i32 226, i32 290, i32 354, i32 35, i32 99, i32 163, i32 227, i32 291, i32 355, i32 36, i32 100, i32 164, i32 228, i32 292, i32 356, i32 37, i32 101, i32 165, i32 229, i32 293, i32 357, i32 38, i32 102, i32 166, i32 230, i32 294, i32 358, i32 39, i32 103, i32 167, i32 231, i32 295, i32 359, i32 40, i32 104, i32 168, i32 232, i32 296, i32 360, i32 41, i32 105, i32 169, i32 233, i32 297, i32 361, i32 42, i32 106, i32 170, i32 234, i32 298, i32 362, i32 43, i32 107, i32 171, i32 235, i32 299, i32 363, i32 44, i32 108, i32 172, i32 236, i32 300, i32 364, i32 45, i32 109, i32 173, i32 237, i32 301, i32 365, i32 46, i32 110, i32 174, i32 238, i32 302, i32 366, i32 47, i32 111, i32 175, i32 239, i32 303, i32 367, i32 48, i32 112, i32 176, i32 240, i32 304, i32 368, i32 49, i32 113, i32 177, i32 241, i32 305, i32 369, i32 50, i32 114, i32 178, i32 242, i32 306, i32 370, i32 51, i32 115, i32 179, i32 243, i32 307, i32 371, i32 52, i32 116, i32 180, i32 244, i32 308, i32 372, i32 53, i32 117, i32 181, i32 245, i32 309, i32 373, i32 54, i32 118, i32 182, i32 246, i32 310, i32 374, i32 55, i32 119, i32 183, i32 247, i32 311, i32 375, i32 56, i32 120, i32 184, i32 248, i32 312, i32 376, i32 57, i32 121, i32 185, i32 249, i32 313, i32 377, i32 58, i32 122, i32 186, i32 250, i32 314, i32 378, i32 59, i32 123, i32 187, i32 251, i32 315, i32 379, i32 60, i32 124, i32 188, i32 252, i32 316, i32 380, i32 61, i32 125, i32 189, i32 253, i32 317, i32 381, i32 62, i32 126, i32 190, i32 254, i32 318, i32 382, i32 63, i32 127, i32 191, i32 255, i32 319, i32 383>
10689  store <384 x i16> %interleaved.vec, ptr %out.vec, align 64
10690  ret void
10691}
10692