; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx  | FileCheck %s --check-prefixes=AVX
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP

; These patterns are produced by LoopVectorizer for interleaved stores.
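;
; As an illustrative sketch only (not part of the checked output), a scalar
; loop of roughly this shape, with six input streams and one stride-6 output:
;
;   for (i = 0; i < n; ++i) {
;     out[6*i+0] = a[i]; out[6*i+1] = b[i]; out[6*i+2] = c[i];
;     out[6*i+3] = d[i]; out[6*i+4] = e[i]; out[6*i+5] = f[i];
;   }
;
; is vectorized into six wide loads that are concatenated and interleaved by
; shufflevector instructions into a single wide store -- the IR shape of the
; store_i32_stride6_vf* functions below.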
define void @store_i32_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind {
; SSE-LABEL: store_i32_stride6_vf2:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
; SSE-NEXT:    movsd {{.*#+}} xmm3 = mem[0],zero
; SSE-NEXT:    movaps %xmm0, %xmm4
; SSE-NEXT:    movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0]
; SSE-NEXT:    movaps %xmm2, %xmm5
; SSE-NEXT:    movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0]
; SSE-NEXT:    movsd {{.*#+}} xmm6 = mem[0],zero
; SSE-NEXT:    movsd {{.*#+}} xmm7 = mem[0],zero
; SSE-NEXT:    movlhps {{.*#+}} xmm7 = xmm7[0],xmm6[0]
; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[1,3],xmm7[1,3]
; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,2],xmm4[1,3]
; SSE-NEXT:    movaps %xmm5, 32(%rax)
; SSE-NEXT:    movaps %xmm7, 16(%rax)
; SSE-NEXT:    movaps %xmm0, (%rax)
; SSE-NEXT:    retq
;
; AVX-LABEL: store_i32_stride6_vf2:
; AVX:       # %bb.0:
; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
; AVX-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
; AVX-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm3[0]
; AVX-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX-NEXT:    vmovsd {{.*#+}} xmm5 = mem[0],zero
; AVX-NEXT:    vmovlhps {{.*#+}} xmm4 = xmm5[0],xmm4[0]
; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX-NEXT:    vpermilps {{.*#+}} ymm2 = ymm2[u,u,0,2,u,u,5,7]
; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm4[0,2,2,3]
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm3[1,3],xmm4[1,3]
; AVX-NEXT:    vmovaps %xmm1, 32(%rax)
; AVX-NEXT:    vmovaps %ymm0, (%rax)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX2-LABEL: store_i32_stride6_vf2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX2-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX2-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
; AVX2-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1]
; AVX2-NEXT:    vshufps {{.*#+}} xmm3 = xmm2[0,2,2,3]
; AVX2-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7]
; AVX2-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
; AVX2-NEXT:    vmovaps %xmm1, 32(%rax)
; AVX2-NEXT:    vmovaps %ymm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX2-FP-LABEL: store_i32_stride6_vf2:
; AVX2-FP:       # %bb.0:
; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX2-FP-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX2-FP-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-FP-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX2-FP-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX2-FP-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-FP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FP-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX2-FP-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
; AVX2-FP-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1]
; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm3 = xmm2[0,2,2,3]
; AVX2-FP-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
; AVX2-FP-NEXT:    vmovaps %xmm1, 32(%rax)
; AVX2-FP-NEXT:    vmovaps %ymm0, (%rax)
; AVX2-FP-NEXT:    vzeroupper
; AVX2-FP-NEXT:    retq
;
; AVX2-FCP-LABEL: store_i32_stride6_vf2:
; AVX2-FCP:       # %bb.0:
; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX2-FCP-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX2-FCP-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-FCP-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX2-FCP-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX2-FCP-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FCP-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX2-FCP-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX2-FCP-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm1[0]
; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm3 = ymm0[1,3],ymm3[1,3],ymm0[5,7],ymm3[5,7]
; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[2,1,2,3]
; AVX2-FCP-NEXT:    vmovaps {{.*#+}} ymm4 = [0,2,4,6,u,u,1,3]
; AVX2-FCP-NEXT:    vpermps %ymm0, %ymm4, %ymm0
; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-FCP-NEXT:    vmovaps %ymm0, (%rax)
; AVX2-FCP-NEXT:    vmovaps %xmm3, 32(%rax)
; AVX2-FCP-NEXT:    vzeroupper
; AVX2-FCP-NEXT:    retq
;
; AVX512-LABEL: store_i32_stride6_vf2:
; AVX512:       # %bb.0:
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX512-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX512-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX512-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
; AVX512-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX512-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512-NEXT:    vinsertf32x4 $2, %xmm2, %zmm0, %zmm0
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0]
; AVX512-NEXT:    vpermps %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    vextractf32x4 $2, %zmm0, 32(%rax)
; AVX512-NEXT:    vmovaps %ymm0, (%rax)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512-FCP-LABEL: store_i32_stride6_vf2:
; AVX512-FCP:       # %bb.0:
; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX512-FCP-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512-FCP-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512-FCP-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512-FCP-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX512-FCP-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512-FCP-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX512-FCP-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
; AVX512-FCP-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX512-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512-FCP-NEXT:    vinsertf32x4 $2, %xmm2, %zmm0, %zmm0
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0]
; AVX512-FCP-NEXT:    vpermps %zmm0, %zmm1, %zmm0
; AVX512-FCP-NEXT:    vextractf32x4 $2, %zmm0, 32(%rax)
; AVX512-FCP-NEXT:    vmovaps %ymm0, (%rax)
; AVX512-FCP-NEXT:    vzeroupper
; AVX512-FCP-NEXT:    retq
;
; AVX512DQ-LABEL: store_i32_stride6_vf2:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX512DQ-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512DQ-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512DQ-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
; AVX512DQ-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX512DQ-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinsertf32x4 $2, %xmm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0]
; AVX512DQ-NEXT:    vpermps %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT:    vextractf32x4 $2, %zmm0, 32(%rax)
; AVX512DQ-NEXT:    vmovaps %ymm0, (%rax)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512DQ-FCP-LABEL: store_i32_stride6_vf2:
; AVX512DQ-FCP:       # %bb.0:
; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX512DQ-FCP-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-FCP-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512DQ-FCP-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-FCP-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-FCP-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512DQ-FCP-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-FCP-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
; AVX512DQ-FCP-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX512DQ-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512DQ-FCP-NEXT:    vinsertf32x4 $2, %xmm2, %zmm0, %zmm0
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0]
; AVX512DQ-FCP-NEXT:    vpermps %zmm0, %zmm1, %zmm0
; AVX512DQ-FCP-NEXT:    vextractf32x4 $2, %zmm0, 32(%rax)
; AVX512DQ-FCP-NEXT:    vmovaps %ymm0, (%rax)
; AVX512DQ-FCP-NEXT:    vzeroupper
; AVX512DQ-FCP-NEXT:    retq
;
; AVX512BW-LABEL: store_i32_stride6_vf2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX512BW-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512BW-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512BW-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512BW-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX512BW-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512BW-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX512BW-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
; AVX512BW-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX512BW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512BW-NEXT:    vinsertf32x4 $2, %xmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0]
; AVX512BW-NEXT:    vpermps %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vextractf32x4 $2, %zmm0, 32(%rax)
; AVX512BW-NEXT:    vmovaps %ymm0, (%rax)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BW-FCP-LABEL: store_i32_stride6_vf2:
; AVX512BW-FCP:       # %bb.0:
; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX512BW-FCP-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512BW-FCP-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512BW-FCP-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512BW-FCP-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX512BW-FCP-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512BW-FCP-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX512BW-FCP-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
; AVX512BW-FCP-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX512BW-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512BW-FCP-NEXT:    vinsertf32x4 $2, %xmm2, %zmm0, %zmm0
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0]
; AVX512BW-FCP-NEXT:    vpermps %zmm0, %zmm1, %zmm0
; AVX512BW-FCP-NEXT:    vextractf32x4 $2, %zmm0, 32(%rax)
; AVX512BW-FCP-NEXT:    vmovaps %ymm0, (%rax)
; AVX512BW-FCP-NEXT:    vzeroupper
; AVX512BW-FCP-NEXT:    retq
;
; AVX512DQ-BW-LABEL: store_i32_stride6_vf2:
; AVX512DQ-BW:       # %bb.0:
; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX512DQ-BW-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-BW-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512DQ-BW-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-BW-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-BW-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512DQ-BW-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-BW-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
; AVX512DQ-BW-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX512DQ-BW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512DQ-BW-NEXT:    vinsertf32x4 $2, %xmm2, %zmm0, %zmm0
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0]
; AVX512DQ-BW-NEXT:    vpermps %zmm0, %zmm1, %zmm0
; AVX512DQ-BW-NEXT:    vextractf32x4 $2, %zmm0, 32(%rax)
; AVX512DQ-BW-NEXT:    vmovaps %ymm0, (%rax)
; AVX512DQ-BW-NEXT:    vzeroupper
; AVX512DQ-BW-NEXT:    retq
;
; AVX512DQ-BW-FCP-LABEL: store_i32_stride6_vf2:
; AVX512DQ-BW-FCP:       # %bb.0:
; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512DQ-BW-FCP-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512DQ-BW-FCP-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX512DQ-BW-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512DQ-BW-FCP-NEXT:    vinsertf32x4 $2, %xmm2, %zmm0, %zmm0
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT:    vpermps %zmm0, %zmm1, %zmm0
; AVX512DQ-BW-FCP-NEXT:    vextractf32x4 $2, %zmm0, 32(%rax)
; AVX512DQ-BW-FCP-NEXT:    vmovaps %ymm0, (%rax)
; AVX512DQ-BW-FCP-NEXT:    vzeroupper
; AVX512DQ-BW-FCP-NEXT:    retq
  %in.vec0 = load <2 x i32>, ptr %in.vecptr0, align 64
  %in.vec1 = load <2 x i32>, ptr %in.vecptr1, align 64
  %in.vec2 = load <2 x i32>, ptr %in.vecptr2, align 64
  %in.vec3 = load <2 x i32>, ptr %in.vecptr3, align 64
  %in.vec4 = load <2 x i32>, ptr %in.vecptr4, align 64
  %in.vec5 = load <2 x i32>, ptr %in.vecptr5, align 64
  %1 = shufflevector <2 x i32> %in.vec0, <2 x i32> %in.vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = shufflevector <2 x i32> %in.vec2, <2 x i32> %in.vec3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = shufflevector <2 x i32> %in.vec4, <2 x i32> %in.vec5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = shufflevector <4 x i32> %1, <4 x i32> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %5 = shufflevector <4 x i32> %3, <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %6 = shufflevector <8 x i32> %4, <8 x i32> %5, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %interleaved.vec = shufflevector <12 x i32> %6, <12 x i32> poison, <12 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11>
  store <12 x i32> %interleaved.vec, ptr %out.vec, align 64
  ret void
}

define void @store_i32_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind {
; SSE-LABEL: store_i32_stride6_vf4:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT:    movaps (%rdi), %xmm0
; SSE-NEXT:    movaps (%rsi), %xmm4
; SSE-NEXT:    movaps (%rdx), %xmm1
; SSE-NEXT:    movaps (%rcx), %xmm5
; SSE-NEXT:    movaps (%r8), %xmm7
; SSE-NEXT:    movaps (%r9), %xmm3
; SSE-NEXT:    movaps %xmm1, %xmm2
; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
; SSE-NEXT:    movaps %xmm7, %xmm6
; SSE-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1]
; SSE-NEXT:    movaps %xmm7, %xmm8
; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[3,3],xmm3[3,3]
; SSE-NEXT:    movaps %xmm7, %xmm9
; SSE-NEXT:    shufps {{.*#+}} xmm9 = xmm9[1,1],xmm3[1,1]
; SSE-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm7[0]
; SSE-NEXT:    movaps %xmm0, %xmm7
; SSE-NEXT:    unpcklps {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm7[2,3]
; SSE-NEXT:    movlhps {{.*#+}} xmm7 = xmm7[0],xmm2[0]
; SSE-NEXT:    unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[0,2],xmm0[2,3]
; SSE-NEXT:    unpckhps {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,3],xmm8[0,2]
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,3],xmm9[0,2]
; SSE-NEXT:    movaps %xmm3, 16(%rax)
; SSE-NEXT:    movaps %xmm2, 32(%rax)
; SSE-NEXT:    movaps %xmm0, 48(%rax)
; SSE-NEXT:    movaps %xmm1, 80(%rax)
; SSE-NEXT:    movaps %xmm6, 64(%rax)
; SSE-NEXT:    movaps %xmm7, (%rax)
; SSE-NEXT:    retq
;
; AVX-LABEL: store_i32_stride6_vf4:
; AVX:       # %bb.0:
; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT:    vmovaps (%rdi), %xmm0
; AVX-NEXT:    vmovaps (%rsi), %xmm2
; AVX-NEXT:    vmovaps (%rdx), %xmm1
; AVX-NEXT:    vmovaps (%rcx), %xmm3
; AVX-NEXT:    vmovaps (%r8), %xmm4
; AVX-NEXT:    vmovaps (%r9), %xmm5
; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm6
; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm7
; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm4, %ymm8
; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm5, %ymm9
; AVX-NEXT:    vunpcklps {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5]
; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm11
; AVX-NEXT:    vunpcklpd {{.*#+}} ymm12 = ymm6[0],ymm11[0],ymm6[2],ymm11[2]
; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm12[0,2,3,1,4,6,7,5]
; AVX-NEXT:    vshufps {{.*#+}} xmm13 = xmm3[0,0],xmm1[0,0]
; AVX-NEXT:    vshufps {{.*#+}} xmm13 = xmm13[0,1,2,0]
; AVX-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5],ymm12[6,7]
; AVX-NEXT:    vunpckhps {{.*#+}} ymm6 = ymm11[2],ymm6[2],ymm11[3],ymm6[3],ymm11[6],ymm6[6],ymm11[7],ymm6[7]
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm11
; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[1,2],ymm11[1,2],ymm7[5,6],ymm11[5,6]
; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5]
; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7]
; AVX-NEXT:    vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5,6,7]
; AVX-NEXT:    vunpckhpd {{.*#+}} ymm5 = ymm8[1],ymm9[1],ymm8[3],ymm9[3]
; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[3,3],ymm2[3,3],ymm0[7,7],ymm2[7,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6],ymm5[7]
; AVX-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,7,5]
; AVX-NEXT:    vmovaps %ymm0, 64(%rax)
; AVX-NEXT:    vmovaps %ymm4, 32(%rax)
; AVX-NEXT:    vmovaps %ymm10, (%rax)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX2-LABEL: store_i32_stride6_vf4:
; AVX2:       # %bb.0:
; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT:    vmovaps (%rdi), %xmm0
; AVX2-NEXT:    vmovaps (%rsi), %xmm1
; AVX2-NEXT:    vmovaps (%rdx), %xmm2
; AVX2-NEXT:    vmovaps (%rcx), %xmm3
; AVX2-NEXT:    vmovaps (%r8), %xmm4
; AVX2-NEXT:    vmovaps (%r9), %xmm5
; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm6
; AVX2-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm7
; AVX2-NEXT:    vinsertf128 $1, %xmm5, %ymm4, %ymm8
; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm9 = [0,4,0,4,0,4,0,4]
; AVX2-NEXT:    vpermps %ymm7, %ymm9, %ymm10
; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm11 = [0,4,1,5,0,4,1,5]
; AVX2-NEXT:    # ymm11 = mem[0,1,0,1]
; AVX2-NEXT:    vpermps %ymm6, %ymm11, %ymm11
; AVX2-NEXT:    vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3],ymm11[4,5,6,7]
; AVX2-NEXT:    vpermps %ymm8, %ymm9, %ymm9
; AVX2-NEXT:    vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5],ymm10[6,7]
; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm10 = [2,6,2,6,2,6,2,6]
; AVX2-NEXT:    vpermps %ymm6, %ymm10, %ymm6
; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm10 = [1,5,2,6,1,5,2,6]
; AVX2-NEXT:    # ymm10 = mem[0,1,0,1]
; AVX2-NEXT:    vpermps %ymm7, %ymm10, %ymm7
; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7]
; AVX2-NEXT:    vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5,6,7]
; AVX2-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [2,6,3,7,2,6,3,7]
; AVX2-NEXT:    # ymm1 = mem[0,1,0,1]
; AVX2-NEXT:    vpermps %ymm8, %ymm1, %ymm1
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX2-NEXT:    vmovaps %ymm0, 64(%rax)
; AVX2-NEXT:    vmovaps %ymm4, 32(%rax)
; AVX2-NEXT:    vmovaps %ymm9, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX2-FP-LABEL: store_i32_stride6_vf4:
; AVX2-FP:       # %bb.0:
; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT:    vmovaps (%rdi), %xmm0
; AVX2-FP-NEXT:    vmovaps (%rsi), %xmm1
; AVX2-FP-NEXT:    vmovaps (%rdx), %xmm2
; AVX2-FP-NEXT:    vmovaps (%rcx), %xmm3
; AVX2-FP-NEXT:    vmovaps (%r8), %xmm4
; AVX2-FP-NEXT:    vmovaps (%r9), %xmm5
; AVX2-FP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm6
; AVX2-FP-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm7
; AVX2-FP-NEXT:    vinsertf128 $1, %xmm5, %ymm4, %ymm8
; AVX2-FP-NEXT:    vbroadcastsd {{.*#+}} ymm9 = [0,4,0,4,0,4,0,4]
; AVX2-FP-NEXT:    vpermps %ymm7, %ymm9, %ymm10
; AVX2-FP-NEXT:    vbroadcastf128 {{.*#+}} ymm11 = [0,4,1,5,0,4,1,5]
; AVX2-FP-NEXT:    # ymm11 = mem[0,1,0,1]
; AVX2-FP-NEXT:    vpermps %ymm6, %ymm11, %ymm11
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3],ymm11[4,5,6,7]
; AVX2-FP-NEXT:    vpermps %ymm8, %ymm9, %ymm9
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5],ymm10[6,7]
; AVX2-FP-NEXT:    vbroadcastsd {{.*#+}} ymm10 = [2,6,2,6,2,6,2,6]
; AVX2-FP-NEXT:    vpermps %ymm6, %ymm10, %ymm6
; AVX2-FP-NEXT:    vbroadcastf128 {{.*#+}} ymm10 = [1,5,2,6,1,5,2,6]
; AVX2-FP-NEXT:    # ymm10 = mem[0,1,0,1]
; AVX2-FP-NEXT:    vpermps %ymm7, %ymm10, %ymm7
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7]
; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5,6,7]
; AVX2-FP-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX2-FP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX2-FP-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-FP-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [2,6,3,7,2,6,3,7]
; AVX2-FP-NEXT:    # ymm1 = mem[0,1,0,1]
; AVX2-FP-NEXT:    vpermps %ymm8, %ymm1, %ymm1
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT:    vmovaps %ymm0, 64(%rax)
; AVX2-FP-NEXT:    vmovaps %ymm4, 32(%rax)
; AVX2-FP-NEXT:    vmovaps %ymm9, (%rax)
; AVX2-FP-NEXT:    vzeroupper
; AVX2-FP-NEXT:    retq
;
; AVX2-FCP-LABEL: store_i32_stride6_vf4:
; AVX2-FCP:       # %bb.0:
; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT:    vmovaps (%rdi), %xmm0
; AVX2-FCP-NEXT:    vmovaps (%rsi), %xmm1
; AVX2-FCP-NEXT:    vmovaps (%rdx), %xmm2
; AVX2-FCP-NEXT:    vmovaps (%rcx), %xmm3
; AVX2-FCP-NEXT:    vmovaps (%r8), %xmm4
; AVX2-FCP-NEXT:    vmovaps (%r9), %xmm5
; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm6
; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm7
; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm5, %ymm4, %ymm8
; AVX2-FCP-NEXT:    vbroadcastsd {{.*#+}} ymm9 = [0,4,0,4,0,4,0,4]
; AVX2-FCP-NEXT:    vpermps %ymm7, %ymm9, %ymm10
; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm11 = [0,4,1,5,0,4,1,5]
; AVX2-FCP-NEXT:    # ymm11 = mem[0,1,0,1]
; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm11, %ymm11
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3],ymm11[4,5,6,7]
; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm9, %ymm9
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5],ymm10[6,7]
; AVX2-FCP-NEXT:    vbroadcastsd {{.*#+}} ymm10 = [2,6,2,6,2,6,2,6]
; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm10, %ymm6
; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm10 = [1,5,2,6,1,5,2,6]
; AVX2-FCP-NEXT:    # ymm10 = mem[0,1,0,1]
; AVX2-FCP-NEXT:    vpermps %ymm7, %ymm10, %ymm7
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7]
; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5,6,7]
; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX2-FCP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX2-FCP-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-FCP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-FCP-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [2,6,3,7,2,6,3,7]
; AVX2-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
; AVX2-FCP-NEXT:    vpermps %ymm8, %ymm1, %ymm1
; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT:    vmovaps %ymm0, 64(%rax)
; AVX2-FCP-NEXT:    vmovaps %ymm4, 32(%rax)
; AVX2-FCP-NEXT:    vmovaps %ymm9, (%rax)
; AVX2-FCP-NEXT:    vzeroupper
; AVX2-FCP-NEXT:    retq
;
; AVX512-LABEL: store_i32_stride6_vf4:
; AVX512:       # %bb.0:
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa (%rdx), %xmm1
; AVX512-NEXT:    vmovdqa (%r8), %xmm2
; AVX512-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT:    vinserti32x4 $1, (%r9), %zmm2, %zmm1
; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [18,22,3,7,11,15,19,23]
; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14]
; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512-NEXT:    vmovdqa64 %zmm3, (%rax)
; AVX512-NEXT:    vmovdqa %ymm2, 64(%rax)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512-FCP-LABEL: store_i32_stride6_vf4:
; AVX512-FCP:       # %bb.0:
; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm1
; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm2
; AVX512-FCP-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512-FCP-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-FCP-NEXT:    vinserti32x4 $1, (%r9), %zmm2, %zmm1
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [18,22,3,7,11,15,19,23]
; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14]
; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, (%rax)
; AVX512-FCP-NEXT:    vmovdqa %ymm2, 64(%rax)
; AVX512-FCP-NEXT:    vzeroupper
; AVX512-FCP-NEXT:    retq
;
; AVX512DQ-LABEL: store_i32_stride6_vf4:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm1
; AVX512DQ-NEXT:    vmovdqa (%r8), %xmm2
; AVX512DQ-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512DQ-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vinserti32x4 $1, (%r9), %zmm2, %zmm1
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [18,22,3,7,11,15,19,23]
; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14]
; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512DQ-NEXT:    vmovdqa64 %zmm3, (%rax)
; AVX512DQ-NEXT:    vmovdqa %ymm2, 64(%rax)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512DQ-FCP-LABEL: store_i32_stride6_vf4:
; AVX512DQ-FCP:       # %bb.0:
; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm1
; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm2
; AVX512DQ-FCP-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512DQ-FCP-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-FCP-NEXT:    vinserti32x4 $1, (%r9), %zmm2, %zmm1
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [18,22,3,7,11,15,19,23]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, (%rax)
; AVX512DQ-FCP-NEXT:    vmovdqa %ymm2, 64(%rax)
; AVX512DQ-FCP-NEXT:    vzeroupper
; AVX512DQ-FCP-NEXT:    retq
;
; AVX512BW-LABEL: store_i32_stride6_vf4:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vmovdqa (%rdx), %xmm1
; AVX512BW-NEXT:    vmovdqa (%r8), %xmm2
; AVX512BW-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512BW-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vinserti32x4 $1, (%r9), %zmm2, %zmm1
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [18,22,3,7,11,15,19,23]
; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14]
; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512BW-NEXT:    vmovdqa64 %zmm3, (%rax)
; AVX512BW-NEXT:    vmovdqa %ymm2, 64(%rax)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BW-FCP-LABEL: store_i32_stride6_vf4:
; AVX512BW-FCP:       # %bb.0:
; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-FCP-NEXT:    vmovdqa (%rdx), %xmm1
; AVX512BW-FCP-NEXT:    vmovdqa (%r8), %xmm2
; AVX512BW-FCP-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512BW-FCP-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-FCP-NEXT:    vinserti32x4 $1, (%r9), %zmm2, %zmm1
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [18,22,3,7,11,15,19,23]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, (%rax)
; AVX512BW-FCP-NEXT:    vmovdqa %ymm2, 64(%rax)
; AVX512BW-FCP-NEXT:    vzeroupper
; AVX512BW-FCP-NEXT:    retq
;
; AVX512DQ-BW-LABEL: store_i32_stride6_vf4:
; AVX512DQ-BW:       # %bb.0:
; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-NEXT:    vmovdqa (%rdx), %xmm1
; AVX512DQ-BW-NEXT:    vmovdqa (%r8), %xmm2
; AVX512DQ-BW-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512DQ-BW-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-BW-NEXT:    vinserti32x4 $1, (%r9), %zmm2, %zmm1
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [18,22,3,7,11,15,19,23]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, (%rax)
; AVX512DQ-BW-NEXT:    vmovdqa %ymm2, 64(%rax)
; AVX512DQ-BW-NEXT:    vzeroupper
; AVX512DQ-BW-NEXT:    retq
;
; AVX512DQ-BW-FCP-LABEL: store_i32_stride6_vf4:
; AVX512DQ-BW-FCP:       # %bb.0:
; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdx), %xmm1
; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%r8), %xmm2
; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, (%r9), %zmm2, %zmm1
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [18,22,3,7,11,15,19,23]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, (%rax)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm2, 64(%rax)
; AVX512DQ-BW-FCP-NEXT:    vzeroupper
; AVX512DQ-BW-FCP-NEXT:    retq
  %in.vec0 = load <4 x i32>, ptr %in.vecptr0, align 64
  %in.vec1 = load <4 x i32>, ptr %in.vecptr1, align 64
  %in.vec2 = load <4 x i32>, ptr %in.vecptr2, align 64
  %in.vec3 = load <4 x i32>, ptr %in.vecptr3, align 64
  %in.vec4 = load <4 x i32>, ptr %in.vecptr4, align 64
  %in.vec5 = load <4 x i32>, ptr %in.vecptr5, align 64
  %1 = shufflevector <4 x i32> %in.vec0, <4 x i32> %in.vec1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %2 = shufflevector <4 x i32> %in.vec2, <4 x i32> %in.vec3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %3 = shufflevector <4 x i32> %in.vec4, <4 x i32> %in.vec5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %4 = shufflevector <8 x i32> %1, <8 x i32> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %5 = shufflevector <8 x i32> %3, <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %6 = shufflevector <16 x i32> %4, <16 x i32> %5, <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  %interleaved.vec = shufflevector <24 x i32> %6, <24 x i32> poison, <24 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 3, i32 7, i32 11, i32 15, i32 19, i32 23>
  store <24 x i32> %interleaved.vec, ptr %out.vec, align 64
  ret void
}

define void @store_i32_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind {
; SSE-LABEL: store_i32_stride6_vf8:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps (%rdi), %xmm4
; SSE-NEXT:    movaps 16(%rdi), %xmm1
; SSE-NEXT:    movaps (%rsi), %xmm0
; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps 16(%rsi), %xmm10
; SSE-NEXT:    movaps (%rdx), %xmm8
; SSE-NEXT:    movaps 16(%rdx), %xmm2
; SSE-NEXT:    movaps (%rcx), %xmm6
; SSE-NEXT:    movaps 16(%rcx), %xmm9
; SSE-NEXT:    movaps (%r8), %xmm5
; SSE-NEXT:    movaps 16(%r8), %xmm11
; SSE-NEXT:    movaps (%r9), %xmm7
; SSE-NEXT:    movaps 16(%r9), %xmm3
; SSE-NEXT:    movaps %xmm9, %xmm14
; SSE-NEXT:    unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm2[1]
; SSE-NEXT:    movaps %xmm1, %xmm12
; SSE-NEXT:    unpckhps {{.*#+}} xmm12 = xmm12[2],xmm10[2],xmm12[3],xmm10[3]
; SSE-NEXT:    movaps %xmm11, %xmm13
; SSE-NEXT:    unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm3[1]
; SSE-NEXT:    shufps {{.*#+}} xmm13 = xmm13[0,2],xmm12[2,3]
; SSE-NEXT:    shufps {{.*#+}} xmm12 = xmm12[0,1],xmm14[2,0]
; SSE-NEXT:    movaps %xmm11, %xmm14
; SSE-NEXT:    shufps {{.*#+}} xmm14 = xmm14[1,1],xmm3[1,1]
; SSE-NEXT:    movaps %xmm2, %xmm15
; SSE-NEXT:    unpcklps {{.*#+}} xmm15 = xmm15[0],xmm9[0],xmm15[1],xmm9[1]
; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1]
; SSE-NEXT:    movaps %xmm11, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3]
; SSE-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm11[0]
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3]
; SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm15[0]
; SSE-NEXT:    movaps %xmm15, %xmm10
; SSE-NEXT:    shufps {{.*#+}} xmm10 = xmm10[2,3],xmm14[0,2]
; SSE-NEXT:    movaps %xmm5, %xmm14
; SSE-NEXT:    shufps {{.*#+}} xmm14 = xmm14[3,3],xmm7[3,3]
; SSE-NEXT:    movaps %xmm8, %xmm11
; SSE-NEXT:    unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3]
; SSE-NEXT:    shufps {{.*#+}} xmm11 = xmm11[2,3],xmm14[0,2]
; SSE-NEXT:    movaps %xmm4, %xmm14
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; SSE-NEXT:    unpckhps {{.*#+}} xmm14 = xmm14[2],xmm15[2],xmm14[3],xmm15[3]
; SSE-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1]
; SSE-NEXT:    unpckhps {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3]
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,3],xmm0[0,2]
; SSE-NEXT:    movaps %xmm8, %xmm0
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
; SSE-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1]
; SSE-NEXT:    movaps %xmm5, %xmm8
; SSE-NEXT:    unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm7[1]
; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[0,2],xmm14[2,3]
; SSE-NEXT:    shufps {{.*#+}} xmm14 = xmm14[0,1],xmm6[2,0]
; SSE-NEXT:    movaps %xmm7, %xmm6
; SSE-NEXT:    movlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0]
; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[1,1],xmm7[1,1]
; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[2,0],xmm4[2,3]
; SSE-NEXT:    movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0]
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,3],xmm5[0,2]
; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT:    movaps %xmm0, 32(%rax)
; SSE-NEXT:    movaps %xmm14, 48(%rax)
; SSE-NEXT:    movaps %xmm1, 96(%rax)
; SSE-NEXT:    movaps %xmm3, 112(%rax)
; SSE-NEXT:    movaps %xmm13, 160(%rax)
; SSE-NEXT:    movaps %xmm2, 176(%rax)
; SSE-NEXT:    movaps %xmm4, (%rax)
; SSE-NEXT:    movaps %xmm6, 16(%rax)
; SSE-NEXT:    movaps %xmm8, 64(%rax)
; SSE-NEXT:    movaps %xmm11, 80(%rax)
; SSE-NEXT:    movaps %xmm10, 128(%rax)
; SSE-NEXT:    movaps %xmm12, 144(%rax)
; SSE-NEXT:    retq
;
; AVX-LABEL: store_i32_stride6_vf8:
; AVX:       # %bb.0:
; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT:    vmovaps (%rdi), %ymm7
; AVX-NEXT:    vmovaps (%rsi), %ymm8
; AVX-NEXT:    vmovaps (%rdx), %ymm2
; AVX-NEXT:    vmovaps (%rcx), %ymm3
; AVX-NEXT:    vmovaps (%r8), %ymm1
; AVX-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[4],ymm8[4],ymm7[5],ymm8[5]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
; AVX-NEXT:    vextractf128 $1, %ymm4, %xmm4
; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[0,1,2,0]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX-NEXT:    vbroadcastss 16(%r9), %ymm4
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7]
; AVX-NEXT:    vmovaps (%rcx), %xmm9
; AVX-NEXT:    vmovaps (%rdx), %xmm10
; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm10[1,2],xmm9[1,2]
; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[0,2,1,3]
; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm4, %ymm4
; AVX-NEXT:    vmovaps (%rsi), %xmm5
; AVX-NEXT:    vmovaps (%rdi), %xmm6
; AVX-NEXT:    vunpckhps {{.*#+}} xmm11 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
; AVX-NEXT:    vinsertf128 $1, %xmm11, %ymm0, %ymm12
; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm12[4,5],ymm4[6,7]
; AVX-NEXT:    vbroadcastss 4(%r8), %xmm12
; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm12[2,3],ymm4[4,5,6,7]
; AVX-NEXT:    vbroadcastss 4(%r9), %ymm12
; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm12[3],ymm4[4,5,6,7]
; AVX-NEXT:    vunpckhps {{.*#+}} ymm8 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7]
; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm2[1,2],ymm3[1,2],ymm2[5,6],ymm3[5,6]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm7 = ymm7[2,3,2,3]
; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[0,2,1,3,4,6,5,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7]
; AVX-NEXT:    vbroadcastss 20(%r8), %xmm12
; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm12[2,3],ymm7[4,5,6,7]
; AVX-NEXT:    vbroadcastss 20(%r9), %ymm12
; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm12[3],ymm7[4,5,6,7]
; AVX-NEXT:    vunpckhps {{.*#+}} xmm9 = xmm10[2],xmm9[2],xmm10[3],xmm9[3]
; AVX-NEXT:    vshufps {{.*#+}} xmm9 = xmm9[2,3,2,3]
; AVX-NEXT:    vinsertf128 $1, %xmm9, %ymm11, %ymm9
; AVX-NEXT:    vpermilps {{.*#+}} xmm10 = mem[2,1,3,3]
; AVX-NEXT:    vinsertf128 $1, %xmm10, %ymm10, %ymm10
; AVX-NEXT:    vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3,4,5],ymm10[6,7]
; AVX-NEXT:    vmovaps (%r9), %xmm10
; AVX-NEXT:    vshufps {{.*#+}} xmm11 = xmm10[0,2,2,3]
; AVX-NEXT:    vinsertf128 $1, %xmm10, %ymm11, %ymm10
; AVX-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3,4,5,6],ymm10[7]
; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm3[3,0],ymm2[3,0],ymm3[7,4],ymm2[7,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3]
; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5],ymm1[6,7]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = mem[2,3,2,3]
; AVX-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7]
; AVX-NEXT:    vbroadcastss (%rcx), %xmm2
; AVX-NEXT:    vbroadcastss (%rdx), %xmm3
; AVX-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; AVX-NEXT:    vunpcklps {{.*#+}} xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm5
; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6,7]
; AVX-NEXT:    vinsertf128 $1, (%r8), %ymm3, %ymm3
; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
; AVX-NEXT:    vbroadcastss (%r9), %ymm3
; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
; AVX-NEXT:    vmovaps %ymm2, (%rax)
; AVX-NEXT:    vmovaps %ymm1, 160(%rax)
; AVX-NEXT:    vmovaps %ymm9, 64(%rax)
; AVX-NEXT:    vmovaps %ymm7, 128(%rax)
; AVX-NEXT:    vmovaps %ymm4, 32(%rax)
; AVX-NEXT:    vmovaps %ymm0, 96(%rax)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX2-LABEL: store_i32_stride6_vf8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vmovdqa (%rsi), %ymm1
; AVX2-NEXT:    vmovdqa (%rdx), %ymm3
; AVX2-NEXT:    vmovdqa (%rcx), %ymm4
; AVX2-NEXT:    vmovdqa (%r8), %ymm2
; AVX2-NEXT:    vmovdqa (%rsi), %xmm6
; AVX2-NEXT:    vmovdqa (%rdi), %xmm11
; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm7 = xmm11[2],xmm6[2],xmm11[3],xmm6[3]
; AVX2-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm5
; AVX2-NEXT:    vmovdqa (%rcx), %xmm8
; AVX2-NEXT:    vpshufd {{.*#+}} xmm10 = xmm8[1,2,2,3]
; AVX2-NEXT:    vmovdqa (%rdx), %xmm9
; AVX2-NEXT:    vpshufd {{.*#+}} xmm12 = xmm9[1,2,2,3]
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
; AVX2-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,1,2,1]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5],ymm10[6,7]
; AVX2-NEXT:    vmovdqa (%r8), %xmm10
; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm12 = xmm10[0],zero,xmm10[1],zero
; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm12[2,3],ymm5[4,5,6,7]
; AVX2-NEXT:    vpbroadcastd 4(%r9), %ymm12
; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm12[3],ymm5[4,5,6,7]
; AVX2-NEXT:    vpbroadcastd (%rcx), %xmm12
; AVX2-NEXT:    vpbroadcastd (%rdx), %xmm13
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm11[0],xmm6[0],xmm11[1],xmm6[1]
; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm12[2,3],ymm6[4,5,6,7]
; AVX2-NEXT:    vpbroadcastq %xmm10, %ymm11
; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5],ymm6[6,7]
; AVX2-NEXT:    vmovdqa (%r9), %xmm12
; AVX2-NEXT:    vpbroadcastd %xmm12, %ymm11
; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm11[5],ymm6[6,7]
; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm13 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm11 = ymm4[0,1,2,2,4,5,6,6]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm14 = ymm3[1,1,2,3,5,5,6,7]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2],ymm11[3],ymm14[4],ymm11[5],ymm14[6],ymm11[7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5],ymm11[6,7]
; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm14 = mem[0],zero,mem[1],zero
; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3],ymm11[4,5,6,7]
; AVX2-NEXT:    vpbroadcastd 20(%r9), %ymm14
; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm14[3],ymm11[4,5,6,7]
; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm8 = xmm9[2],xmm8[2],xmm9[3],xmm8[3]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3]
; AVX2-NEXT:    vinserti128 $1, %xmm8, %ymm7, %ymm7
; AVX2-NEXT:    vpshufd {{.*#+}} xmm8 = xmm10[2,2,3,3]
; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5],ymm8[6,7]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm8 = xmm12[2,2,3,3]
; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3,4,5,6],ymm8[7]
; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm8 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm8 = ymm8[2,3,2,3,6,7,6,7]
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm13[2,3],ymm8[2,3]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm9 = ymm2[2,1,3,3,6,5,7,7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3,4,5],ymm9[6,7]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm9 = mem[0,2,2,3,4,6,6,7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4,5,6],ymm9[7]
; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5]
; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2]
; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7]
; AVX2-NEXT:    vpbroadcastd 16(%r9), %ymm1
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
; AVX2-NEXT:    vmovdqa %ymm0, 96(%rax)
; AVX2-NEXT:    vmovdqa %ymm8, 160(%rax)
; AVX2-NEXT:    vmovdqa %ymm7, 64(%rax)
; AVX2-NEXT:    vmovdqa %ymm11, 128(%rax)
; AVX2-NEXT:    vmovdqa %ymm6, (%rax)
; AVX2-NEXT:    vmovdqa %ymm5, 32(%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX2-FP-LABEL: store_i32_stride6_vf8:
; AVX2-FP:       # %bb.0:
; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-FP-NEXT:    vmovdqa (%rsi), %ymm1
; AVX2-FP-NEXT:    vmovdqa (%rdx), %ymm3
; AVX2-FP-NEXT:    vmovdqa (%rcx), %ymm4
; AVX2-FP-NEXT:    vmovdqa (%r8), %ymm2
; AVX2-FP-NEXT:    vmovdqa (%rsi), %xmm6
; AVX2-FP-NEXT:    vmovdqa (%rdi), %xmm11
; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm7 = xmm11[2],xmm6[2],xmm11[3],xmm6[3]
; AVX2-FP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm5
; AVX2-FP-NEXT:    vmovdqa (%rcx), %xmm8
; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm10 = xmm8[1,2,2,3]
; AVX2-FP-NEXT:    vmovdqa (%rdx), %xmm9
; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm9[1,2,2,3]
; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,1,2,1]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5],ymm10[6,7]
; AVX2-FP-NEXT:    vmovdqa (%r8), %xmm10
; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm12 = xmm10[0],zero,xmm10[1],zero
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm12[2,3],ymm5[4,5,6,7]
; AVX2-FP-NEXT:    vpbroadcastd 4(%r9), %ymm12
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm12[3],ymm5[4,5,6,7]
; AVX2-FP-NEXT:    vpbroadcastd (%rcx), %xmm12
; AVX2-FP-NEXT:    vpbroadcastd (%rdx), %xmm13
; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm11[0],xmm6[0],xmm11[1],xmm6[1]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm12[2,3],ymm6[4,5,6,7]
; AVX2-FP-NEXT:    vpbroadcastq %xmm10, %ymm11
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5],ymm6[6,7]
; AVX2-FP-NEXT:    vmovdqa (%r9), %xmm12
; AVX2-FP-NEXT:    vpbroadcastd %xmm12, %ymm11
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm11[5],ymm6[6,7]
; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm13 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm11 = ymm4[0,1,2,2,4,5,6,6]
; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm3[1,1,2,3,5,5,6,7]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2],ymm11[3],ymm14[4],ymm11[5],ymm14[6],ymm11[7]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5],ymm11[6,7]
; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm14 = mem[0],zero,mem[1],zero
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3],ymm11[4,5,6,7]
; AVX2-FP-NEXT:    vpbroadcastd 20(%r9), %ymm14
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm14[3],ymm11[4,5,6,7]
; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm8 = xmm9[2],xmm8[2],xmm9[3],xmm8[3]
; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3]
; AVX2-FP-NEXT:    vinserti128 $1, %xmm8, %ymm7, %ymm7
; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm10[2,2,3,3]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5],ymm8[6,7]
; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm12[2,2,3,3]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3,4,5,6],ymm8[7]
; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm8 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7]
; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm8[2,3,2,3,6,7,6,7]
; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm8 = ymm13[2,3],ymm8[2,3]
; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm2[2,1,3,3,6,5,7,7]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3,4,5],ymm9[6,7]
; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm9 = mem[0,2,2,3,4,6,6,7]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4,5,6],ymm9[7]
; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2]
; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7]
; AVX2-FP-NEXT:    vpbroadcastd 16(%r9), %ymm1
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FP-NEXT:    vmovdqa %ymm0, 96(%rax)
; AVX2-FP-NEXT:    vmovdqa %ymm8, 160(%rax)
; AVX2-FP-NEXT:    vmovdqa %ymm7, 64(%rax)
; AVX2-FP-NEXT:    vmovdqa %ymm11, 128(%rax)
; AVX2-FP-NEXT:    vmovdqa %ymm6, (%rax)
; AVX2-FP-NEXT:    vmovdqa %ymm5, 32(%rax)
; AVX2-FP-NEXT:    vzeroupper
; AVX2-FP-NEXT:    retq
;
; AVX2-FCP-LABEL: store_i32_stride6_vf8:
; AVX2-FCP:       # %bb.0:
; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-FCP-NEXT:    vmovdqa (%rsi), %ymm1
; AVX2-FCP-NEXT:    vmovdqa (%rdx), %ymm3
; AVX2-FCP-NEXT:    vmovdqa (%rcx), %ymm4
; AVX2-FCP-NEXT:    vmovdqa (%r8), %ymm2
; AVX2-FCP-NEXT:    vmovdqa (%r9), %ymm5
; AVX2-FCP-NEXT:    vmovdqa (%rsi), %xmm7
; AVX2-FCP-NEXT:    vmovdqa (%rdi), %xmm11
; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm8 = xmm11[2],xmm7[2],xmm11[3],xmm7[3]
; AVX2-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm6
; AVX2-FCP-NEXT:    vmovdqa (%rcx), %xmm9
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm9[1,2,2,3]
; AVX2-FCP-NEXT:    vmovdqa (%rdx), %xmm10
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm13 = xmm10[1,2,2,3]
; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,1,2,1]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm6[4,5],ymm12[6,7]
; AVX2-FCP-NEXT:    vmovdqa (%r8), %xmm12
; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm13 = xmm12[0],zero,xmm12[1],zero
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3],ymm6[4,5,6,7]
; AVX2-FCP-NEXT:    vpbroadcastd 4(%r9), %ymm13
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3],ymm6[4,5,6,7]
; AVX2-FCP-NEXT:    vpbroadcastd (%rcx), %xmm13
; AVX2-FCP-NEXT:    vpbroadcastd (%rdx), %xmm14
; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm7 = xmm11[0],xmm7[0],xmm11[1],xmm7[1]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm13[2,3],ymm7[4,5,6,7]
; AVX2-FCP-NEXT:    vpbroadcastq %xmm12, %ymm11
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5],ymm7[6,7]
; AVX2-FCP-NEXT:    vpbroadcastd (%r9), %ymm11
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm11[5],ymm7[6,7]
; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm11 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm4[0,1,2,2,4,5,6,6]
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm3[1,1,2,3,5,5,6,7]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4],ymm12[5],ymm13[6],ymm12[7]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm11[4,5],ymm12[6,7]
; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm13 = mem[0],zero,mem[1],zero
1051; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7]
1052; AVX2-FCP-NEXT:    vpbroadcastd 20(%r9), %ymm13
1053; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3],ymm12[4,5,6,7]
1054; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm9 = xmm10[2],xmm9[2],xmm10[3],xmm9[3]
1055; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[2,3,2,3]
1056; AVX2-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm8, %ymm8
1057; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [2,2,3,3,2,2,3,3]
1058; AVX2-FCP-NEXT:    # ymm9 = mem[0,1,0,1]
1059; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm9, %ymm10
1060; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3,4,5],ymm10[6,7]
1061; AVX2-FCP-NEXT:    vpermd %ymm5, %ymm9, %ymm9
1062; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4,5,6],ymm9[7]
1063; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm9 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7]
1064; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7]
1065; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm9 = ymm11[2,3],ymm9[2,3]
1066; AVX2-FCP-NEXT:    vpmovsxbq {{.*#+}} ymm10 = [6,0,0,7]
1067; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm10, %ymm10
1068; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3,4,5],ymm10[6,7]
1069; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [0,6,0,0,0,0,0,7]
1070; AVX2-FCP-NEXT:    vpermd %ymm5, %ymm10, %ymm5
1071; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2,3,4,5,6],ymm5[7]
1072; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5]
1073; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2]
1074; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
1075; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
1076; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7]
1077; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7]
1078; AVX2-FCP-NEXT:    vpbroadcastd 16(%r9), %ymm1
1079; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
1080; AVX2-FCP-NEXT:    vmovdqa %ymm0, 96(%rax)
1081; AVX2-FCP-NEXT:    vmovdqa %ymm5, 160(%rax)
1082; AVX2-FCP-NEXT:    vmovdqa %ymm8, 64(%rax)
1083; AVX2-FCP-NEXT:    vmovdqa %ymm12, 128(%rax)
1084; AVX2-FCP-NEXT:    vmovdqa %ymm7, (%rax)
1085; AVX2-FCP-NEXT:    vmovdqa %ymm6, 32(%rax)
1086; AVX2-FCP-NEXT:    vzeroupper
1087; AVX2-FCP-NEXT:    retq
1088;
1089; AVX512-LABEL: store_i32_stride6_vf8:
1090; AVX512:       # %bb.0:
1091; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1092; AVX512-NEXT:    vmovdqa (%rdi), %ymm0
1093; AVX512-NEXT:    vmovdqa (%rdx), %ymm1
1094; AVX512-NEXT:    vmovdqa (%r8), %ymm2
1095; AVX512-NEXT:    vinserti64x4 $1, (%rsi), %zmm0, %zmm0
1096; AVX512-NEXT:    vinserti64x4 $1, (%rcx), %zmm1, %zmm1
1097; AVX512-NEXT:    vinserti64x4 $1, (%r9), %zmm2, %zmm2
1098; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,8,16,24,0,0,1,9,17,25,0,0,2,10,18,26]
1099; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
1100; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,16,24,6,7,8,9,17,25,12,13,14,15]
1101; AVX512-NEXT:    vpermi2d %zmm2, %zmm3, %zmm4
1102; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,0,3,11,19,27,0,0,4,12,20,28,0,0,5,13]
1103; AVX512-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
1104; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [18,26,2,3,4,5,19,27,8,9,10,11,20,28,14,15]
1105; AVX512-NEXT:    vpermi2d %zmm2, %zmm3, %zmm5
1106; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [5,13,0,0,22,30,6,14,0,0,23,31,7,15,0,0]
1107; AVX512-NEXT:    vpermi2d %zmm0, %zmm1, %zmm3
1108; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,1,21,29,4,5,6,7,22,30,10,11,12,13,23,31]
1109; AVX512-NEXT:    vpermi2d %zmm2, %zmm3, %zmm0
1110; AVX512-NEXT:    vmovdqa64 %zmm0, 128(%rax)
1111; AVX512-NEXT:    vmovdqa64 %zmm5, 64(%rax)
1112; AVX512-NEXT:    vmovdqa64 %zmm4, (%rax)
1113; AVX512-NEXT:    vzeroupper
1114; AVX512-NEXT:    retq
1115;
1116; AVX512-FCP-LABEL: store_i32_stride6_vf8:
1117; AVX512-FCP:       # %bb.0:
1118; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1119; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm0
1120; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm1
1121; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm2
1122; AVX512-FCP-NEXT:    vinserti64x4 $1, (%rsi), %zmm0, %zmm0
1123; AVX512-FCP-NEXT:    vinserti64x4 $1, (%rcx), %zmm1, %zmm1
1124; AVX512-FCP-NEXT:    vinserti64x4 $1, (%r9), %zmm2, %zmm2
1125; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,8,16,24,0,0,1,9,17,25,0,0,2,10,18,26]
1126; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
1127; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,16,24,6,7,8,9,17,25,12,13,14,15]
1128; AVX512-FCP-NEXT:    vpermi2d %zmm2, %zmm3, %zmm4
1129; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,0,3,11,19,27,0,0,4,12,20,28,0,0,5,13]
1130; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
1131; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [18,26,2,3,4,5,19,27,8,9,10,11,20,28,14,15]
1132; AVX512-FCP-NEXT:    vpermi2d %zmm2, %zmm3, %zmm5
1133; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [5,13,0,0,22,30,6,14,0,0,23,31,7,15,0,0]
1134; AVX512-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm3
1135; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,1,21,29,4,5,6,7,22,30,10,11,12,13,23,31]
1136; AVX512-FCP-NEXT:    vpermi2d %zmm2, %zmm3, %zmm0
1137; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 128(%rax)
1138; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, 64(%rax)
1139; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, (%rax)
1140; AVX512-FCP-NEXT:    vzeroupper
1141; AVX512-FCP-NEXT:    retq
1142;
1143; AVX512DQ-LABEL: store_i32_stride6_vf8:
1144; AVX512DQ:       # %bb.0:
1145; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1146; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
1147; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm1
1148; AVX512DQ-NEXT:    vmovdqa (%r8), %ymm2
1149; AVX512DQ-NEXT:    vinserti64x4 $1, (%rsi), %zmm0, %zmm0
1150; AVX512DQ-NEXT:    vinserti64x4 $1, (%rcx), %zmm1, %zmm1
1151; AVX512DQ-NEXT:    vinserti64x4 $1, (%r9), %zmm2, %zmm2
1152; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,8,16,24,0,0,1,9,17,25,0,0,2,10,18,26]
1153; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
1154; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,16,24,6,7,8,9,17,25,12,13,14,15]
1155; AVX512DQ-NEXT:    vpermi2d %zmm2, %zmm3, %zmm4
1156; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,0,3,11,19,27,0,0,4,12,20,28,0,0,5,13]
1157; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
1158; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [18,26,2,3,4,5,19,27,8,9,10,11,20,28,14,15]
1159; AVX512DQ-NEXT:    vpermi2d %zmm2, %zmm3, %zmm5
1160; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [5,13,0,0,22,30,6,14,0,0,23,31,7,15,0,0]
1161; AVX512DQ-NEXT:    vpermi2d %zmm0, %zmm1, %zmm3
1162; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,1,21,29,4,5,6,7,22,30,10,11,12,13,23,31]
1163; AVX512DQ-NEXT:    vpermi2d %zmm2, %zmm3, %zmm0
1164; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 128(%rax)
1165; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 64(%rax)
1166; AVX512DQ-NEXT:    vmovdqa64 %zmm4, (%rax)
1167; AVX512DQ-NEXT:    vzeroupper
1168; AVX512DQ-NEXT:    retq
1169;
1170; AVX512DQ-FCP-LABEL: store_i32_stride6_vf8:
1171; AVX512DQ-FCP:       # %bb.0:
1172; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1173; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm0
1174; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm1
1175; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm2
1176; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, (%rsi), %zmm0, %zmm0
1177; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, (%rcx), %zmm1, %zmm1
1178; AVX512DQ-FCP-NEXT:    vinserti64x4 $1, (%r9), %zmm2, %zmm2
1179; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,8,16,24,0,0,1,9,17,25,0,0,2,10,18,26]
1180; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
1181; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,16,24,6,7,8,9,17,25,12,13,14,15]
1182; AVX512DQ-FCP-NEXT:    vpermi2d %zmm2, %zmm3, %zmm4
1183; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,0,3,11,19,27,0,0,4,12,20,28,0,0,5,13]
1184; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
1185; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [18,26,2,3,4,5,19,27,8,9,10,11,20,28,14,15]
1186; AVX512DQ-FCP-NEXT:    vpermi2d %zmm2, %zmm3, %zmm5
1187; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [5,13,0,0,22,30,6,14,0,0,23,31,7,15,0,0]
1188; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm3
1189; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,1,21,29,4,5,6,7,22,30,10,11,12,13,23,31]
1190; AVX512DQ-FCP-NEXT:    vpermi2d %zmm2, %zmm3, %zmm0
1191; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, 128(%rax)
1192; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, 64(%rax)
1193; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, (%rax)
1194; AVX512DQ-FCP-NEXT:    vzeroupper
1195; AVX512DQ-FCP-NEXT:    retq
1196;
1197; AVX512BW-LABEL: store_i32_stride6_vf8:
1198; AVX512BW:       # %bb.0:
1199; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1200; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
1201; AVX512BW-NEXT:    vmovdqa (%rdx), %ymm1
1202; AVX512BW-NEXT:    vmovdqa (%r8), %ymm2
1203; AVX512BW-NEXT:    vinserti64x4 $1, (%rsi), %zmm0, %zmm0
1204; AVX512BW-NEXT:    vinserti64x4 $1, (%rcx), %zmm1, %zmm1
1205; AVX512BW-NEXT:    vinserti64x4 $1, (%r9), %zmm2, %zmm2
1206; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,8,16,24,0,0,1,9,17,25,0,0,2,10,18,26]
1207; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
1208; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,16,24,6,7,8,9,17,25,12,13,14,15]
1209; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm3, %zmm4
1210; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,0,3,11,19,27,0,0,4,12,20,28,0,0,5,13]
1211; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
1212; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [18,26,2,3,4,5,19,27,8,9,10,11,20,28,14,15]
1213; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm3, %zmm5
1214; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [5,13,0,0,22,30,6,14,0,0,23,31,7,15,0,0]
1215; AVX512BW-NEXT:    vpermi2d %zmm0, %zmm1, %zmm3
1216; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,1,21,29,4,5,6,7,22,30,10,11,12,13,23,31]
1217; AVX512BW-NEXT:    vpermi2d %zmm2, %zmm3, %zmm0
1218; AVX512BW-NEXT:    vmovdqa64 %zmm0, 128(%rax)
1219; AVX512BW-NEXT:    vmovdqa64 %zmm5, 64(%rax)
1220; AVX512BW-NEXT:    vmovdqa64 %zmm4, (%rax)
1221; AVX512BW-NEXT:    vzeroupper
1222; AVX512BW-NEXT:    retq
1223;
1224; AVX512BW-FCP-LABEL: store_i32_stride6_vf8:
1225; AVX512BW-FCP:       # %bb.0:
1226; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1227; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %ymm0
1228; AVX512BW-FCP-NEXT:    vmovdqa (%rdx), %ymm1
1229; AVX512BW-FCP-NEXT:    vmovdqa (%r8), %ymm2
1230; AVX512BW-FCP-NEXT:    vinserti64x4 $1, (%rsi), %zmm0, %zmm0
1231; AVX512BW-FCP-NEXT:    vinserti64x4 $1, (%rcx), %zmm1, %zmm1
1232; AVX512BW-FCP-NEXT:    vinserti64x4 $1, (%r9), %zmm2, %zmm2
1233; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,8,16,24,0,0,1,9,17,25,0,0,2,10,18,26]
1234; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
1235; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,16,24,6,7,8,9,17,25,12,13,14,15]
1236; AVX512BW-FCP-NEXT:    vpermi2d %zmm2, %zmm3, %zmm4
1237; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,0,3,11,19,27,0,0,4,12,20,28,0,0,5,13]
1238; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
1239; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [18,26,2,3,4,5,19,27,8,9,10,11,20,28,14,15]
1240; AVX512BW-FCP-NEXT:    vpermi2d %zmm2, %zmm3, %zmm5
1241; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [5,13,0,0,22,30,6,14,0,0,23,31,7,15,0,0]
1242; AVX512BW-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm3
1243; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,1,21,29,4,5,6,7,22,30,10,11,12,13,23,31]
1244; AVX512BW-FCP-NEXT:    vpermi2d %zmm2, %zmm3, %zmm0
1245; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, 128(%rax)
1246; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, 64(%rax)
1247; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, (%rax)
1248; AVX512BW-FCP-NEXT:    vzeroupper
1249; AVX512BW-FCP-NEXT:    retq
1250;
1251; AVX512DQ-BW-LABEL: store_i32_stride6_vf8:
1252; AVX512DQ-BW:       # %bb.0:
1253; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1254; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %ymm0
1255; AVX512DQ-BW-NEXT:    vmovdqa (%rdx), %ymm1
1256; AVX512DQ-BW-NEXT:    vmovdqa (%r8), %ymm2
1257; AVX512DQ-BW-NEXT:    vinserti64x4 $1, (%rsi), %zmm0, %zmm0
1258; AVX512DQ-BW-NEXT:    vinserti64x4 $1, (%rcx), %zmm1, %zmm1
1259; AVX512DQ-BW-NEXT:    vinserti64x4 $1, (%r9), %zmm2, %zmm2
1260; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,8,16,24,0,0,1,9,17,25,0,0,2,10,18,26]
1261; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
1262; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,16,24,6,7,8,9,17,25,12,13,14,15]
1263; AVX512DQ-BW-NEXT:    vpermi2d %zmm2, %zmm3, %zmm4
1264; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,0,3,11,19,27,0,0,4,12,20,28,0,0,5,13]
1265; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
1266; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [18,26,2,3,4,5,19,27,8,9,10,11,20,28,14,15]
1267; AVX512DQ-BW-NEXT:    vpermi2d %zmm2, %zmm3, %zmm5
1268; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [5,13,0,0,22,30,6,14,0,0,23,31,7,15,0,0]
1269; AVX512DQ-BW-NEXT:    vpermi2d %zmm0, %zmm1, %zmm3
1270; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,1,21,29,4,5,6,7,22,30,10,11,12,13,23,31]
1271; AVX512DQ-BW-NEXT:    vpermi2d %zmm2, %zmm3, %zmm0
1272; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, 128(%rax)
1273; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, 64(%rax)
1274; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm4, (%rax)
1275; AVX512DQ-BW-NEXT:    vzeroupper
1276; AVX512DQ-BW-NEXT:    retq
1277;
1278; AVX512DQ-BW-FCP-LABEL: store_i32_stride6_vf8:
1279; AVX512DQ-BW-FCP:       # %bb.0:
1280; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1281; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %ymm0
1282; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdx), %ymm1
1283; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%r8), %ymm2
1284; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, (%rsi), %zmm0, %zmm0
1285; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, (%rcx), %zmm1, %zmm1
1286; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, (%r9), %zmm2, %zmm2
1287; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,8,16,24,0,0,1,9,17,25,0,0,2,10,18,26]
1288; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
1289; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,16,24,6,7,8,9,17,25,12,13,14,15]
1290; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm2, %zmm3, %zmm4
1291; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,0,3,11,19,27,0,0,4,12,20,28,0,0,5,13]
1292; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm0, %zmm3
1293; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [18,26,2,3,4,5,19,27,8,9,10,11,20,28,14,15]
1294; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm2, %zmm3, %zmm5
1295; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [5,13,0,0,22,30,6,14,0,0,23,31,7,15,0,0]
1296; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm3
1297; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,1,21,29,4,5,6,7,22,30,10,11,12,13,23,31]
1298; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm2, %zmm3, %zmm0
1299; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, 128(%rax)
1300; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, 64(%rax)
1301; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, (%rax)
1302; AVX512DQ-BW-FCP-NEXT:    vzeroupper
1303; AVX512DQ-BW-FCP-NEXT:    retq
1304  %in.vec0 = load <8 x i32>, ptr %in.vecptr0, align 64
1305  %in.vec1 = load <8 x i32>, ptr %in.vecptr1, align 64
1306  %in.vec2 = load <8 x i32>, ptr %in.vecptr2, align 64
1307  %in.vec3 = load <8 x i32>, ptr %in.vecptr3, align 64
1308  %in.vec4 = load <8 x i32>, ptr %in.vecptr4, align 64
1309  %in.vec5 = load <8 x i32>, ptr %in.vecptr5, align 64
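  ; Build one <48 x i32> value: concatenate the six <8 x i32> inputs pairwise, then join the pairs.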
  %1 = shufflevector <8 x i32> %in.vec0, <8 x i32> %in.vec1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %2 = shufflevector <8 x i32> %in.vec2, <8 x i32> %in.vec3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %3 = shufflevector <8 x i32> %in.vec4, <8 x i32> %in.vec5, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %4 = shufflevector <16 x i32> %1, <16 x i32> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %5 = shufflevector <16 x i32> %3, <16 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %6 = shufflevector <32 x i32> %4, <32 x i32> %5, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
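  ; The mask below realizes the stride-6 interleave: output element 6*i+j is lane i of input vector j (lane 8*j+i of %6).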
  %interleaved.vec = shufflevector <48 x i32> %6, <48 x i32> poison, <48 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 7, i32 15, i32 23, i32 31, i32 39, i32 47>
  store <48 x i32> %interleaved.vec, ptr %out.vec, align 64
  ret void
}

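; Same pattern at twice the width: six <16 x i32> inputs interleaved into a 96-element (384-byte) stride-6 store.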
define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind {
; SSE-LABEL: store_i32_stride6_vf16:
; SSE:       # %bb.0:
; SSE-NEXT:    subq $72, %rsp
; SSE-NEXT:    movaps (%rdi), %xmm7
; SSE-NEXT:    movaps 16(%rdi), %xmm8
; SSE-NEXT:    movaps (%rsi), %xmm2
; SSE-NEXT:    movaps 16(%rsi), %xmm6
; SSE-NEXT:    movaps (%rdx), %xmm9
; SSE-NEXT:    movaps 16(%rdx), %xmm10
; SSE-NEXT:    movaps (%rcx), %xmm1
; SSE-NEXT:    movaps 16(%rcx), %xmm0
; SSE-NEXT:    movaps (%r8), %xmm3
; SSE-NEXT:    movaps 16(%r8), %xmm14
; SSE-NEXT:    movaps (%r9), %xmm4
; SSE-NEXT:    movaps 16(%r9), %xmm13
; SSE-NEXT:    movaps %xmm9, %xmm11
; SSE-NEXT:    unpcklps {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1]
; SSE-NEXT:    movaps %xmm7, %xmm5
; SSE-NEXT:    unpcklps {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
; SSE-NEXT:    movaps %xmm4, %xmm12
; SSE-NEXT:    movlhps {{.*#+}} xmm12 = xmm12[0],xmm3[0]
; SSE-NEXT:    shufps {{.*#+}} xmm12 = xmm12[2,0],xmm5[2,3]
; SSE-NEXT:    movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movlhps {{.*#+}} xmm5 = xmm5[0],xmm11[0]
; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps %xmm3, %xmm5
; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[1,1],xmm4[1,1]
; SSE-NEXT:    shufps {{.*#+}} xmm11 = xmm11[2,3],xmm5[0,2]
; SSE-NEXT:    movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps %xmm1, %xmm5
; SSE-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm9[1]
; SSE-NEXT:    unpckhps {{.*#+}} xmm7 = xmm7[2],xmm2[2],xmm7[3],xmm2[3]
; SSE-NEXT:    movaps %xmm3, %xmm2
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1]
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm7[2,3]
; SSE-NEXT:    movaps %xmm2, (%rsp) # 16-byte Spill
; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,1],xmm5[2,0]
; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3],xmm4[3,3]
; SSE-NEXT:    unpckhps {{.*#+}} xmm9 = xmm9[2],xmm1[2],xmm9[3],xmm1[3]
; SSE-NEXT:    shufps {{.*#+}} xmm9 = xmm9[2,3],xmm3[0,2]
; SSE-NEXT:    movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps %xmm10, %xmm2
; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT:    movaps %xmm8, %xmm1
; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
; SSE-NEXT:    movaps %xmm13, %xmm3
; SSE-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm14[0]
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3]
; SSE-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps %xmm14, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm13[1,1]
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,3],xmm1[0,2]
; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm10[1]
; SSE-NEXT:    unpckhps {{.*#+}} xmm8 = xmm8[2],xmm6[2],xmm8[3],xmm6[3]
; SSE-NEXT:    movaps %xmm14, %xmm2
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm13[1]
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm8[2,3]
; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[0,1],xmm1[2,0]
; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps 32(%rdi), %xmm12
; SSE-NEXT:    shufps {{.*#+}} xmm14 = xmm14[3,3],xmm13[3,3]
; SSE-NEXT:    movaps 32(%rdx), %xmm13
; SSE-NEXT:    unpckhps {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3]
; SSE-NEXT:    movaps 32(%rcx), %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm10 = xmm10[2,3],xmm14[0,2]
; SSE-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps %xmm13, %xmm14
; SSE-NEXT:    unpcklps {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1]
; SSE-NEXT:    movaps 32(%rsi), %xmm1
; SSE-NEXT:    movaps %xmm12, %xmm15
; SSE-NEXT:    unpcklps {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1]
; SSE-NEXT:    movaps 32(%r8), %xmm2
; SSE-NEXT:    movaps 32(%r9), %xmm3
; SSE-NEXT:    movaps %xmm3, %xmm11
; SSE-NEXT:    movlhps {{.*#+}} xmm11 = xmm11[0],xmm2[0]
; SSE-NEXT:    shufps {{.*#+}} xmm11 = xmm11[2,0],xmm15[2,3]
; SSE-NEXT:    movlhps {{.*#+}} xmm15 = xmm15[0],xmm14[0]
; SSE-NEXT:    movaps %xmm2, %xmm4
; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1]
; SSE-NEXT:    shufps {{.*#+}} xmm14 = xmm14[2,3],xmm4[0,2]
; SSE-NEXT:    unpckhps {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3]
; SSE-NEXT:    movaps %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1]
; SSE-NEXT:    movaps %xmm2, %xmm8
; SSE-NEXT:    unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm3[1]
; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[0,2],xmm12[2,3]
; SSE-NEXT:    shufps {{.*#+}} xmm12 = xmm12[0,1],xmm1[2,0]
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3]
; SSE-NEXT:    unpckhps {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3]
; SSE-NEXT:    shufps {{.*#+}} xmm13 = xmm13[2,3],xmm2[0,2]
; SSE-NEXT:    movaps 48(%rdx), %xmm3
; SSE-NEXT:    movaps 48(%rcx), %xmm10
; SSE-NEXT:    movaps %xmm3, %xmm5
; SSE-NEXT:    unpcklps {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1]
; SSE-NEXT:    movaps 48(%rdi), %xmm2
; SSE-NEXT:    movaps 48(%rsi), %xmm9
; SSE-NEXT:    movaps %xmm2, %xmm4
; SSE-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1]
; SSE-NEXT:    movaps 48(%r8), %xmm1
; SSE-NEXT:    movaps 48(%r9), %xmm7
; SSE-NEXT:    movaps %xmm7, %xmm6
; SSE-NEXT:    movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0]
; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[2,0],xmm4[2,3]
; SSE-NEXT:    movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm7[1,1]
; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,3],xmm0[0,2]
; SSE-NEXT:    unpckhps {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3]
; SSE-NEXT:    movaps %xmm10, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
; SSE-NEXT:    movaps %xmm1, %xmm9
; SSE-NEXT:    unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm7[1]
; SSE-NEXT:    shufps {{.*#+}} xmm9 = xmm9[0,2],xmm2[2,3]
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0]
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3],xmm7[3,3]
; SSE-NEXT:    unpckhps {{.*#+}} xmm3 = xmm3[2],xmm10[2],xmm3[3],xmm10[3]
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,3],xmm1[0,2]
; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT:    movaps %xmm3, 368(%rax)
; SSE-NEXT:    movaps %xmm9, 352(%rax)
; SSE-NEXT:    movaps %xmm2, 336(%rax)
; SSE-NEXT:    movaps %xmm5, 320(%rax)
; SSE-NEXT:    movaps %xmm6, 304(%rax)
; SSE-NEXT:    movaps %xmm4, 288(%rax)
; SSE-NEXT:    movaps %xmm13, 272(%rax)
; SSE-NEXT:    movaps %xmm8, 256(%rax)
; SSE-NEXT:    movaps %xmm12, 240(%rax)
; SSE-NEXT:    movaps %xmm14, 224(%rax)
; SSE-NEXT:    movaps %xmm11, 208(%rax)
; SSE-NEXT:    movaps %xmm15, 192(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 176(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 160(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 144(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 128(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 112(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 96(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 80(%rax)
; SSE-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 64(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 48(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 32(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 16(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, (%rax)
; SSE-NEXT:    addq $72, %rsp
; SSE-NEXT:    retq
;
; AVX-LABEL: store_i32_stride6_vf16:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $104, %rsp
; AVX-NEXT:    vmovaps 32(%rdi), %ymm5
; AVX-NEXT:    vmovaps 32(%rsi), %ymm13
; AVX-NEXT:    vmovaps 32(%rdx), %ymm7
; AVX-NEXT:    vmovaps 32(%rcx), %ymm9
; AVX-NEXT:    vmovaps 32(%r8), %ymm11
; AVX-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovaps (%rcx), %xmm8
; AVX-NEXT:    vmovaps 32(%rcx), %xmm3
; AVX-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps (%rdx), %xmm6
; AVX-NEXT:    vmovaps 32(%rdx), %xmm10
; AVX-NEXT:    vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm6[1,2],xmm8[1,2]
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT:    vmovaps (%rsi), %xmm1
; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps (%rdi), %xmm2
; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vunpckhps {{.*#+}} xmm4 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm1
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX-NEXT:    vbroadcastss 4(%r8), %xmm1
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX-NEXT:    vbroadcastss 4(%r9), %ymm1
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm5[0],ymm13[0],ymm5[1],ymm13[1],ymm5[4],ymm13[4],ymm5[5],ymm13[5]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7]
; AVX-NEXT:    vbroadcastss 48(%r9), %ymm1
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm10[1,2],xmm3[1,2]
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT:    vmovaps 32(%rsi), %xmm3
; AVX-NEXT:    vmovaps 32(%rdi), %xmm2
; AVX-NEXT:    vunpckhps {{.*#+}} xmm15 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX-NEXT:    vinsertf128 $1, %xmm15, %ymm0, %ymm11
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7]
; AVX-NEXT:    vbroadcastss 36(%r8), %xmm11
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm11[2,3],ymm0[4,5,6,7]
; AVX-NEXT:    vbroadcastss 36(%r9), %ymm11
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3],ymm0[4,5,6,7]
; AVX-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
; AVX-NEXT:    vmovaps (%rdi), %ymm1
; AVX-NEXT:    vmovaps (%rsi), %ymm0
; AVX-NEXT:    vunpcklps {{.*#+}} ymm11 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm14 = ymm11[2,3,2,3]
; AVX-NEXT:    vmovaps (%rdx), %ymm11
; AVX-NEXT:    vmovaps (%rcx), %ymm12
; AVX-NEXT:    vunpcklpd {{.*#+}} ymm10 = ymm12[0],ymm11[0],ymm12[2],ymm11[2]
; AVX-NEXT:    vextractf128 $1, %ymm10, %xmm10
; AVX-NEXT:    vshufps {{.*#+}} xmm10 = xmm10[0,1,2,0]
; AVX-NEXT:    vblendps {{.*#+}} ymm10 = ymm14[0,1],ymm10[2,3],ymm14[4,5,6,7]
; AVX-NEXT:    vmovaps (%r8), %ymm14
; AVX-NEXT:    vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4,5],ymm10[6,7]
; AVX-NEXT:    vbroadcastss 16(%r9), %ymm14
; AVX-NEXT:    vblendps {{.*#+}} ymm14 = ymm10[0,1,2,3,4],ymm14[5],ymm10[6,7]
; AVX-NEXT:    vunpckhps {{.*#+}} ymm13 = ymm5[2],ymm13[2],ymm5[3],ymm13[3],ymm5[6],ymm13[6],ymm5[7],ymm13[7]
; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm7[1,2],ymm9[1,2],ymm7[5,6],ymm9[5,6]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3]
; AVX-NEXT:    vshufps {{.*#+}} ymm5 = ymm5[0,2,1,3,4,6,5,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm13[4,5],ymm5[6,7]
; AVX-NEXT:    vbroadcastss 52(%r8), %xmm10
; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm10[2,3],ymm5[4,5,6,7]
; AVX-NEXT:    vbroadcastss 52(%r9), %ymm10
; AVX-NEXT:    vblendps {{.*#+}} ymm10 = ymm5[0,1,2],ymm10[3],ymm5[4,5,6,7]
; AVX-NEXT:    vunpckhps {{.*#+}} ymm5 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm11[1,2],ymm12[1,2],ymm11[5,6],ymm12[5,6]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7]
; AVX-NEXT:    vbroadcastss 20(%r8), %xmm1
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX-NEXT:    vbroadcastss 20(%r9), %ymm1
; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
; AVX-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm6[2],xmm8[2],xmm6[3],xmm8[3]
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm4, %ymm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm4 = mem[2,1,3,3]
; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm4, %ymm4
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5],ymm4[6,7]
; AVX-NEXT:    vmovaps (%r9), %xmm4
; AVX-NEXT:    vshufps {{.*#+}} xmm6 = xmm4[0,2,2,3]
; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm6, %ymm4
; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm0[0],ymm4[1],ymm0[2,3,4,5,6],ymm4[7]
; AVX-NEXT:    vbroadcastss 32(%rcx), %xmm0
; AVX-NEXT:    vbroadcastss 32(%rdx), %xmm6
; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
; AVX-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm3
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7]
; AVX-NEXT:    vinsertf128 $1, 32(%r8), %ymm2, %ymm2
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7]
; AVX-NEXT:    vbroadcastss 32(%r9), %ymm2
; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm9[3,0],ymm7[3,0],ymm9[7,4],ymm7[7,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3]
; AVX-NEXT:    vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
; AVX-NEXT:    # ymm3 = mem[2,3,2,3]
; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5],ymm3[6,7]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = mem[2,3,2,3]
; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4,5,6],ymm3[7]
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
; AVX-NEXT:    # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3]
; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm15, %ymm3
; AVX-NEXT:    vpermilps {{.*#+}} xmm6 = mem[2,1,3,3]
; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm6, %ymm6
; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5],ymm6[6,7]
; AVX-NEXT:    vmovaps 32(%r9), %xmm6
; AVX-NEXT:    vshufps {{.*#+}} xmm7 = xmm6[0,2,2,3]
; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm7, %ymm6
; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3,4,5,6],ymm6[7]
; AVX-NEXT:    vbroadcastss (%rcx), %xmm6
; AVX-NEXT:    vbroadcastss (%rdx), %xmm7
; AVX-NEXT:    vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
; AVX-NEXT:    # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1]
; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm8
; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3],ymm8[4,5,6,7]
; AVX-NEXT:    vinsertf128 $1, (%r8), %ymm7, %ymm7
; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7]
; AVX-NEXT:    vbroadcastss (%r9), %ymm7
; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7]
; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm12[3,0],ymm11[3,0],ymm12[7,4],ymm11[7,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3]
; AVX-NEXT:    vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
; AVX-NEXT:    # ymm7 = mem[2,3,2,3]
; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[2,1,3,3,6,5,7,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3,4,5],ymm7[6,7]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm7 = mem[2,3,2,3]
; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2,3,4,5,6],ymm7[7]
; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT:    vmovaps %ymm5, 160(%rax)
; AVX-NEXT:    vmovaps %ymm6, (%rax)
; AVX-NEXT:    vmovaps %ymm3, 256(%rax)
; AVX-NEXT:    vmovaps %ymm0, 352(%rax)
; AVX-NEXT:    vmovaps %ymm2, 192(%rax)
; AVX-NEXT:    vmovaps %ymm4, 64(%rax)
; AVX-NEXT:    vmovaps %ymm1, 128(%rax)
; AVX-NEXT:    vmovaps %ymm10, 320(%rax)
; AVX-NEXT:    vmovaps %ymm14, 96(%rax)
; AVX-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 224(%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 288(%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 32(%rax)
; AVX-NEXT:    addq $104, %rsp
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX2-LABEL: store_i32_stride6_vf16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    subq $200, %rsp
; AVX2-NEXT:    vmovdqa (%rsi), %xmm12
; AVX2-NEXT:    vmovdqa 32(%rsi), %xmm1
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm12[2],xmm0[3],xmm12[3]
; AVX2-NEXT:    vmovdqu %ymm3, (%rsp) # 32-byte Spill
; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX2-NEXT:    vmovdqa (%rcx), %xmm4
; AVX2-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovdqa 32(%rcx), %xmm7
; AVX2-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3]
; AVX2-NEXT:    vmovdqa (%rdx), %xmm5
; AVX2-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovdqa 32(%rdx), %xmm8
; AVX2-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,3]
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7]
; AVX2-NEXT:    vmovdqa (%r8), %xmm13
; AVX2-NEXT:    vmovdqa 32(%r8), %xmm6
; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm13[0],zero,xmm13[1],zero
; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7]
; AVX2-NEXT:    vpbroadcastd 4(%r9), %ymm4
; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7]
; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm7[1,2,2,3]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm8[1,2,2,3]
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7]
; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm6[0],zero,xmm6[1],zero
; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7]
; AVX2-NEXT:    vpbroadcastd 36(%r9), %ymm4
; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7]
; AVX2-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vpbroadcastd 32(%rcx), %xmm3
; AVX2-NEXT:    vpbroadcastd 32(%rdx), %xmm4
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; AVX2-NEXT:    vmovdqa 32(%rdx), %ymm9
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7]
; AVX2-NEXT:    vpbroadcastq %xmm6, %ymm2
; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
; AVX2-NEXT:    vmovdqa 32(%r9), %xmm15
; AVX2-NEXT:    vpbroadcastd %xmm15, %ymm2
; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7]
; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovdqa 32(%rcx), %ymm14
; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm14[0,1,2,2,4,5,6,6]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm9[1,1,2,3,5,5,6,7]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm10
; AVX2-NEXT:    vmovdqa 32(%rsi), %ymm8
; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm4 = ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[6],ymm8[6],ymm10[7],ymm8[7]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7]
; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero
; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7]
; AVX2-NEXT:    vpbroadcastd 52(%r9), %ymm3
; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6,7]
; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vpbroadcastd (%rcx), %xmm1
; AVX2-NEXT:    vpbroadcastd (%rdx), %xmm3
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX2-NEXT:    vpbroadcastq %xmm13, %ymm1
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-NEXT:    vmovdqa (%r9), %xmm1
; AVX2-NEXT:    vpbroadcastd %xmm1, %ymm2
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovdqa (%rdx), %ymm2
; AVX2-NEXT:    vmovdqa (%rcx), %ymm0
; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm0[0,1,2,2,4,5,6,6]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm5 = ymm2[1,1,2,3,5,5,6,7]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm12 = ymm3[2,1,2,3]
; AVX2-NEXT:    vmovdqa (%rdi), %ymm7
; AVX2-NEXT:    vmovdqa (%rsi), %ymm5
; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm3 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm3[4,5],ymm12[6,7]
; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm11 = mem[0],zero,mem[1],zero
; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3],ymm12[4,5,6,7]
; AVX2-NEXT:    vpbroadcastd 20(%r9), %ymm12
; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3],ymm11[4,5,6,7]
; AVX2-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX2-NEXT:    vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[2,3,2,3]
; AVX2-NEXT:    vmovdqu (%rsp), %ymm12 # 32-byte Reload
; AVX2-NEXT:    vinserti128 $1, %xmm11, %ymm12, %ymm11
; AVX2-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3]
; AVX2-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,1,2,1]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm13[0,1],ymm11[2,3,4,5],ymm13[6,7]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3,4,5,6],ymm1[7]
; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm11 = ymm9[2],ymm14[2],ymm9[3],ymm14[3],ymm9[6],ymm14[6],ymm9[7],ymm14[7]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm11 = ymm11[2,3,2,3,6,7,6,7]
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm11[2,3]
; AVX2-NEXT:    vmovdqa 32(%r8), %ymm11
; AVX2-NEXT:    vpshufd {{.*#+}} ymm13 = ymm11[2,1,3,3,6,5,7,7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1],ymm4[2,3,4,5],ymm13[6,7]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm13 = mem[0,2,2,3,4,6,6,7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0],ymm13[1],ymm4[2,3,4,5,6],ymm13[7]
; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; AVX2-NEXT:    vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm13 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm13 = xmm12[2],mem[2],xmm12[3],mem[3]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[2,3,2,3]
; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX2-NEXT:    vinserti128 $1, %xmm13, %ymm12, %ymm13
; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3]
; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3,4,5],ymm6[6,7]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm13 = xmm15[2,2,3,3]
; AVX2-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,1,2,1]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0],ymm13[1],ymm6[2,3,4,5,6],ymm13[7]
; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm9 = ymm9[0],ymm14[0],ymm9[1],ymm14[1],ymm9[4],ymm14[4],ymm9[5],ymm14[5]
; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm8 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[4],ymm8[4],ymm10[5],ymm8[5]
; AVX2-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2]
; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5],ymm8[6,7]
; AVX2-NEXT:    vpbroadcastd 48(%r9), %ymm9
; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7]
; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm9 = ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[6],ymm0[6],ymm2[7],ymm0[7]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7]
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm9[2,3]
; AVX2-NEXT:    vmovdqa (%r8), %ymm9
; AVX2-NEXT:    vpshufd {{.*#+}} ymm10 = ymm9[2,1,3,3,6,5,7,7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm3[2,3,4,5],ymm10[6,7]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm10 = mem[0,2,2,3,4,6,6,7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0],ymm10[1],ymm3[2,3,4,5,6],ymm10[7]
; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[4],ymm0[4],ymm2[5],ymm0[5]
; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm2 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[4],ymm5[4],ymm7[5],ymm5[5]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5],ymm0[6,7]
; AVX2-NEXT:    vpbroadcastd 16(%r9), %ymm2
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT:    vmovdqa %ymm0, 96(%rax)
; AVX2-NEXT:    vmovdqa %ymm3, 160(%rax)
; AVX2-NEXT:    vmovdqa %ymm8, 288(%rax)
; AVX2-NEXT:    vmovdqa %ymm6, 256(%rax)
; AVX2-NEXT:    vmovdqa %ymm4, 352(%rax)
; AVX2-NEXT:    vmovdqa %ymm1, 64(%rax)
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm0, 128(%rax)
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm0, (%rax)
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm0, 320(%rax)
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm0, 192(%rax)
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm0, 224(%rax)
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm0, 32(%rax)
; AVX2-NEXT:    addq $200, %rsp
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX2-FP-LABEL: store_i32_stride6_vf16:
; AVX2-FP:       # %bb.0:
; AVX2-FP-NEXT:    subq $200, %rsp
; AVX2-FP-NEXT:    vmovdqa (%rsi), %xmm12
; AVX2-FP-NEXT:    vmovdqa 32(%rsi), %xmm1
; AVX2-FP-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm12[2],xmm0[3],xmm12[3]
; AVX2-FP-NEXT:    vmovdqu %ymm3, (%rsp) # 32-byte Spill
; AVX2-FP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX2-FP-NEXT:    vmovdqa (%rcx), %xmm4
; AVX2-FP-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT:    vmovdqa 32(%rcx), %xmm7
; AVX2-FP-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3]
; AVX2-FP-NEXT:    vmovdqa (%rdx), %xmm5
; AVX2-FP-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT:    vmovdqa 32(%rdx), %xmm8
; AVX2-FP-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,3]
; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7]
; AVX2-FP-NEXT:    vmovdqa (%r8), %xmm13
; AVX2-FP-NEXT:    vmovdqa 32(%r8), %xmm6
; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm13[0],zero,xmm13[1],zero
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7]
; AVX2-FP-NEXT:    vpbroadcastd 4(%r9), %ymm4
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7]
; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm7[1,2,2,3]
; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm8[1,2,2,3]
; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7]
; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm6[0],zero,xmm6[1],zero
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7]
; AVX2-FP-NEXT:    vpbroadcastd 36(%r9), %ymm4
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7]
; AVX2-FP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpbroadcastd 32(%rcx), %xmm3
; AVX2-FP-NEXT:    vpbroadcastd 32(%rdx), %xmm4
; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; AVX2-FP-NEXT:    vmovdqa 32(%rdx), %ymm9
; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT:    vpbroadcastq %xmm6, %ymm2
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
; AVX2-FP-NEXT:    vmovdqa 32(%r9), %xmm15
; AVX2-FP-NEXT:    vpbroadcastd %xmm15, %ymm2
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7]
; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovdqa 32(%rcx), %ymm14
; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm14[0,1,2,2,4,5,6,6]
; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm9[1,1,2,3,5,5,6,7]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm10
; AVX2-FP-NEXT:    vmovdqa 32(%rsi), %ymm8
; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm4 = ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[6],ymm8[6],ymm10[7],ymm8[7]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7]
; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT:    vpbroadcastd 52(%r9), %ymm3
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6,7]
; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpbroadcastd (%rcx), %xmm1
; AVX2-FP-NEXT:    vpbroadcastd (%rdx), %xmm3
; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT:    vpbroadcastq %xmm13, %ymm1
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-FP-NEXT:    vmovdqa (%r9), %xmm1
; AVX2-FP-NEXT:    vpbroadcastd %xmm1, %ymm2
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovdqa (%rdx), %ymm2
; AVX2-FP-NEXT:    vmovdqa (%rcx), %ymm0
; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm0[0,1,2,2,4,5,6,6]
; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm2[1,1,2,3,5,5,6,7]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm12 = ymm3[2,1,2,3]
; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm7
; AVX2-FP-NEXT:    vmovdqa (%rsi), %ymm5
; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm3 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm3[4,5],ymm12[6,7]
; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm11 = mem[0],zero,mem[1],zero
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3],ymm12[4,5,6,7]
; AVX2-FP-NEXT:    vpbroadcastd 20(%r9), %ymm12
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3],ymm11[4,5,6,7]
; AVX2-FP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX2-FP-NEXT:    vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
; AVX2-FP-NEXT:    # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3]
; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm11[2,3,2,3]
; AVX2-FP-NEXT:    vmovdqu (%rsp), %ymm12 # 32-byte Reload
; AVX2-FP-NEXT:    vinserti128 $1, %xmm11, %ymm12, %ymm11
; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,1,2,1]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm13[0,1],ymm11[2,3,4,5],ymm13[6,7]
; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3,4,5,6],ymm1[7]
; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm11 = ymm9[2],ymm14[2],ymm9[3],ymm14[3],ymm9[6],ymm14[6],ymm9[7],ymm14[7]
; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm11 = ymm11[2,3,2,3,6,7,6,7]
; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm11[2,3]
; AVX2-FP-NEXT:    vmovdqa 32(%r8), %ymm11
; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm11[2,1,3,3,6,5,7,7]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm13[0,1],ymm4[2,3,4,5],ymm13[6,7]
1951; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm13 = mem[0,2,2,3,4,6,6,7]
1952; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3]
1953; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0],ymm13[1],ymm4[2,3,4,5,6],ymm13[7]
1954; AVX2-FP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
1955; AVX2-FP-NEXT:    vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm13 # 16-byte Folded Reload
1956; AVX2-FP-NEXT:    # xmm13 = xmm12[2],mem[2],xmm12[3],mem[3]
1957; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm13 = xmm13[2,3,2,3]
1958; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
1959; AVX2-FP-NEXT:    vinserti128 $1, %xmm13, %ymm12, %ymm13
1960; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3]
1961; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1]
1962; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3,4,5],ymm6[6,7]
1963; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm13 = xmm15[2,2,3,3]
1964; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,1,2,1]
1965; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0],ymm13[1],ymm6[2,3,4,5,6],ymm13[7]
1966; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm9 = ymm9[0],ymm14[0],ymm9[1],ymm14[1],ymm9[4],ymm14[4],ymm9[5],ymm14[5]
1967; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm8 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[4],ymm8[4],ymm10[5],ymm8[5]
1968; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2]
1969; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3]
1970; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7]
1971; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5],ymm8[6,7]
1972; AVX2-FP-NEXT:    vpbroadcastd 48(%r9), %ymm9
1973; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7]
1974; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm9 = ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[6],ymm0[6],ymm2[7],ymm0[7]
1975; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7]
1976; AVX2-FP-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm9[2,3]
1977; AVX2-FP-NEXT:    vmovdqa (%r8), %ymm9
1978; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm10 = ymm9[2,1,3,3,6,5,7,7]
1979; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3]
1980; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm3[2,3,4,5],ymm10[6,7]
1981; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm10 = mem[0,2,2,3,4,6,6,7]
1982; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3]
1983; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0],ymm10[1],ymm3[2,3,4,5,6],ymm10[7]
1984; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[4],ymm0[4],ymm2[5],ymm0[5]
1985; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm2 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[4],ymm5[4],ymm7[5],ymm5[5]
1986; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
1987; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3]
1988; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
1989; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5],ymm0[6,7]
1990; AVX2-FP-NEXT:    vpbroadcastd 16(%r9), %ymm2
1991; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
1992; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
1993; AVX2-FP-NEXT:    vmovdqa %ymm0, 96(%rax)
1994; AVX2-FP-NEXT:    vmovdqa %ymm3, 160(%rax)
1995; AVX2-FP-NEXT:    vmovdqa %ymm8, 288(%rax)
1996; AVX2-FP-NEXT:    vmovdqa %ymm6, 256(%rax)
1997; AVX2-FP-NEXT:    vmovdqa %ymm4, 352(%rax)
1998; AVX2-FP-NEXT:    vmovdqa %ymm1, 64(%rax)
1999; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2000; AVX2-FP-NEXT:    vmovaps %ymm0, 128(%rax)
2001; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2002; AVX2-FP-NEXT:    vmovaps %ymm0, (%rax)
2003; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2004; AVX2-FP-NEXT:    vmovaps %ymm0, 320(%rax)
2005; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2006; AVX2-FP-NEXT:    vmovaps %ymm0, 192(%rax)
2007; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2008; AVX2-FP-NEXT:    vmovaps %ymm0, 224(%rax)
2009; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2010; AVX2-FP-NEXT:    vmovaps %ymm0, 32(%rax)
2011; AVX2-FP-NEXT:    addq $200, %rsp
2012; AVX2-FP-NEXT:    vzeroupper
2013; AVX2-FP-NEXT:    retq
2014;
; AVX2-FCP-LABEL: store_i32_stride6_vf16:
; AVX2-FCP:       # %bb.0:
; AVX2-FCP-NEXT:    subq $232, %rsp
; AVX2-FCP-NEXT:    vmovdqa (%rsi), %xmm12
; AVX2-FCP-NEXT:    vmovdqa 32(%rsi), %xmm1
; AVX2-FCP-NEXT:    vmovdqa (%rdi), %xmm15
; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm7 = xmm15[2],xmm12[2],xmm15[3],xmm12[3]
; AVX2-FCP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm0
; AVX2-FCP-NEXT:    vmovdqa (%rcx), %xmm8
; AVX2-FCP-NEXT:    vmovdqa 32(%rcx), %xmm6
; AVX2-FCP-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm8[1,2,2,3]
; AVX2-FCP-NEXT:    vmovdqa (%rdx), %xmm9
; AVX2-FCP-NEXT:    vmovdqa 32(%rdx), %xmm10
; AVX2-FCP-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm9[1,2,2,3]
; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7]
; AVX2-FCP-NEXT:    vmovdqa (%r8), %xmm0
; AVX2-FCP-NEXT:    vmovdqa 32(%r8), %xmm4
; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7]
; AVX2-FCP-NEXT:    vpbroadcastd 4(%r9), %ymm5
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5,6,7]
; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm6[1,2,2,3]
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm10 = xmm10[1,2,2,3]
; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5],ymm5[6,7]
; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7]
; AVX2-FCP-NEXT:    vpbroadcastd 36(%r9), %ymm5
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5,6,7]
; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vpbroadcastd 32(%rcx), %xmm3
; AVX2-FCP-NEXT:    vpbroadcastd 32(%rdx), %xmm5
; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
; AVX2-FCP-NEXT:    vmovdqa 32(%rdx), %ymm10
; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX2-FCP-NEXT:    vmovdqa 32(%rcx), %ymm11
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT:    vpbroadcastq %xmm4, %ymm2
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
; AVX2-FCP-NEXT:    vpbroadcastd 32(%r9), %ymm2
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7]
; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm11[0,1,2,2,4,5,6,6]
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm10[1,1,2,3,5,5,6,7]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-FCP-NEXT:    vmovdqa 32(%rsi), %ymm14
; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm2[2],ymm14[2],ymm2[3],ymm14[3],ymm2[6],ymm14[6],ymm2[7],ymm14[7]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT:    vpbroadcastd 52(%r9), %ymm3
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vpbroadcastd (%rcx), %xmm1
; AVX2-FCP-NEXT:    vpbroadcastd (%rdx), %xmm3
; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm15[0],xmm12[0],xmm15[1],xmm12[1]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7]
; AVX2-FCP-NEXT:    vpbroadcastq %xmm0, %ymm0
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FCP-NEXT:    vpbroadcastd (%r9), %ymm1
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovdqa (%rdx), %ymm5
; AVX2-FCP-NEXT:    vmovdqa (%rcx), %ymm1
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm1[0,1,2,2,4,5,6,6]
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm5[1,1,2,3,5,5,6,7]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm3[2,1,2,3]
; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm13
; AVX2-FCP-NEXT:    vmovdqa (%rsi), %ymm4
; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm3 = ymm13[2],ymm4[2],ymm13[3],ymm4[3],ymm13[6],ymm4[6],ymm13[7],ymm4[7]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm3[4,5],ymm15[6,7]
; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm12 = mem[0],zero,mem[1],zero
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3],ymm15[4,5,6,7]
; AVX2-FCP-NEXT:    vpbroadcastd 20(%r9), %ymm15
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm15[3],ymm12[4,5,6,7]
; AVX2-FCP-NEXT:    vmovdqu %ymm6, (%rsp) # 32-byte Spill
; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm8 = xmm9[2],xmm8[2],xmm9[3],xmm8[3]
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3]
; AVX2-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm7, %ymm7
; AVX2-FCP-NEXT:    vmovdqa (%r8), %ymm8
; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm12 = [2,2,3,3,2,2,3,3]
; AVX2-FCP-NEXT:    # ymm12 = mem[0,1,0,1]
; AVX2-FCP-NEXT:    vpermd %ymm8, %ymm12, %ymm9
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3,4,5],ymm9[6,7]
; AVX2-FCP-NEXT:    vmovdqa (%r9), %ymm9
; AVX2-FCP-NEXT:    vpermd %ymm9, %ymm12, %ymm15
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0],ymm15[1],ymm7[2,3,4,5,6],ymm15[7]
; AVX2-FCP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm15 = ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[6],ymm11[6],ymm10[7],ymm11[7]
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm15[2,3,2,3,6,7,6,7]
; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm15[2,3]
; AVX2-FCP-NEXT:    vmovdqa 32(%r8), %ymm15
; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [6,5,3,3,6,5,7,7]
; AVX2-FCP-NEXT:    vpermd %ymm15, %ymm6, %ymm7
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3,4,5],ymm7[6,7]
; AVX2-FCP-NEXT:    vmovdqa 32(%r9), %ymm7
; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [4,6,2,3,4,6,6,7]
; AVX2-FCP-NEXT:    vpermd %ymm7, %ymm0, %ymm6
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4,5,6],ymm6[7]
; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX2-FCP-NEXT:    vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload
; AVX2-FCP-NEXT:    # xmm6 = xmm2[2],mem[2],xmm2[3],mem[3]
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm2, %ymm6
; AVX2-FCP-NEXT:    vpermd %ymm15, %ymm12, %ymm2
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3,4,5],ymm2[6,7]
; AVX2-FCP-NEXT:    vpermd %ymm7, %ymm12, %ymm6
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4,5,6],ymm6[7]
; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} ymm6 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[4],ymm11[4],ymm10[5],ymm11[5]
; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} ymm7 = ymm7[0],ymm14[0],ymm7[1],ymm14[1],ymm7[4],ymm14[4],ymm7[5],ymm14[5]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm15[4,5],ymm6[6,7]
; AVX2-FCP-NEXT:    vpbroadcastd 48(%r9), %ymm7
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7]
; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm7 = ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[6],ymm1[6],ymm5[7],ymm1[7]
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm7[2,3,2,3,6,7,6,7]
; AVX2-FCP-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm7[2,3]
; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [6,5,3,3,6,5,7,7]
; AVX2-FCP-NEXT:    vpermd %ymm8, %ymm7, %ymm7
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3,4,5],ymm7[6,7]
; AVX2-FCP-NEXT:    vpermd %ymm9, %ymm0, %ymm7
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0],ymm7[1],ymm3[2,3,4,5,6],ymm7[7]
; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm5[0],ymm1[0],ymm5[1],ymm1[1],ymm5[4],ymm1[4],ymm5[5],ymm1[5]
; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} ymm1 = ymm13[0],ymm4[0],ymm13[1],ymm4[1],ymm13[4],ymm4[4],ymm13[5],ymm4[5]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7]
; AVX2-FCP-NEXT:    vpbroadcastd 16(%r9), %ymm1
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT:    vmovdqa %ymm0, 96(%rax)
; AVX2-FCP-NEXT:    vmovdqa %ymm3, 160(%rax)
; AVX2-FCP-NEXT:    vmovdqa %ymm6, 288(%rax)
; AVX2-FCP-NEXT:    vmovdqa %ymm2, 256(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 352(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 64(%rax)
; AVX2-FCP-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 128(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, (%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 320(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 192(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 224(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 32(%rax)
; AVX2-FCP-NEXT:    addq $232, %rsp
; AVX2-FCP-NEXT:    vzeroupper
; AVX2-FCP-NEXT:    retq
;
; AVX512-LABEL: store_i32_stride6_vf16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm2
; AVX512-NEXT:    vmovdqa64 (%rsi), %zmm3
; AVX512-NEXT:    vmovdqa64 (%rdx), %zmm4
; AVX512-NEXT:    vmovdqa64 (%rcx), %zmm5
; AVX512-NEXT:    vmovdqa64 (%r8), %zmm1
; AVX512-NEXT:    vmovdqa64 (%r9), %zmm0
; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
; AVX512-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT:    vpermi2d %zmm3, %zmm2, %zmm6
; AVX512-NEXT:    vmovdqa (%rdx), %ymm7
; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [3,11,0,8,7,15,4,12]
; AVX512-NEXT:    vpermi2d (%rcx), %ymm7, %ymm8
; AVX512-NEXT:    movb $36, %cl
; AVX512-NEXT:    kmovw %ecx, %k1
; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm6 {%k1} = zmm8[0,1,0,1,2,3,6,7]
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15]
; AVX512-NEXT:    vpermi2d %zmm1, %zmm6, %zmm7
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm6 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15]
; AVX512-NEXT:    vpermi2d %zmm0, %zmm7, %zmm6
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,11,27,0,0,15,31,12,28,0,0,12,28]
; AVX512-NEXT:    vpermi2d %zmm5, %zmm4, %zmm7
; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
; AVX512-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT:    vpermi2d %zmm3, %zmm2, %zmm8
; AVX512-NEXT:    vmovdqa64 %zmm7, %zmm8 {%k1}
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15]
; AVX512-NEXT:    vpermi2d %zmm1, %zmm8, %zmm7
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm8 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15]
; AVX512-NEXT:    vpermi2d %zmm0, %zmm7, %zmm8
; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26]
; AVX512-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT:    vpermi2d %zmm5, %zmm4, %zmm7
; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
; AVX512-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT:    vpermi2d %zmm3, %zmm2, %zmm9
; AVX512-NEXT:    movb $-110, %cl
; AVX512-NEXT:    kmovw %ecx, %k2
; AVX512-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k2}
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15]
; AVX512-NEXT:    vpermi2d %zmm1, %zmm9, %zmm7
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15]
; AVX512-NEXT:    vpermi2d %zmm0, %zmm7, %zmm9
; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
; AVX512-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT:    vpermi2d %zmm5, %zmm4, %zmm7
; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
; AVX512-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT:    vpermi2d %zmm3, %zmm2, %zmm10
; AVX512-NEXT:    vmovdqa64 %zmm7, %zmm10 {%k2}
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15]
; AVX512-NEXT:    vpermi2d %zmm1, %zmm10, %zmm7
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15]
; AVX512-NEXT:    vpermi2d %zmm0, %zmm7, %zmm10
; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22]
; AVX512-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT:    vpermi2d %zmm5, %zmm4, %zmm7
; AVX512-NEXT:    vmovdqa (%rdi), %ymm11
; AVX512-NEXT:    vpunpckhdq {{.*#+}} ymm11 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7]
; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm7 {%k1} = zmm11[2,3,2,3,2,3,2,3]
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm11 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0]
; AVX512-NEXT:    vpermi2d %zmm1, %zmm7, %zmm11
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23]
; AVX512-NEXT:    vpermi2d %zmm0, %zmm11, %zmm7
; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30]
; AVX512-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT:    vpermi2d %zmm5, %zmm4, %zmm11
; AVX512-NEXT:    vpunpckhdq {{.*#+}} zmm2 = zmm2[2],zmm3[2],zmm2[3],zmm3[3],zmm2[6],zmm3[6],zmm2[7],zmm3[7],zmm2[10],zmm3[10],zmm2[11],zmm3[11],zmm2[14],zmm3[14],zmm2[15],zmm3[15]
; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm2[6,7,6,7,6,7,6,7]
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0]
; AVX512-NEXT:    vpermi2d %zmm1, %zmm11, %zmm2
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31]
; AVX512-NEXT:    vpermi2d %zmm0, %zmm2, %zmm1
; AVX512-NEXT:    vmovdqa64 %zmm10, (%rax)
; AVX512-NEXT:    vmovdqa64 %zmm9, 192(%rax)
; AVX512-NEXT:    vmovdqa64 %zmm1, 320(%rax)
; AVX512-NEXT:    vmovdqa64 %zmm8, 256(%rax)
; AVX512-NEXT:    vmovdqa64 %zmm7, 128(%rax)
; AVX512-NEXT:    vmovdqa64 %zmm6, 64(%rax)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512-FCP-LABEL: store_i32_stride6_vf16:
; AVX512-FCP:       # %bb.0:
; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm3
; AVX512-FCP-NEXT:    vmovdqa64 (%rsi), %zmm5
; AVX512-FCP-NEXT:    vmovdqa64 (%rdx), %zmm2
; AVX512-FCP-NEXT:    vmovdqa64 (%rcx), %zmm4
; AVX512-FCP-NEXT:    vmovdqa64 (%r8), %zmm1
; AVX512-FCP-NEXT:    vmovdqa64 (%r9), %zmm0
; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
; AVX512-FCP-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT:    vpermi2d %zmm4, %zmm2, %zmm6
; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
; AVX512-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT:    vpermi2d %zmm5, %zmm3, %zmm7
; AVX512-FCP-NEXT:    movb $-110, %cl
; AVX512-FCP-NEXT:    kmovw %ecx, %k2
; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm7 {%k2}
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15]
; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm7, %zmm8
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm6 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15]
; AVX512-FCP-NEXT:    vpermi2d %zmm0, %zmm8, %zmm6
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,3,19,0,0,7,23,4,20,0,0,0,0]
; AVX512-FCP-NEXT:    vpermi2d %zmm4, %zmm2, %zmm7
; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
; AVX512-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT:    vpermi2d %zmm5, %zmm3, %zmm8
; AVX512-FCP-NEXT:    movb $36, %cl
; AVX512-FCP-NEXT:    kmovw %ecx, %k1
; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm8 {%k1}
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15]
; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm8, %zmm7
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm8 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15]
; AVX512-FCP-NEXT:    vpermi2d %zmm0, %zmm7, %zmm8
; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23]
; AVX512-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT:    vpermi2d %zmm5, %zmm3, %zmm7
; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22]
; AVX512-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT:    vpermi2d %zmm4, %zmm2, %zmm9
; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0]
; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm9, %zmm7
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23]
; AVX512-FCP-NEXT:    vpermi2d %zmm0, %zmm7, %zmm9
; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26]
; AVX512-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT:    vpermi2d %zmm4, %zmm2, %zmm7
; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
; AVX512-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT:    vpermi2d %zmm5, %zmm3, %zmm10
; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm10 {%k2}
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15]
; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm10, %zmm7
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15]
; AVX512-FCP-NEXT:    vpermi2d %zmm0, %zmm7, %zmm10
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,11,27,0,0,15,31,12,28,0,0,12,28]
; AVX512-FCP-NEXT:    vpermi2d %zmm4, %zmm2, %zmm7
; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
; AVX512-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT:    vpermi2d %zmm5, %zmm3, %zmm11
; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm11 {%k1}
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15]
; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm11, %zmm7
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm11 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15]
; AVX512-FCP-NEXT:    vpermi2d %zmm0, %zmm7, %zmm11
; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31]
; AVX512-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT:    vpermi2d %zmm5, %zmm3, %zmm7
; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30]
; AVX512-FCP-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT:    vpermi2d %zmm4, %zmm2, %zmm3
; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm3 {%k1}
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0]
; AVX512-FCP-NEXT:    vpermi2d %zmm1, %zmm3, %zmm2
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31]
; AVX512-FCP-NEXT:    vpermi2d %zmm0, %zmm2, %zmm1
; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, 320(%rax)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, 256(%rax)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, 192(%rax)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, 128(%rax)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, 64(%rax)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, (%rax)
; AVX512-FCP-NEXT:    vzeroupper
; AVX512-FCP-NEXT:    retq
;
; AVX512DQ-LABEL: store_i32_stride6_vf16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %zmm2
; AVX512DQ-NEXT:    vmovdqa64 (%rsi), %zmm3
; AVX512DQ-NEXT:    vmovdqa64 (%rdx), %zmm4
; AVX512DQ-NEXT:    vmovdqa64 (%rcx), %zmm5
; AVX512DQ-NEXT:    vmovdqa64 (%r8), %zmm1
; AVX512DQ-NEXT:    vmovdqa64 (%r9), %zmm0
; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
; AVX512DQ-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT:    vpermi2d %zmm3, %zmm2, %zmm6
; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm7
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [3,11,0,8,7,15,4,12]
; AVX512DQ-NEXT:    vpermi2d (%rcx), %ymm7, %ymm8
; AVX512DQ-NEXT:    movb $36, %cl
; AVX512DQ-NEXT:    kmovw %ecx, %k1
; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm6 {%k1} = zmm8[0,1,0,1,2,3,6,7]
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15]
; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm6, %zmm7
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm6 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15]
; AVX512DQ-NEXT:    vpermi2d %zmm0, %zmm7, %zmm6
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,11,27,0,0,15,31,12,28,0,0,12,28]
; AVX512DQ-NEXT:    vpermi2d %zmm5, %zmm4, %zmm7
; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
; AVX512DQ-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT:    vpermi2d %zmm3, %zmm2, %zmm8
; AVX512DQ-NEXT:    vmovdqa64 %zmm7, %zmm8 {%k1}
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15]
; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm8, %zmm7
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm8 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15]
; AVX512DQ-NEXT:    vpermi2d %zmm0, %zmm7, %zmm8
; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26]
; AVX512DQ-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT:    vpermi2d %zmm5, %zmm4, %zmm7
; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
; AVX512DQ-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT:    vpermi2d %zmm3, %zmm2, %zmm9
; AVX512DQ-NEXT:    movb $-110, %cl
; AVX512DQ-NEXT:    kmovw %ecx, %k2
; AVX512DQ-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k2}
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15]
; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm9, %zmm7
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15]
; AVX512DQ-NEXT:    vpermi2d %zmm0, %zmm7, %zmm9
; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
; AVX512DQ-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT:    vpermi2d %zmm5, %zmm4, %zmm7
; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
; AVX512DQ-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT:    vpermi2d %zmm3, %zmm2, %zmm10
; AVX512DQ-NEXT:    vmovdqa64 %zmm7, %zmm10 {%k2}
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15]
; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm10, %zmm7
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15]
; AVX512DQ-NEXT:    vpermi2d %zmm0, %zmm7, %zmm10
; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22]
; AVX512DQ-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT:    vpermi2d %zmm5, %zmm4, %zmm7
; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm11
; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} ymm11 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7]
; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm7 {%k1} = zmm11[2,3,2,3,2,3,2,3]
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm11 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0]
; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm7, %zmm11
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23]
; AVX512DQ-NEXT:    vpermi2d %zmm0, %zmm11, %zmm7
; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30]
; AVX512DQ-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT:    vpermi2d %zmm5, %zmm4, %zmm11
; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} zmm2 = zmm2[2],zmm3[2],zmm2[3],zmm3[3],zmm2[6],zmm3[6],zmm2[7],zmm3[7],zmm2[10],zmm3[10],zmm2[11],zmm3[11],zmm2[14],zmm3[14],zmm2[15],zmm3[15]
; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm2[6,7,6,7,6,7,6,7]
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0]
; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm11, %zmm2
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31]
; AVX512DQ-NEXT:    vpermi2d %zmm0, %zmm2, %zmm1
; AVX512DQ-NEXT:    vmovdqa64 %zmm10, (%rax)
; AVX512DQ-NEXT:    vmovdqa64 %zmm9, 192(%rax)
; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 320(%rax)
; AVX512DQ-NEXT:    vmovdqa64 %zmm8, 256(%rax)
; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 128(%rax)
; AVX512DQ-NEXT:    vmovdqa64 %zmm6, 64(%rax)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512DQ-FCP-LABEL: store_i32_stride6_vf16:
; AVX512DQ-FCP:       # %bb.0:
; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm3
; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rsi), %zmm5
; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdx), %zmm2
; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rcx), %zmm4
; AVX512DQ-FCP-NEXT:    vmovdqa64 (%r8), %zmm1
; AVX512DQ-FCP-NEXT:    vmovdqa64 (%r9), %zmm0
; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
; AVX512DQ-FCP-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm4, %zmm2, %zmm6
; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
; AVX512DQ-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm5, %zmm3, %zmm7
; AVX512DQ-FCP-NEXT:    movb $-110, %cl
; AVX512DQ-FCP-NEXT:    kmovw %ecx, %k2
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm7 {%k2}
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm7, %zmm8
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm6 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm8, %zmm6
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,3,19,0,0,7,23,4,20,0,0,0,0]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm4, %zmm2, %zmm7
; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
; AVX512DQ-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm5, %zmm3, %zmm8
; AVX512DQ-FCP-NEXT:    movb $36, %cl
; AVX512DQ-FCP-NEXT:    kmovw %ecx, %k1
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm8 {%k1}
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm8, %zmm7
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm8 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm7, %zmm8
; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23]
; AVX512DQ-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm5, %zmm3, %zmm7
; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22]
; AVX512DQ-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm4, %zmm2, %zmm9
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm9, %zmm7
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm7, %zmm9
; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26]
; AVX512DQ-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm4, %zmm2, %zmm7
; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
; AVX512DQ-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm5, %zmm3, %zmm10
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm10 {%k2}
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm10, %zmm7
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm7, %zmm10
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,11,27,0,0,15,31,12,28,0,0,12,28]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm4, %zmm2, %zmm7
; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
; AVX512DQ-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm5, %zmm3, %zmm11
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm11 {%k1}
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm11, %zmm7
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm11 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm7, %zmm11
; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31]
; AVX512DQ-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm5, %zmm3, %zmm7
; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30]
; AVX512DQ-FCP-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm4, %zmm2, %zmm3
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm3 {%k1}
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm1, %zmm3, %zmm2
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm2, %zmm1
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 320(%rax)
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, 256(%rax)
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, 192(%rax)
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, 128(%rax)
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, 64(%rax)
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, (%rax)
; AVX512DQ-FCP-NEXT:    vzeroupper
; AVX512DQ-FCP-NEXT:    retq
;
; AVX512BW-LABEL: store_i32_stride6_vf16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm2
; AVX512BW-NEXT:    vmovdqa64 (%rsi), %zmm3
; AVX512BW-NEXT:    vmovdqa64 (%rdx), %zmm4
; AVX512BW-NEXT:    vmovdqa64 (%rcx), %zmm5
; AVX512BW-NEXT:    vmovdqa64 (%r8), %zmm1
; AVX512BW-NEXT:    vmovdqa64 (%r9), %zmm0
; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
; AVX512BW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT:    vpermi2d %zmm3, %zmm2, %zmm6
; AVX512BW-NEXT:    vmovdqa (%rdx), %ymm7
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [3,11,0,8,7,15,4,12]
; AVX512BW-NEXT:    vpermi2d (%rcx), %ymm7, %ymm8
; AVX512BW-NEXT:    movb $36, %cl
; AVX512BW-NEXT:    kmovd %ecx, %k1
; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm6 {%k1} = zmm8[0,1,0,1,2,3,6,7]
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15]
; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm6, %zmm7
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm6 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15]
; AVX512BW-NEXT:    vpermi2d %zmm0, %zmm7, %zmm6
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,11,27,0,0,15,31,12,28,0,0,12,28]
; AVX512BW-NEXT:    vpermi2d %zmm5, %zmm4, %zmm7
; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT:    vpermi2d %zmm3, %zmm2, %zmm8
; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm8 {%k1}
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15]
; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm8, %zmm7
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm8 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15]
; AVX512BW-NEXT:    vpermi2d %zmm0, %zmm7, %zmm8
; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26]
; AVX512BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT:    vpermi2d %zmm5, %zmm4, %zmm7
; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
; AVX512BW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT:    vpermi2d %zmm3, %zmm2, %zmm9
; AVX512BW-NEXT:    movb $-110, %cl
; AVX512BW-NEXT:    kmovd %ecx, %k2
; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k2}
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15]
; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm9, %zmm7
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15]
; AVX512BW-NEXT:    vpermi2d %zmm0, %zmm7, %zmm9
; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
; AVX512BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT:    vpermi2d %zmm5, %zmm4, %zmm7
; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
; AVX512BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT:    vpermi2d %zmm3, %zmm2, %zmm10
; AVX512BW-NEXT:    vmovdqa64 %zmm7, %zmm10 {%k2}
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15]
; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm10, %zmm7
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15]
; AVX512BW-NEXT:    vpermi2d %zmm0, %zmm7, %zmm10
; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22]
; AVX512BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT:    vpermi2d %zmm5, %zmm4, %zmm7
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm11
; AVX512BW-NEXT:    vpunpckhdq {{.*#+}} ymm11 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7]
; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm7 {%k1} = zmm11[2,3,2,3,2,3,2,3]
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm11 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0]
; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm7, %zmm11
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23]
; AVX512BW-NEXT:    vpermi2d %zmm0, %zmm11, %zmm7
; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30]
; AVX512BW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT:    vpermi2d %zmm5, %zmm4, %zmm11
; AVX512BW-NEXT:    vpunpckhdq {{.*#+}} zmm2 = zmm2[2],zmm3[2],zmm2[3],zmm3[3],zmm2[6],zmm3[6],zmm2[7],zmm3[7],zmm2[10],zmm3[10],zmm2[11],zmm3[11],zmm2[14],zmm3[14],zmm2[15],zmm3[15]
; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm2[6,7,6,7,6,7,6,7]
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0]
; AVX512BW-NEXT:    vpermi2d %zmm1, %zmm11, %zmm2
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31]
; AVX512BW-NEXT:    vpermi2d %zmm0, %zmm2, %zmm1
; AVX512BW-NEXT:    vmovdqa64 %zmm10, (%rax)
; AVX512BW-NEXT:    vmovdqa64 %zmm9, 192(%rax)
; AVX512BW-NEXT:    vmovdqa64 %zmm1, 320(%rax)
; AVX512BW-NEXT:    vmovdqa64 %zmm8, 256(%rax)
; AVX512BW-NEXT:    vmovdqa64 %zmm7, 128(%rax)
; AVX512BW-NEXT:    vmovdqa64 %zmm6, 64(%rax)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BW-FCP-LABEL: store_i32_stride6_vf16:
; AVX512BW-FCP:       # %bb.0:
; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm3
; AVX512BW-FCP-NEXT:    vmovdqa64 (%rsi), %zmm5
; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdx), %zmm2
; AVX512BW-FCP-NEXT:    vmovdqa64 (%rcx), %zmm4
; AVX512BW-FCP-NEXT:    vmovdqa64 (%r8), %zmm1
; AVX512BW-FCP-NEXT:    vmovdqa64 (%r9), %zmm0
; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
; AVX512BW-FCP-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm4, %zmm2, %zmm6
; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
; AVX512BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm5, %zmm3, %zmm7
; AVX512BW-FCP-NEXT:    movb $-110, %cl
; AVX512BW-FCP-NEXT:    kmovd %ecx, %k2
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm7 {%k2}
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm7, %zmm8
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm6 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm0, %zmm8, %zmm6
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,3,19,0,0,7,23,4,20,0,0,0,0]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm4, %zmm2, %zmm7
; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
; AVX512BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm5, %zmm3, %zmm8
; AVX512BW-FCP-NEXT:    movb $36, %cl
; AVX512BW-FCP-NEXT:    kmovd %ecx, %k1
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm8 {%k1}
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm8, %zmm7
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm8 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm0, %zmm7, %zmm8
; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23]
; AVX512BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm5, %zmm3, %zmm7
; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22]
; AVX512BW-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm4, %zmm2, %zmm9
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm9, %zmm7
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm0, %zmm7, %zmm9
; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26]
; AVX512BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm4, %zmm2, %zmm7
; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
; AVX512BW-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm5, %zmm3, %zmm10
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm10 {%k2}
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm10, %zmm7
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm0, %zmm7, %zmm10
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,11,27,0,0,15,31,12,28,0,0,12,28]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm4, %zmm2, %zmm7
; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
; AVX512BW-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm5, %zmm3, %zmm11
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm11 {%k1}
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm11, %zmm7
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm11 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm0, %zmm7, %zmm11
; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31]
; AVX512BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm5, %zmm3, %zmm7
; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30]
; AVX512BW-FCP-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm4, %zmm2, %zmm3
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm3 {%k1}
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm1, %zmm3, %zmm2
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31]
; AVX512BW-FCP-NEXT:    vpermi2d %zmm0, %zmm2, %zmm1
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, 320(%rax)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, 256(%rax)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, 192(%rax)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, 128(%rax)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, 64(%rax)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, (%rax)
; AVX512BW-FCP-NEXT:    vzeroupper
; AVX512BW-FCP-NEXT:    retq
;
; AVX512DQ-BW-LABEL: store_i32_stride6_vf16:
; AVX512DQ-BW:       # %bb.0:
; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm2
; AVX512DQ-BW-NEXT:    vmovdqa64 (%rsi), %zmm3
; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdx), %zmm4
; AVX512DQ-BW-NEXT:    vmovdqa64 (%rcx), %zmm5
; AVX512DQ-BW-NEXT:    vmovdqa64 (%r8), %zmm1
; AVX512DQ-BW-NEXT:    vmovdqa64 (%r9), %zmm0
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
; AVX512DQ-BW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm3, %zmm2, %zmm6
; AVX512DQ-BW-NEXT:    vmovdqa (%rdx), %ymm7
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm8 = [3,11,0,8,7,15,4,12]
; AVX512DQ-BW-NEXT:    vpermi2d (%rcx), %ymm7, %ymm8
; AVX512DQ-BW-NEXT:    movb $36, %cl
; AVX512DQ-BW-NEXT:    kmovd %ecx, %k1
; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm6 {%k1} = zmm8[0,1,0,1,2,3,6,7]
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm6, %zmm7
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm6 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm0, %zmm7, %zmm6
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,11,27,0,0,15,31,12,28,0,0,12,28]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm5, %zmm4, %zmm7
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
; AVX512DQ-BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm3, %zmm2, %zmm8
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, %zmm8 {%k1}
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm8, %zmm7
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm8 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm0, %zmm7, %zmm8
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26]
; AVX512DQ-BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm5, %zmm4, %zmm7
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
; AVX512DQ-BW-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm3, %zmm2, %zmm9
; AVX512DQ-BW-NEXT:    movb $-110, %cl
; AVX512DQ-BW-NEXT:    kmovd %ecx, %k2
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k2}
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm9, %zmm7
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm0, %zmm7, %zmm9
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
; AVX512DQ-BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm5, %zmm4, %zmm7
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
; AVX512DQ-BW-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm3, %zmm2, %zmm10
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, %zmm10 {%k2}
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm10, %zmm7
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm0, %zmm7, %zmm10
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22]
; AVX512DQ-BW-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm5, %zmm4, %zmm7
; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %ymm11
; AVX512DQ-BW-NEXT:    vpunpckhdq {{.*#+}} ymm11 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7]
; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm7 {%k1} = zmm11[2,3,2,3,2,3,2,3]
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm11 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm7, %zmm11
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm0, %zmm11, %zmm7
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30]
; AVX512DQ-BW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm5, %zmm4, %zmm11
; AVX512DQ-BW-NEXT:    vpunpckhdq {{.*#+}} zmm2 = zmm2[2],zmm3[2],zmm2[3],zmm3[3],zmm2[6],zmm3[6],zmm2[7],zmm3[7],zmm2[10],zmm3[10],zmm2[11],zmm3[11],zmm2[14],zmm3[14],zmm2[15],zmm3[15]
; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm2[6,7,6,7,6,7,6,7]
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm1, %zmm11, %zmm2
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31]
; AVX512DQ-BW-NEXT:    vpermi2d %zmm0, %zmm2, %zmm1
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm10, (%rax)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, 192(%rax)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, 320(%rax)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm8, 256(%rax)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, 128(%rax)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, 64(%rax)
; AVX512DQ-BW-NEXT:    vzeroupper
; AVX512DQ-BW-NEXT:    retq
;
; AVX512DQ-BW-FCP-LABEL: store_i32_stride6_vf16:
; AVX512DQ-BW-FCP:       # %bb.0:
; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm3
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rsi), %zmm5
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdx), %zmm2
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rcx), %zmm4
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%r8), %zmm1
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%r9), %zmm0
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
; AVX512DQ-BW-FCP-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm4, %zmm2, %zmm6
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
; AVX512DQ-BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm5, %zmm3, %zmm7
; AVX512DQ-BW-FCP-NEXT:    movb $-110, %cl
; AVX512DQ-BW-FCP-NEXT:    kmovd %ecx, %k2
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm7 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm7, %zmm8
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm6 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm0, %zmm8, %zmm6
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,3,19,0,0,7,23,4,20,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm4, %zmm2, %zmm7
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
; AVX512DQ-BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm5, %zmm3, %zmm8
; AVX512DQ-BW-FCP-NEXT:    movb $36, %cl
; AVX512DQ-BW-FCP-NEXT:    kmovd %ecx, %k1
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm8 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm8, %zmm7
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm8 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm0, %zmm7, %zmm8
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23]
; AVX512DQ-BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm5, %zmm3, %zmm7
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22]
; AVX512DQ-BW-FCP-NEXT:    # zmm9 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm4, %zmm2, %zmm9
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm9 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm9, %zmm7
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm0, %zmm7, %zmm9
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26]
; AVX512DQ-BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm4, %zmm2, %zmm7
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
; AVX512DQ-BW-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm5, %zmm3, %zmm10
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm10 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm10, %zmm7
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm0, %zmm7, %zmm10
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,0,0,16,11,27,0,0,15,31,12,28,0,0,12,28]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm4, %zmm2, %zmm7
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
; AVX512DQ-BW-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm5, %zmm3, %zmm11
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm11 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm11, %zmm7
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm11 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm0, %zmm7, %zmm11
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm7 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31]
; AVX512DQ-BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm5, %zmm3, %zmm7
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30]
; AVX512DQ-BW-FCP-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm4, %zmm2, %zmm3
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm3 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm1, %zmm3, %zmm2
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31]
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm0, %zmm2, %zmm1
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, 320(%rax)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, 256(%rax)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, 192(%rax)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, 128(%rax)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, 64(%rax)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, (%rax)
; AVX512DQ-BW-FCP-NEXT:    vzeroupper
; AVX512DQ-BW-FCP-NEXT:    retq
  %in.vec0 = load <16 x i32>, ptr %in.vecptr0, align 64
  %in.vec1 = load <16 x i32>, ptr %in.vecptr1, align 64
  %in.vec2 = load <16 x i32>, ptr %in.vecptr2, align 64
  %in.vec3 = load <16 x i32>, ptr %in.vecptr3, align 64
  %in.vec4 = load <16 x i32>, ptr %in.vecptr4, align 64
  %in.vec5 = load <16 x i32>, ptr %in.vecptr5, align 64
  %1 = shufflevector <16 x i32> %in.vec0, <16 x i32> %in.vec1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %2 = shufflevector <16 x i32> %in.vec2, <16 x i32> %in.vec3, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %3 = shufflevector <16 x i32> %in.vec4, <16 x i32> %in.vec5, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %4 = shufflevector <32 x i32> %1, <32 x i32> %2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
  %5 = shufflevector <32 x i32> %3, <32 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %6 = shufflevector <64 x i32> %4, <64 x i32> %5, <96 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95>
  %interleaved.vec = shufflevector <96 x i32> %6, <96 x i32> poison, <96 x i32> <i32 0, i32 16, i32 32, i32 48, i32 64, i32 80, i32 1, i32 17, i32 33, i32 49, i32 65, i32 81, i32 2, i32 18, i32 34, i32 50, i32 66, i32 82, i32 3, i32 19, i32 35, i32 51, i32 67, i32 83, i32 4, i32 20, i32 36, i32 52, i32 68, i32 84, i32 5, i32 21, i32 37, i32 53, i32 69, i32 85, i32 6, i32 22, i32 38, i32 54, i32 70, i32 86, i32 7, i32 23, i32 39, i32 55, i32 71, i32 87, i32 8, i32 24, i32 40, i32 56, i32 72, i32 88, i32 9, i32 25, i32 41, i32 57, i32 73, i32 89, i32 10, i32 26, i32 42, i32 58, i32 74, i32 90, i32 11, i32 27, i32 43, i32 59, i32 75, i32 91, i32 12, i32 28, i32 44, i32 60, i32 76, i32 92, i32 13, i32 29, i32 45, i32 61, i32 77, i32 93, i32 14, i32 30, i32 46, i32 62, i32 78, i32 94, i32 15, i32 31, i32 47, i32 63, i32 79, i32 95>
  store <96 x i32> %interleaved.vec, ptr %out.vec, align 64
  ret void
}

define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind {
; SSE-LABEL: store_i32_stride6_vf32:
; SSE:       # %bb.0:
; SSE-NEXT:    subq $456, %rsp # imm = 0x1C8
; SSE-NEXT:    movaps (%rdi), %xmm9
; SSE-NEXT:    movaps 16(%rdi), %xmm10
; SSE-NEXT:    movaps (%rsi), %xmm4
; SSE-NEXT:    movaps 16(%rsi), %xmm0
; SSE-NEXT:    movaps (%rdx), %xmm11
; SSE-NEXT:    movaps 16(%rdx), %xmm12
; SSE-NEXT:    movaps (%rcx), %xmm5
; SSE-NEXT:    movaps 16(%rcx), %xmm1
; SSE-NEXT:    movaps (%r8), %xmm6
; SSE-NEXT:    movaps 16(%r8), %xmm2
; SSE-NEXT:    movaps (%r9), %xmm7
; SSE-NEXT:    movaps 16(%r9), %xmm3
; SSE-NEXT:    movaps %xmm11, %xmm13
; SSE-NEXT:    unpcklps {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1]
; SSE-NEXT:    movaps %xmm9, %xmm8
; SSE-NEXT:    unpcklps {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1]
; SSE-NEXT:    movaps %xmm7, %xmm14
; SSE-NEXT:    movlhps {{.*#+}} xmm14 = xmm14[0],xmm6[0]
; SSE-NEXT:    shufps {{.*#+}} xmm14 = xmm14[2,0],xmm8[2,3]
; SSE-NEXT:    movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movlhps {{.*#+}} xmm8 = xmm8[0],xmm13[0]
; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps %xmm6, %xmm8
; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[1,1],xmm7[1,1]
; SSE-NEXT:    shufps {{.*#+}} xmm13 = xmm13[2,3],xmm8[0,2]
; SSE-NEXT:    movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps %xmm5, %xmm8
; SSE-NEXT:    unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm11[1]
; SSE-NEXT:    unpckhps {{.*#+}} xmm9 = xmm9[2],xmm4[2],xmm9[3],xmm4[3]
; SSE-NEXT:    movaps %xmm6, %xmm4
; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm7[1]
; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm9[2,3]
; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    shufps {{.*#+}} xmm9 = xmm9[0,1],xmm8[2,0]
; SSE-NEXT:    movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[3,3],xmm7[3,3]
; SSE-NEXT:    unpckhps {{.*#+}} xmm11 = xmm11[2],xmm5[2],xmm11[3],xmm5[3]
; SSE-NEXT:    shufps {{.*#+}} xmm11 = xmm11[2,3],xmm6[0,2]
; SSE-NEXT:    movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps %xmm12, %xmm5
; SSE-NEXT:    unpcklps {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
; SSE-NEXT:    movaps %xmm10, %xmm4
; SSE-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
; SSE-NEXT:    movaps %xmm3, %xmm6
; SSE-NEXT:    movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0]
; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[2,0],xmm4[2,3]
; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps %xmm2, %xmm4
; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1]
; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2]
; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps %xmm1, %xmm4
; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm12[1]
; SSE-NEXT:    unpckhps {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3]
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm10[2,3]
; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    shufps {{.*#+}} xmm10 = xmm10[0,1],xmm4[2,0]
; SSE-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps 32(%rdi), %xmm5
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3]
; SSE-NEXT:    movaps 32(%rdx), %xmm6
; SSE-NEXT:    unpckhps {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3]
; SSE-NEXT:    movaps 32(%rcx), %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm12 = xmm12[2,3],xmm2[0,2]
; SSE-NEXT:    movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps %xmm6, %xmm7
; SSE-NEXT:    unpcklps {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
; SSE-NEXT:    movaps 32(%rsi), %xmm1
; SSE-NEXT:    movaps %xmm5, %xmm4
; SSE-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT:    movaps 32(%r8), %xmm2
; SSE-NEXT:    movaps 32(%r9), %xmm3
; SSE-NEXT:    movaps %xmm3, %xmm8
; SSE-NEXT:    movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0]
; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[2,3]
; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movlhps {{.*#+}} xmm4 = xmm4[0],xmm7[0]
; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps %xmm2, %xmm4
; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1]
; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[2,3],xmm4[0,2]
; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3]
; SSE-NEXT:    movaps %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1]
; SSE-NEXT:    movaps %xmm2, %xmm4
; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[2,3]
; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2,0]
; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3]
; SSE-NEXT:    unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3]
; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2]
; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps 48(%rdx), %xmm6
; SSE-NEXT:    movaps 48(%rcx), %xmm0
; SSE-NEXT:    movaps %xmm6, %xmm5
; SSE-NEXT:    unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
; SSE-NEXT:    movaps 48(%rdi), %xmm7
; SSE-NEXT:    movaps 48(%rsi), %xmm1
; SSE-NEXT:    movaps %xmm7, %xmm4
; SSE-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT:    movaps 48(%r8), %xmm2
; SSE-NEXT:    movaps 48(%r9), %xmm3
; SSE-NEXT:    movaps %xmm3, %xmm8
; SSE-NEXT:    movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0]
; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[2,3]
; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps %xmm2, %xmm4
; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1]
; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2]
; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3]
; SSE-NEXT:    movaps %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1]
; SSE-NEXT:    movaps %xmm2, %xmm4
; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3]
; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0]
; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3]
; SSE-NEXT:    unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3]
; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2]
; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps 64(%rdx), %xmm6
; SSE-NEXT:    movaps 64(%rcx), %xmm0
; SSE-NEXT:    movaps %xmm6, %xmm5
; SSE-NEXT:    unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
; SSE-NEXT:    movaps 64(%rdi), %xmm7
; SSE-NEXT:    movaps 64(%rsi), %xmm1
; SSE-NEXT:    movaps %xmm7, %xmm4
; SSE-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT:    movaps 64(%r8), %xmm2
; SSE-NEXT:    movaps 64(%r9), %xmm3
; SSE-NEXT:    movaps %xmm3, %xmm8
; SSE-NEXT:    movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0]
; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[2,3]
; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps %xmm2, %xmm4
; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1]
; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2]
; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3]
; SSE-NEXT:    movaps %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1]
; SSE-NEXT:    movaps %xmm2, %xmm4
; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3]
; SSE-NEXT:    movaps %xmm4, (%rsp) # 16-byte Spill
; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0]
; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3]
; SSE-NEXT:    unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3]
; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2]
; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps 80(%rdx), %xmm6
; SSE-NEXT:    movaps 80(%rcx), %xmm0
; SSE-NEXT:    movaps %xmm6, %xmm5
; SSE-NEXT:    unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
; SSE-NEXT:    movaps 80(%rdi), %xmm7
; SSE-NEXT:    movaps 80(%rsi), %xmm1
; SSE-NEXT:    movaps %xmm7, %xmm4
; SSE-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT:    movaps 80(%r8), %xmm2
; SSE-NEXT:    movaps 80(%r9), %xmm3
; SSE-NEXT:    movaps %xmm3, %xmm8
; SSE-NEXT:    movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0]
; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[2,3]
; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps %xmm2, %xmm4
; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1]
; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2]
; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3]
; SSE-NEXT:    movaps %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1]
; SSE-NEXT:    movaps %xmm2, %xmm4
; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3]
; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0]
; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3]
; SSE-NEXT:    unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3]
; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2]
; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps 96(%rdx), %xmm9
; SSE-NEXT:    movaps 96(%rcx), %xmm0
; SSE-NEXT:    movaps %xmm9, %xmm14
; SSE-NEXT:    unpcklps {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1]
; SSE-NEXT:    movaps 96(%rdi), %xmm11
; SSE-NEXT:    movaps 96(%rsi), %xmm1
; SSE-NEXT:    movaps %xmm11, %xmm13
; SSE-NEXT:    unpcklps {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1]
; SSE-NEXT:    movaps 96(%r8), %xmm2
; SSE-NEXT:    movaps 96(%r9), %xmm3
; SSE-NEXT:    movaps %xmm3, %xmm15
; SSE-NEXT:    movlhps {{.*#+}} xmm15 = xmm15[0],xmm2[0]
; SSE-NEXT:    shufps {{.*#+}} xmm15 = xmm15[2,0],xmm13[2,3]
; SSE-NEXT:    movlhps {{.*#+}} xmm13 = xmm13[0],xmm14[0]
; SSE-NEXT:    movaps %xmm2, %xmm4
; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1]
; SSE-NEXT:    shufps {{.*#+}} xmm14 = xmm14[2,3],xmm4[0,2]
; SSE-NEXT:    unpckhps {{.*#+}} xmm11 = xmm11[2],xmm1[2],xmm11[3],xmm1[3]
; SSE-NEXT:    movaps %xmm0, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm9[1]
; SSE-NEXT:    movaps %xmm2, %xmm8
; SSE-NEXT:    unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm3[1]
; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[0,2],xmm11[2,3]
; SSE-NEXT:    shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[2,0]
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3]
; SSE-NEXT:    unpckhps {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3]
; SSE-NEXT:    shufps {{.*#+}} xmm9 = xmm9[2,3],xmm2[0,2]
; SSE-NEXT:    movaps 112(%rdx), %xmm3
; SSE-NEXT:    movaps 112(%rcx), %xmm12
; SSE-NEXT:    movaps %xmm3, %xmm5
; SSE-NEXT:    unpcklps {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1]
; SSE-NEXT:    movaps 112(%rdi), %xmm2
; SSE-NEXT:    movaps 112(%rsi), %xmm10
; SSE-NEXT:    movaps %xmm2, %xmm4
; SSE-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1]
; SSE-NEXT:    movaps 112(%r8), %xmm1
; SSE-NEXT:    movaps 112(%r9), %xmm7
; SSE-NEXT:    movaps %xmm7, %xmm6
; SSE-NEXT:    movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0]
; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[2,0],xmm4[2,3]
; SSE-NEXT:    movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm7[1,1]
; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,3],xmm0[0,2]
; SSE-NEXT:    unpckhps {{.*#+}} xmm2 = xmm2[2],xmm10[2],xmm2[3],xmm10[3]
; SSE-NEXT:    movaps %xmm12, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
; SSE-NEXT:    movaps %xmm1, %xmm10
; SSE-NEXT:    unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm7[1]
; SSE-NEXT:    shufps {{.*#+}} xmm10 = xmm10[0,2],xmm2[2,3]
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0]
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3],xmm7[3,3]
; SSE-NEXT:    unpckhps {{.*#+}} xmm3 = xmm3[2],xmm12[2],xmm3[3],xmm12[3]
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,3],xmm1[0,2]
; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT:    movaps %xmm3, 752(%rax)
; SSE-NEXT:    movaps %xmm10, 736(%rax)
; SSE-NEXT:    movaps %xmm2, 720(%rax)
; SSE-NEXT:    movaps %xmm5, 704(%rax)
; SSE-NEXT:    movaps %xmm6, 688(%rax)
; SSE-NEXT:    movaps %xmm4, 672(%rax)
; SSE-NEXT:    movaps %xmm9, 656(%rax)
; SSE-NEXT:    movaps %xmm8, 640(%rax)
; SSE-NEXT:    movaps %xmm11, 624(%rax)
; SSE-NEXT:    movaps %xmm14, 608(%rax)
; SSE-NEXT:    movaps %xmm15, 592(%rax)
; SSE-NEXT:    movaps %xmm13, 576(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 560(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 544(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 528(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 512(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 496(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 480(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 464(%rax)
; SSE-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 448(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 432(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 416(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 400(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 384(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 368(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 352(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 336(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 320(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 304(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 288(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 272(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 256(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 240(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 224(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 208(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 192(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 176(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 160(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 144(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 128(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 112(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 96(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 80(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 64(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 48(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 32(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 16(%rax)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, (%rax)
; SSE-NEXT:    addq $456, %rsp # imm = 0x1C8
; SSE-NEXT:    retq
;
; AVX-LABEL: store_i32_stride6_vf32:
; AVX:       # %bb.0:
; AVX-NEXT:    subq $1032, %rsp # imm = 0x408
; AVX-NEXT:    vmovaps (%rdi), %ymm12
; AVX-NEXT:    vmovaps (%rsi), %ymm8
; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovaps (%rdx), %ymm4
; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovaps (%rcx), %ymm6
; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovaps (%r8), %ymm5
; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovaps (%rcx), %xmm1
; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps 32(%rcx), %xmm2
; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps (%rdx), %xmm0
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps 32(%rdx), %xmm3
; AVX-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2]
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT:    vmovaps (%rsi), %xmm1
; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps (%rdi), %xmm7
; AVX-NEXT:    vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm1[2],xmm7[3],xmm1[3]
; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX-NEXT:    vbroadcastss 4(%r8), %xmm1
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX-NEXT:    vbroadcastss 4(%r9), %ymm1
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm12[0],ymm8[0],ymm12[1],ymm8[1],ymm12[4],ymm8[4],ymm12[5],ymm8[5]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm4[0],ymm6[2],ymm4[2]
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7]
; AVX-NEXT:    vbroadcastss 16(%r9), %ymm1
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm3[1,2],xmm2[1,2]
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT:    vmovaps 32(%rsi), %xmm1
; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps 32(%rdi), %xmm2
; AVX-NEXT:    vmovaps %xmm2, (%rsp) # 16-byte Spill
; AVX-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX-NEXT:    vbroadcastss 36(%r8), %xmm1
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX-NEXT:    vbroadcastss 36(%r9), %ymm1
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovaps 32(%rdi), %ymm1
; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovaps 32(%rsi), %ymm0
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX-NEXT:    vmovaps 32(%rdx), %ymm8
; AVX-NEXT:    vmovaps 32(%rcx), %ymm13
; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2]
; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX-NEXT:    vmovaps 32(%r8), %ymm1
; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX-NEXT:    vbroadcastss 48(%r9), %ymm1
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovaps 64(%rcx), %xmm1
; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps 64(%rdx), %xmm0
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2]
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT:    vmovaps 64(%rsi), %xmm1
; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps 64(%rdi), %xmm2
; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX-NEXT:    vbroadcastss 68(%r8), %xmm1
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX-NEXT:    vbroadcastss 68(%r9), %ymm1
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovaps 64(%rdi), %ymm6
; AVX-NEXT:    vmovaps 64(%rsi), %ymm14
; AVX-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm6[0],ymm14[0],ymm6[1],ymm14[1],ymm6[4],ymm14[4],ymm6[5],ymm14[5]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX-NEXT:    vmovaps 64(%rdx), %ymm2
; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovaps 64(%rcx), %ymm1
; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX-NEXT:    vmovaps 64(%r8), %ymm1
; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX-NEXT:    vbroadcastss 80(%r9), %ymm1
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovaps 96(%rcx), %xmm9
; AVX-NEXT:    vmovaps 96(%rdx), %xmm11
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm11[1,2],xmm9[1,2]
; AVX-NEXT:    vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm2
; AVX-NEXT:    vmovaps 96(%rsi), %xmm5
; AVX-NEXT:    vmovaps 96(%rdi), %xmm4
; AVX-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm3
; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7]
; AVX-NEXT:    vbroadcastss 100(%r8), %xmm3
; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7]
; AVX-NEXT:    vbroadcastss 100(%r9), %ymm3
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7]
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovaps 96(%rdi), %ymm7
; AVX-NEXT:    vmovaps 96(%rsi), %ymm3
; AVX-NEXT:    vunpcklps {{.*#+}} ymm10 = ymm7[0],ymm3[0],ymm7[1],ymm3[1],ymm7[4],ymm3[4],ymm7[5],ymm3[5]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm10[2,3,2,3]
; AVX-NEXT:    vmovaps 96(%rdx), %ymm10
; AVX-NEXT:    vmovaps 96(%rcx), %ymm2
; AVX-NEXT:    vunpcklpd {{.*#+}} ymm15 = ymm2[0],ymm10[0],ymm2[2],ymm10[2]
; AVX-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vextractf128 $1, %ymm15, %xmm15
; AVX-NEXT:    vshufps {{.*#+}} xmm15 = xmm15[0,1,2,0]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm15[2,3],ymm1[4,5,6,7]
; AVX-NEXT:    vmovaps 96(%r8), %ymm1
; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX-NEXT:    vbroadcastss 112(%r9), %ymm15
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7]
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload
; AVX-NEXT:    # ymm1 = ymm12[2],mem[2],ymm12[3],mem[3],ymm12[6],mem[6],ymm12[7],mem[7]
; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,2],ymm12[1,2],ymm0[5,6],ymm12[5,6]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX-NEXT:    vbroadcastss 20(%r8), %xmm15
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7]
; AVX-NEXT:    vbroadcastss 20(%r9), %ymm15
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3],ymm0[4,5,6,7]
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
; AVX-NEXT:    # ymm15 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm8[1,2],ymm13[1,2],ymm8[5,6],ymm13[5,6]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7]
; AVX-NEXT:    vbroadcastss 52(%r8), %xmm8
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3],ymm0[4,5,6,7]
; AVX-NEXT:    vbroadcastss 52(%r9), %ymm8
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3],ymm0[4,5,6,7]
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm14[2],ymm6[3],ymm14[3],ymm6[6],ymm14[6],ymm6[7],ymm14[7]
; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX-NEXT:    vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload
; AVX-NEXT:    # ymm0 = ymm14[1,2],mem[1,2],ymm14[5,6],mem[5,6]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX-NEXT:    vbroadcastss 84(%r8), %xmm6
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5,6,7]
; AVX-NEXT:    vbroadcastss 84(%r9), %ymm6
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3],ymm0[4,5,6,7]
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm9[0,0,0,0]
; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm11[0,0,0,0]
; AVX-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm4
; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5,6,7]
; AVX-NEXT:    vinsertf128 $1, 96(%r8), %ymm0, %ymm0
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX-NEXT:    vbroadcastss 96(%r9), %ymm1
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vunpckhps {{.*#+}} ymm11 = ymm7[2],ymm3[2],ymm7[3],ymm3[3],ymm7[6],ymm3[6],ymm7[7],ymm3[7]
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm10[1,2],ymm2[1,2],ymm10[5,6],ymm2[5,6]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7]
; AVX-NEXT:    vbroadcastss 116(%r8), %xmm1
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX-NEXT:    vbroadcastss 116(%r9), %ymm1
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vbroadcastss (%rcx), %xmm0
; AVX-NEXT:    vbroadcastss (%rdx), %xmm1
; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm3
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7]
; AVX-NEXT:    vinsertf128 $1, (%r8), %ymm1, %ymm1
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX-NEXT:    vbroadcastss (%r9), %ymm1
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX-NEXT:    # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX-NEXT:    vpermilps {{.*#+}} xmm3 = mem[2,1,3,3]
; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm3
; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5],ymm3[6,7]
; AVX-NEXT:    vmovaps (%r9), %xmm3
; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm3[0,2,2,3]
; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4,5,6],ymm3[7]
; AVX-NEXT:    vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload
; AVX-NEXT:    # ymm3 = ymm12[3,0],mem[3,0],ymm12[7,4],mem[7,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7]
; AVX-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
; AVX-NEXT:    # ymm3 = mem[2,3],ymm3[2,3]
; AVX-NEXT:    vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
; AVX-NEXT:    # ymm4 = mem[2,3,2,3]
; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5],ymm4[6,7]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm4 = mem[2,3,2,3]
; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4,5,6],ymm4[7]
; AVX-NEXT:    vbroadcastss 32(%rcx), %xmm4
; AVX-NEXT:    vbroadcastss 32(%rdx), %xmm6
; AVX-NEXT:    vunpcklps {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
; AVX-NEXT:    vmovaps (%rsp), %xmm5 # 16-byte Reload
; AVX-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm6 # 16-byte Folded Reload
; AVX-NEXT:    # xmm6 = xmm5[0],mem[0],xmm5[1],mem[1]
; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm7
; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3],ymm7[4,5,6,7]
; AVX-NEXT:    vinsertf128 $1, 32(%r8), %ymm6, %ymm6
; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7]
; AVX-NEXT:    vbroadcastss 32(%r9), %ymm6
; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5],ymm4[6,7]
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
; AVX-NEXT:    # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3]
; AVX-NEXT:    vshufps {{.*#+}} xmm6 = xmm6[2,3,2,3]
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm7, %ymm6
; AVX-NEXT:    vpermilps {{.*#+}} xmm7 = mem[2,1,3,3]
; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm7, %ymm7
; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5],ymm7[6,7]
; AVX-NEXT:    vmovaps 32(%r9), %xmm7
; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm7[0,2,2,3]
; AVX-NEXT:    vinsertf128 $1, %xmm7, %ymm8, %ymm7
; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6],ymm7[7]
; AVX-NEXT:    vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm6 # 32-byte Folded Reload
; AVX-NEXT:    # ymm6 = ymm13[3,0],mem[3,0],ymm13[7,4],mem[7,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm15[2,3],ymm6[2,3]
; AVX-NEXT:    vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
; AVX-NEXT:    # ymm8 = mem[2,3,2,3]
; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm8[2,1,3,3,6,5,7,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3,4,5],ymm8[6,7]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm8 = mem[2,3,2,3]
; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3,4,5,6],ymm8[7]
; AVX-NEXT:    vbroadcastss 64(%rcx), %xmm8
; AVX-NEXT:    vbroadcastss 64(%rdx), %xmm9
; AVX-NEXT:    vunpcklps {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm9 # 16-byte Folded Reload
; AVX-NEXT:    # xmm9 = xmm5[0],mem[0],xmm5[1],mem[1]
; AVX-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm12
; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3],ymm12[4,5,6,7]
; AVX-NEXT:    vinsertf128 $1, 64(%r8), %ymm9, %ymm9
; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7]
; AVX-NEXT:    vbroadcastss 64(%r9), %ymm9
; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7]
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
; AVX-NEXT:    # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3]
; AVX-NEXT:    vshufps {{.*#+}} xmm9 = xmm9[2,3,2,3]
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX-NEXT:    vinsertf128 $1, %xmm9, %ymm12, %ymm9
; AVX-NEXT:    vpermilps {{.*#+}} xmm12 = mem[2,1,3,3]
; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm12, %ymm12
; AVX-NEXT:    vblendps {{.*#+}} ymm9 = ymm12[0,1],ymm9[2,3,4,5],ymm12[6,7]
; AVX-NEXT:    vmovaps 64(%r9), %xmm12
; AVX-NEXT:    vshufps {{.*#+}} xmm13 = xmm12[0,2,2,3]
; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm13, %ymm12
; AVX-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0],ymm12[1],ymm9[2,3,4,5,6],ymm12[7]
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm5[3,0],ymm14[3,0],ymm5[7,4],ymm14[7,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm12[2,0,2,3,6,4,6,7]
; AVX-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload
; AVX-NEXT:    # ymm5 = mem[2,3],ymm12[2,3]
; AVX-NEXT:    vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
; AVX-NEXT:    # ymm12 = mem[2,3,2,3]
; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm12[2,1,3,3,6,5,7,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm12[0,1],ymm5[2,3,4,5],ymm12[6,7]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm12 = mem[2,3,2,3]
; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm12[0,2,2,3,4,6,6,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3,4,5,6],ymm12[7]
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm12 # 16-byte Folded Reload
; AVX-NEXT:    # xmm12 = xmm10[2],mem[2],xmm10[3],mem[3]
; AVX-NEXT:    vshufps {{.*#+}} xmm12 = xmm12[2,3,2,3]
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm10, %ymm12
; AVX-NEXT:    vpermilps {{.*#+}} xmm13 = mem[2,1,3,3]
; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm13, %ymm13
; AVX-NEXT:    vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3,4,5],ymm13[6,7]
; AVX-NEXT:    vmovaps 96(%r9), %xmm13
; AVX-NEXT:    vshufps {{.*#+}} xmm14 = xmm13[0,2,2,3]
; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm14, %ymm13
; AVX-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3,4,5,6],ymm13[7]
; AVX-NEXT:    vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload
; AVX-NEXT:    # ymm10 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm11[2,3],ymm10[2,3]
; AVX-NEXT:    vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
; AVX-NEXT:    # ymm10 = mem[2,3,2,3]
; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm10[2,1,3,3,6,5,7,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3,4,5],ymm10[6,7]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm10 = mem[2,3,2,3]
; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm10[0,2,2,3,4,6,6,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[2,3,4,5,6],ymm10[7]
; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT:    vmovaps %ymm2, 736(%rax)
; AVX-NEXT:    vmovaps %ymm12, 640(%rax)
; AVX-NEXT:    vmovaps %ymm5, 544(%rax)
; AVX-NEXT:    vmovaps %ymm9, 448(%rax)
; AVX-NEXT:    vmovaps %ymm8, 384(%rax)
; AVX-NEXT:    vmovaps %ymm6, 352(%rax)
; AVX-NEXT:    vmovaps %ymm7, 256(%rax)
; AVX-NEXT:    vmovaps %ymm4, 192(%rax)
; AVX-NEXT:    vmovaps %ymm3, 160(%rax)
; AVX-NEXT:    vmovaps %ymm1, 64(%rax)
; AVX-NEXT:    vmovaps %ymm0, (%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 704(%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 576(%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 512(%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 320(%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 128(%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 672(%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 608(%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 480(%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 416(%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 288(%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 224(%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 96(%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 32(%rax)
; AVX-NEXT:    addq $1032, %rsp # imm = 0x408
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
3622; AVX2-LABEL: store_i32_stride6_vf32:
3623; AVX2:       # %bb.0:
3624; AVX2-NEXT:    subq $904, %rsp # imm = 0x388
3625; AVX2-NEXT:    vmovdqa (%rsi), %xmm0
3626; AVX2-NEXT:    vmovdqa 32(%rsi), %xmm2
3627; AVX2-NEXT:    vmovdqa (%rdi), %xmm1
3628; AVX2-NEXT:    vmovdqa 32(%rdi), %xmm3
3629; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3630; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3631; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
3632; AVX2-NEXT:    vmovdqa (%rcx), %xmm5
3633; AVX2-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3634; AVX2-NEXT:    vmovdqa 32(%rcx), %xmm8
3635; AVX2-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3636; AVX2-NEXT:    vmovdqa 64(%rcx), %xmm7
3637; AVX2-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3638; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,3]
3639; AVX2-NEXT:    vmovdqa (%rdx), %xmm6
3640; AVX2-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3641; AVX2-NEXT:    vmovdqa 32(%rdx), %xmm9
3642; AVX2-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3643; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[1,2,2,3]
3644; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
3645; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1]
3646; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5],ymm5[6,7]
3647; AVX2-NEXT:    vmovdqa (%r8), %xmm11
3648; AVX2-NEXT:    vmovdqa 32(%r8), %xmm12
3649; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm11[0],zero,xmm11[1],zero
3650; AVX2-NEXT:    vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3651; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7]
3652; AVX2-NEXT:    vpbroadcastd 4(%r9), %ymm5
3653; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7]
3654; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3655; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm10 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
3656; AVX2-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3657; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm8[1,2,2,3]
3658; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm9[1,2,2,3]
3659; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
3660; AVX2-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm5
3661; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1]
3662; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7]
3663; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm12[0],zero,xmm12[1],zero
3664; AVX2-NEXT:    vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3665; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7]
3666; AVX2-NEXT:    vpbroadcastd 36(%r9), %ymm5
3667; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7]
3668; AVX2-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3669; AVX2-NEXT:    vmovdqa 64(%rdx), %xmm5
3670; AVX2-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3671; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm7[1,2,2,3]
3672; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,3]
3673; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
3674; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm4[0,1,2,1]
3675; AVX2-NEXT:    vmovdqa 64(%rsi), %xmm4
3676; AVX2-NEXT:    vmovdqa 64(%rdi), %xmm5
3677; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm7 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
3678; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3679; AVX2-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
3680; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7]
3681; AVX2-NEXT:    vmovdqa 64(%r8), %xmm15
3682; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm7 = xmm15[0],zero,xmm15[1],zero
3683; AVX2-NEXT:    vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3684; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7]
3685; AVX2-NEXT:    vpbroadcastd 68(%r9), %ymm7
3686; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3],ymm6[4,5,6,7]
3687; AVX2-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3688; AVX2-NEXT:    vmovdqa 96(%rcx), %xmm6
3689; AVX2-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3690; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[1,2,2,3]
3691; AVX2-NEXT:    vmovdqa 96(%rdx), %xmm7
3692; AVX2-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3693; AVX2-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[1,2,2,3]
3694; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
3695; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm6[0,1,2,1]
3696; AVX2-NEXT:    vmovdqa 96(%rsi), %xmm14
3697; AVX2-NEXT:    vmovdqa 96(%rdi), %xmm7
3698; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm6 = xmm7[2],xmm14[2],xmm7[3],xmm14[3]
3699; AVX2-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3700; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm9
3701; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7]
3702; AVX2-NEXT:    vmovdqa 96(%r8), %xmm6
3703; AVX2-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3704; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm9 = xmm6[0],zero,xmm6[1],zero
3705; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7]
3706; AVX2-NEXT:    vpbroadcastd 100(%r9), %ymm9
3707; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2],ymm9[3],ymm8[4,5,6,7]
3708; AVX2-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3709; AVX2-NEXT:    vpbroadcastd (%rcx), %xmm8
3710; AVX2-NEXT:    vpbroadcastd (%rdx), %xmm9
3711; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
3712; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3713; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
3714; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3],ymm0[4,5,6,7]
3715; AVX2-NEXT:    vpbroadcastq %xmm11, %ymm1
3716; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
3717; AVX2-NEXT:    vmovdqa (%r9), %xmm1
3718; AVX2-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3719; AVX2-NEXT:    vpbroadcastd %xmm1, %ymm1
3720; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
3721; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3722; AVX2-NEXT:    vmovdqa (%rdx), %ymm0
3723; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3724; AVX2-NEXT:    vmovdqa (%rcx), %ymm6
3725; AVX2-NEXT:    vpshufd {{.*#+}} ymm8 = ymm6[0,1,2,2,4,5,6,6]
3726; AVX2-NEXT:    vpshufd {{.*#+}} ymm9 = ymm0[1,1,2,3,5,5,6,7]
3727; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4],ymm8[5],ymm9[6],ymm8[7]
3728; AVX2-NEXT:    vpermq {{.*#+}} ymm10 = ymm8[2,1,2,3]
3729; AVX2-NEXT:    vmovdqa (%rdi), %ymm9
3730; AVX2-NEXT:    vmovdqa (%rsi), %ymm8
3731; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7]
3732; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3733; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm0[4,5],ymm10[6,7]
3734; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm11 = mem[0],zero,mem[1],zero
3735; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7]
3736; AVX2-NEXT:    vpbroadcastd 20(%r9), %ymm11
3737; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5,6,7]
3738; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3739; AVX2-NEXT:    vpbroadcastd 32(%rcx), %xmm10
3740; AVX2-NEXT:    vpbroadcastd 32(%rdx), %xmm11
3741; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
3742; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
3743; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1]
3744; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm10[2,3],ymm2[4,5,6,7]
3745; AVX2-NEXT:    vpbroadcastq %xmm12, %ymm3
3746; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7]
3747; AVX2-NEXT:    vmovdqa 32(%r9), %xmm0
3748; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3749; AVX2-NEXT:    vpbroadcastd %xmm0, %ymm3
3750; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
3751; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3752; AVX2-NEXT:    vmovdqa 32(%rdx), %ymm3
3753; AVX2-NEXT:    vmovdqa 32(%rcx), %ymm2
3754; AVX2-NEXT:    vpshufd {{.*#+}} ymm10 = ymm2[0,1,2,2,4,5,6,6]
3755; AVX2-NEXT:    vpshufd {{.*#+}} ymm11 = ymm3[1,1,2,3,5,5,6,7]
3756; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4],ymm10[5],ymm11[6],ymm10[7]
3757; AVX2-NEXT:    vpermq {{.*#+}} ymm12 = ymm10[2,1,2,3]
3758; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm11
3759; AVX2-NEXT:    vmovdqa 32(%rsi), %ymm10
3760; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[6],ymm10[6],ymm11[7],ymm10[7]
3761; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3762; AVX2-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm0[4,5],ymm12[6,7]
3763; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm13 = mem[0],zero,mem[1],zero
3764; AVX2-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7]
3765; AVX2-NEXT:    vpbroadcastd 52(%r9), %ymm13
3766; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm13[3],ymm12[4,5,6,7]
3767; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3768; AVX2-NEXT:    vpbroadcastd 64(%rcx), %xmm12
3769; AVX2-NEXT:    vpbroadcastd 64(%rdx), %xmm13
3770; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
3771; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
3772; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1]
3773; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm12[2,3],ymm4[4,5,6,7]
3774; AVX2-NEXT:    vpbroadcastq %xmm15, %ymm5
3775; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7]
3776; AVX2-NEXT:    vmovdqa 64(%r9), %xmm0
3777; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
3778; AVX2-NEXT:    vpbroadcastd %xmm0, %ymm5
3779; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7]
3780; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3781; AVX2-NEXT:    vmovdqa 64(%rdx), %ymm5
3782; AVX2-NEXT:    vmovdqa 64(%rcx), %ymm4
3783; AVX2-NEXT:    vpshufd {{.*#+}} ymm12 = ymm4[0,1,2,2,4,5,6,6]
3784; AVX2-NEXT:    vpshufd {{.*#+}} ymm13 = ymm5[1,1,2,3,5,5,6,7]
3785; AVX2-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4],ymm12[5],ymm13[6],ymm12[7]
3786; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm12[2,1,2,3]
3787; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm13
3788; AVX2-NEXT:    vmovdqa 64(%rsi), %ymm12
3789; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm15 = ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[6],ymm12[6],ymm13[7],ymm12[7]
3790; AVX2-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3791; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7]
3792; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero
3793; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7]
3794; AVX2-NEXT:    vpbroadcastd 84(%r9), %ymm15
3795; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3],ymm0[4,5,6,7]
3796; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3797; AVX2-NEXT:    vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3798; AVX2-NEXT:    vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
3799; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
3800; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1]
3801; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1]
3802; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3],ymm7[4,5,6,7]
3803; AVX2-NEXT:    vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload
3804; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5],ymm0[6,7]
3805; AVX2-NEXT:    vmovdqa 96(%r9), %xmm7
3806; AVX2-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3807; AVX2-NEXT:    vpbroadcastd %xmm7, %ymm7
3808; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7]
3809; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3810; AVX2-NEXT:    vmovdqa 96(%rdx), %ymm0
3811; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3812; AVX2-NEXT:    vmovdqa 96(%rcx), %ymm7
3813; AVX2-NEXT:    vpshufd {{.*#+}} ymm14 = ymm7[0,1,2,2,4,5,6,6]
3814; AVX2-NEXT:    vpshufd {{.*#+}} ymm15 = ymm0[1,1,2,3,5,5,6,7]
3815; AVX2-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7]
3816; AVX2-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
3817; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm1
3818; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3819; AVX2-NEXT:    vmovdqa 96(%rsi), %ymm0
3820; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3821; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm15 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
3822; AVX2-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3823; AVX2-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7]
3824; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero
3825; AVX2-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7]
3826; AVX2-NEXT:    vpbroadcastd 116(%r9), %ymm15
3827; AVX2-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7]
3828; AVX2-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3829; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3830; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
3831; AVX2-NEXT:    # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3]
3832; AVX2-NEXT:    vshufps {{.*#+}} xmm14 = xmm14[2,3,2,3]
3833; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
3834; AVX2-NEXT:    vinsertf128 $1, %xmm14, %ymm15, %ymm14
3835; AVX2-NEXT:    vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
3836; AVX2-NEXT:    # xmm15 = mem[2,2,3,3]
3837; AVX2-NEXT:    vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1]
3838; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5],ymm15[6,7]
3839; AVX2-NEXT:    vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
3840; AVX2-NEXT:    # xmm15 = mem[2,2,3,3]
3841; AVX2-NEXT:    vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1]
3842; AVX2-NEXT:    vblendps {{.*#+}} ymm15 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6],ymm15[7]
3843; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5]
3844; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3845; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm9 = ymm0[0],ymm6[0],ymm0[1],ymm6[1],ymm0[4],ymm6[4],ymm0[5],ymm6[5]
3846; AVX2-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2]
3847; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3]
3848; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7]
3849; AVX2-NEXT:    vmovdqa (%r8), %ymm9
3850; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7]
3851; AVX2-NEXT:    vpbroadcastd 16(%r9), %ymm14
3852; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm14[5],ymm8[6,7]
3853; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm1 = ymm0[2],ymm6[2],ymm0[3],ymm6[3],ymm0[6],ymm6[6],ymm0[7],ymm6[7]
3854; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
3855; AVX2-NEXT:    vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
3856; AVX2-NEXT:    # ymm1 = mem[2,3],ymm1[2,3]
3857; AVX2-NEXT:    vpshufd {{.*#+}} ymm6 = ymm9[2,1,3,3,6,5,7,7]
3858; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3]
3859; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3,4,5],ymm6[6,7]
3860; AVX2-NEXT:    vpshufd {{.*#+}} ymm6 = mem[0,2,2,3,4,6,6,7]
3861; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3]
3862; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm1[0],ymm6[1],ymm1[2,3,4,5,6],ymm6[7]
3863; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3864; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
3865; AVX2-NEXT:    # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
3866; AVX2-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
3867; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
3868; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm9, %ymm1
3869; AVX2-NEXT:    vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
3870; AVX2-NEXT:    # xmm9 = mem[2,2,3,3]
3871; AVX2-NEXT:    vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1]
3872; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3,4,5],ymm9[6,7]
3873; AVX2-NEXT:    vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
3874; AVX2-NEXT:    # xmm9 = mem[2,2,3,3]
3875; AVX2-NEXT:    vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1]
3876; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4,5,6],ymm9[7]
3877; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm9 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[4],ymm10[4],ymm11[5],ymm10[5]
3878; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm10 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
3879; AVX2-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2]
3880; AVX2-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3]
3881; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3],ymm9[4,5,6,7]
3882; AVX2-NEXT:    vmovdqa 32(%r8), %ymm10
3883; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5],ymm9[6,7]
3884; AVX2-NEXT:    vpbroadcastd 48(%r9), %ymm11
3885; AVX2-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm11[5],ymm9[6,7]
3886; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7]
3887; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7]
3888; AVX2-NEXT:    vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
3889; AVX2-NEXT:    # ymm2 = mem[2,3],ymm2[2,3]
3890; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm10[2,1,3,3,6,5,7,7]
3891; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3]
3892; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5],ymm3[6,7]
3893; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = mem[0,2,2,3,4,6,6,7]
3894; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3]
3895; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6],ymm3[7]
3896; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3897; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
3898; AVX2-NEXT:    # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
3899; AVX2-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3]
3900; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
3901; AVX2-NEXT:    vinsertf128 $1, %xmm3, %ymm10, %ymm3
3902; AVX2-NEXT:    vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
3903; AVX2-NEXT:    # xmm10 = mem[2,2,3,3]
3904; AVX2-NEXT:    vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1]
3905; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm10[0,1],ymm3[2,3,4,5],ymm10[6,7]
3906; AVX2-NEXT:    vpermilps $250, (%rsp), %xmm10 # 16-byte Folded Reload
3907; AVX2-NEXT:    # xmm10 = mem[2,2,3,3]
3908; AVX2-NEXT:    vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1]
3909; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0],ymm10[1],ymm3[2,3,4,5,6],ymm10[7]
3910; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm10 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[4],ymm12[4],ymm13[5],ymm12[5]
3911; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm11 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5]
3912; AVX2-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2]
3913; AVX2-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3]
3914; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7]
3915; AVX2-NEXT:    vmovdqa 64(%r8), %ymm11
3916; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7]
3917; AVX2-NEXT:    vpbroadcastd 80(%r9), %ymm12
3918; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5],ymm10[6,7]
3919; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm4 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7]
3920; AVX2-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7]
3921; AVX2-NEXT:    vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
3922; AVX2-NEXT:    # ymm4 = mem[2,3],ymm4[2,3]
3923; AVX2-NEXT:    vpshufd {{.*#+}} ymm5 = ymm11[2,1,3,3,6,5,7,7]
3924; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3]
3925; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5],ymm5[6,7]
3926; AVX2-NEXT:    vpshufd {{.*#+}} ymm5 = mem[0,2,2,3,4,6,6,7]
3927; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3]
3928; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6],ymm5[7]
3929; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
3930; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
3931; AVX2-NEXT:    # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3]
3932; AVX2-NEXT:    vshufps {{.*#+}} xmm5 = xmm5[2,3,2,3]
3933; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
3934; AVX2-NEXT:    vinsertf128 $1, %xmm5, %ymm11, %ymm5
3935; AVX2-NEXT:    vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
3936; AVX2-NEXT:    # xmm11 = mem[2,2,3,3]
3937; AVX2-NEXT:    vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1]
3938; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm11[0,1],ymm5[2,3,4,5],ymm11[6,7]
3939; AVX2-NEXT:    vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
3940; AVX2-NEXT:    # xmm11 = mem[2,2,3,3]
3941; AVX2-NEXT:    vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1]
3942; AVX2-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0],ymm11[1],ymm5[2,3,4,5,6],ymm11[7]
3943; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3944; AVX2-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
3945; AVX2-NEXT:    # ymm11 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
3946; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3947; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm12 = ymm0[0],ymm7[0],ymm0[1],ymm7[1],ymm0[4],ymm7[4],ymm0[5],ymm7[5]
3948; AVX2-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2]
3949; AVX2-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3]
3950; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3],ymm11[4,5,6,7]
3951; AVX2-NEXT:    vmovdqa 96(%r8), %ymm12
3952; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7]
3953; AVX2-NEXT:    vpbroadcastd 112(%r9), %ymm13
3954; AVX2-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5],ymm11[6,7]
3955; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm7[2],ymm0[3],ymm7[3],ymm0[6],ymm7[6],ymm0[7],ymm7[7]
3956; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
3957; AVX2-NEXT:    vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3958; AVX2-NEXT:    # ymm0 = mem[2,3],ymm0[2,3]
3959; AVX2-NEXT:    vpshufd {{.*#+}} ymm7 = ymm12[2,1,3,3,6,5,7,7]
3960; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3]
3961; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3,4,5],ymm7[6,7]
3962; AVX2-NEXT:    vpshufd {{.*#+}} ymm7 = mem[0,2,2,3,4,6,6,7]
3963; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3]
3964; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4,5,6],ymm7[7]
3965; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
3966; AVX2-NEXT:    vmovdqa %ymm0, 736(%rax)
3967; AVX2-NEXT:    vmovdqa %ymm11, 672(%rax)
3968; AVX2-NEXT:    vmovaps %ymm5, 640(%rax)
3969; AVX2-NEXT:    vmovdqa %ymm4, 544(%rax)
3970; AVX2-NEXT:    vmovdqa %ymm10, 480(%rax)
3971; AVX2-NEXT:    vmovaps %ymm3, 448(%rax)
3972; AVX2-NEXT:    vmovdqa %ymm2, 352(%rax)
3973; AVX2-NEXT:    vmovdqa %ymm9, 288(%rax)
3974; AVX2-NEXT:    vmovaps %ymm1, 256(%rax)
3975; AVX2-NEXT:    vmovdqa %ymm6, 160(%rax)
3976; AVX2-NEXT:    vmovdqa %ymm8, 96(%rax)
3977; AVX2-NEXT:    vmovaps %ymm15, 64(%rax)
3978; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3979; AVX2-NEXT:    vmovaps %ymm0, 704(%rax)
3980; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3981; AVX2-NEXT:    vmovaps %ymm0, 576(%rax)
3982; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3983; AVX2-NEXT:    vmovaps %ymm0, 512(%rax)
3984; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3985; AVX2-NEXT:    vmovaps %ymm0, 384(%rax)
3986; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3987; AVX2-NEXT:    vmovaps %ymm0, 320(%rax)
3988; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3989; AVX2-NEXT:    vmovaps %ymm0, 192(%rax)
3990; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3991; AVX2-NEXT:    vmovaps %ymm0, 128(%rax)
3992; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3993; AVX2-NEXT:    vmovaps %ymm0, (%rax)
3994; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3995; AVX2-NEXT:    vmovaps %ymm0, 608(%rax)
3996; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3997; AVX2-NEXT:    vmovaps %ymm0, 416(%rax)
3998; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3999; AVX2-NEXT:    vmovaps %ymm0, 224(%rax)
4000; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4001; AVX2-NEXT:    vmovaps %ymm0, 32(%rax)
4002; AVX2-NEXT:    addq $904, %rsp # imm = 0x388
4003; AVX2-NEXT:    vzeroupper
4004; AVX2-NEXT:    retq
4005;
4006; AVX2-FP-LABEL: store_i32_stride6_vf32:
4007; AVX2-FP:       # %bb.0:
4008; AVX2-FP-NEXT:    subq $904, %rsp # imm = 0x388
4009; AVX2-FP-NEXT:    vmovdqa (%rsi), %xmm0
4010; AVX2-FP-NEXT:    vmovdqa 32(%rsi), %xmm2
4011; AVX2-FP-NEXT:    vmovdqa (%rdi), %xmm1
4012; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %xmm3
4013; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
4014; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4015; AVX2-FP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
4016; AVX2-FP-NEXT:    vmovdqa (%rcx), %xmm5
4017; AVX2-FP-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4018; AVX2-FP-NEXT:    vmovdqa 32(%rcx), %xmm8
4019; AVX2-FP-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4020; AVX2-FP-NEXT:    vmovdqa 64(%rcx), %xmm7
4021; AVX2-FP-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4022; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,3]
4023; AVX2-FP-NEXT:    vmovdqa (%rdx), %xmm6
4024; AVX2-FP-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4025; AVX2-FP-NEXT:    vmovdqa 32(%rdx), %xmm9
4026; AVX2-FP-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4027; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[1,2,2,3]
4028; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
4029; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1]
4030; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5],ymm5[6,7]
4031; AVX2-FP-NEXT:    vmovdqa (%r8), %xmm11
4032; AVX2-FP-NEXT:    vmovdqa 32(%r8), %xmm12
4033; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm11[0],zero,xmm11[1],zero
4034; AVX2-FP-NEXT:    vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4035; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7]
4036; AVX2-FP-NEXT:    vpbroadcastd 4(%r9), %ymm5
4037; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7]
4038; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4039; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm10 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
4040; AVX2-FP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4041; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm8[1,2,2,3]
4042; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm9[1,2,2,3]
4043; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
4044; AVX2-FP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm5
4045; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1]
4046; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7]
4047; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm12[0],zero,xmm12[1],zero
4048; AVX2-FP-NEXT:    vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4049; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7]
4050; AVX2-FP-NEXT:    vpbroadcastd 36(%r9), %ymm5
4051; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7]
4052; AVX2-FP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4053; AVX2-FP-NEXT:    vmovdqa 64(%rdx), %xmm5
4054; AVX2-FP-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4055; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm7[1,2,2,3]
4056; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,3]
4057; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
4058; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm4[0,1,2,1]
4059; AVX2-FP-NEXT:    vmovdqa 64(%rsi), %xmm4
4060; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %xmm5
4061; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm7 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
4062; AVX2-FP-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4063; AVX2-FP-NEXT:    vinserti128 $1, %xmm7, %ymm0, %ymm7
4064; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7]
4065; AVX2-FP-NEXT:    vmovdqa 64(%r8), %xmm15
4066; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm7 = xmm15[0],zero,xmm15[1],zero
4067; AVX2-FP-NEXT:    vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4068; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7]
4069; AVX2-FP-NEXT:    vpbroadcastd 68(%r9), %ymm7
4070; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3],ymm6[4,5,6,7]
4071; AVX2-FP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4072; AVX2-FP-NEXT:    vmovdqa 96(%rcx), %xmm6
4073; AVX2-FP-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4074; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[1,2,2,3]
4075; AVX2-FP-NEXT:    vmovdqa 96(%rdx), %xmm7
4076; AVX2-FP-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4077; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[1,2,2,3]
4078; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
4079; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm8 = ymm6[0,1,2,1]
4080; AVX2-FP-NEXT:    vmovdqa 96(%rsi), %xmm14
4081; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %xmm7
4082; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm6 = xmm7[2],xmm14[2],xmm7[3],xmm14[3]
4083; AVX2-FP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4084; AVX2-FP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm9
4085; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7]
4086; AVX2-FP-NEXT:    vmovdqa 96(%r8), %xmm6
4087; AVX2-FP-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4088; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm9 = xmm6[0],zero,xmm6[1],zero
4089; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7]
4090; AVX2-FP-NEXT:    vpbroadcastd 100(%r9), %ymm9
4091; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm8[0,1,2],ymm9[3],ymm8[4,5,6,7]
4092; AVX2-FP-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4093; AVX2-FP-NEXT:    vpbroadcastd (%rcx), %xmm8
4094; AVX2-FP-NEXT:    vpbroadcastd (%rdx), %xmm9
4095; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
4096; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4097; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
4098; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3],ymm0[4,5,6,7]
4099; AVX2-FP-NEXT:    vpbroadcastq %xmm11, %ymm1
4100; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
4101; AVX2-FP-NEXT:    vmovdqa (%r9), %xmm1
4102; AVX2-FP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4103; AVX2-FP-NEXT:    vpbroadcastd %xmm1, %ymm1
4104; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
4105; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4106; AVX2-FP-NEXT:    vmovdqa (%rdx), %ymm0
4107; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4108; AVX2-FP-NEXT:    vmovdqa (%rcx), %ymm6
4109; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm6[0,1,2,2,4,5,6,6]
4110; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm0[1,1,2,3,5,5,6,7]
4111; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4],ymm8[5],ymm9[6],ymm8[7]
4112; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm10 = ymm8[2,1,2,3]
4113; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm9
4114; AVX2-FP-NEXT:    vmovdqa (%rsi), %ymm8
4115; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7]
4116; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4117; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm0[4,5],ymm10[6,7]
4118; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm11 = mem[0],zero,mem[1],zero
4119; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7]
4120; AVX2-FP-NEXT:    vpbroadcastd 20(%r9), %ymm11
4121; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5,6,7]
4122; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4123; AVX2-FP-NEXT:    vpbroadcastd 32(%rcx), %xmm10
4124; AVX2-FP-NEXT:    vpbroadcastd 32(%rdx), %xmm11
4125; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
4126; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
4127; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1]
4128; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm10[2,3],ymm2[4,5,6,7]
4129; AVX2-FP-NEXT:    vpbroadcastq %xmm12, %ymm3
4130; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7]
4131; AVX2-FP-NEXT:    vmovdqa 32(%r9), %xmm0
4132; AVX2-FP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4133; AVX2-FP-NEXT:    vpbroadcastd %xmm0, %ymm3
4134; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
4135; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4136; AVX2-FP-NEXT:    vmovdqa 32(%rdx), %ymm3
4137; AVX2-FP-NEXT:    vmovdqa 32(%rcx), %ymm2
4138; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm10 = ymm2[0,1,2,2,4,5,6,6]
4139; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm11 = ymm3[1,1,2,3,5,5,6,7]
4140; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4],ymm10[5],ymm11[6],ymm10[7]
4141; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm12 = ymm10[2,1,2,3]
4142; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm11
4143; AVX2-FP-NEXT:    vmovdqa 32(%rsi), %ymm10
4144; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[6],ymm10[6],ymm11[7],ymm10[7]
4145; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4146; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm0[4,5],ymm12[6,7]
4147; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm13 = mem[0],zero,mem[1],zero
4148; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7]
4149; AVX2-FP-NEXT:    vpbroadcastd 52(%r9), %ymm13
4150; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm13[3],ymm12[4,5,6,7]
4151; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4152; AVX2-FP-NEXT:    vpbroadcastd 64(%rcx), %xmm12
4153; AVX2-FP-NEXT:    vpbroadcastd 64(%rdx), %xmm13
4154; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
4155; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
4156; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1]
4157; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm12[2,3],ymm4[4,5,6,7]
4158; AVX2-FP-NEXT:    vpbroadcastq %xmm15, %ymm5
4159; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7]
4160; AVX2-FP-NEXT:    vmovdqa 64(%r9), %xmm0
4161; AVX2-FP-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
4162; AVX2-FP-NEXT:    vpbroadcastd %xmm0, %ymm5
4163; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7]
4164; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4165; AVX2-FP-NEXT:    vmovdqa 64(%rdx), %ymm5
4166; AVX2-FP-NEXT:    vmovdqa 64(%rcx), %ymm4
4167; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm4[0,1,2,2,4,5,6,6]
4168; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm5[1,1,2,3,5,5,6,7]
4169; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4],ymm12[5],ymm13[6],ymm12[7]
4170; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm12[2,1,2,3]
4171; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %ymm13
4172; AVX2-FP-NEXT:    vmovdqa 64(%rsi), %ymm12
4173; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm15 = ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[6],ymm12[6],ymm13[7],ymm12[7]
4174; AVX2-FP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4175; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7]
4176; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero
4177; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7]
4178; AVX2-FP-NEXT:    vpbroadcastd 84(%r9), %ymm15
4179; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3],ymm0[4,5,6,7]
4180; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4181; AVX2-FP-NEXT:    vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4182; AVX2-FP-NEXT:    vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
4183; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
4184; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1]
4185; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1]
4186; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3],ymm7[4,5,6,7]
4187; AVX2-FP-NEXT:    vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload
4188; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5],ymm0[6,7]
4189; AVX2-FP-NEXT:    vmovdqa 96(%r9), %xmm7
4190; AVX2-FP-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4191; AVX2-FP-NEXT:    vpbroadcastd %xmm7, %ymm7
4192; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7]
4193; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4194; AVX2-FP-NEXT:    vmovdqa 96(%rdx), %ymm0
4195; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4196; AVX2-FP-NEXT:    vmovdqa 96(%rcx), %ymm7
4197; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm7[0,1,2,2,4,5,6,6]
4198; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm0[1,1,2,3,5,5,6,7]
4199; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7]
4200; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
4201; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm1
4202; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4203; AVX2-FP-NEXT:    vmovdqa 96(%rsi), %ymm0
4204; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4205; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm15 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
4206; AVX2-FP-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4207; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7]
4208; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero
4209; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7]
4210; AVX2-FP-NEXT:    vpbroadcastd 116(%r9), %ymm15
4211; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7]
4212; AVX2-FP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4213; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
4214; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
4215; AVX2-FP-NEXT:    # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3]
4216; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm14 = xmm14[2,3,2,3]
4217; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
4218; AVX2-FP-NEXT:    vinsertf128 $1, %xmm14, %ymm15, %ymm14
4219; AVX2-FP-NEXT:    vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
4220; AVX2-FP-NEXT:    # xmm15 = mem[2,2,3,3]
4221; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1]
4222; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5],ymm15[6,7]
4223; AVX2-FP-NEXT:    vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
4224; AVX2-FP-NEXT:    # xmm15 = mem[2,2,3,3]
4225; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1]
4226; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm15 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6],ymm15[7]
4227; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5]
4228; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4229; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm9 = ymm0[0],ymm6[0],ymm0[1],ymm6[1],ymm0[4],ymm6[4],ymm0[5],ymm6[5]
4230; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2]
4231; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3]
4232; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7]
4233; AVX2-FP-NEXT:    vmovdqa (%r8), %ymm9
4234; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7]
4235; AVX2-FP-NEXT:    vpbroadcastd 16(%r9), %ymm14
4236; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm14[5],ymm8[6,7]
4237; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm1 = ymm0[2],ymm6[2],ymm0[3],ymm6[3],ymm0[6],ymm6[6],ymm0[7],ymm6[7]
4238; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
4239; AVX2-FP-NEXT:    vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
4240; AVX2-FP-NEXT:    # ymm1 = mem[2,3],ymm1[2,3]
4241; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm6 = ymm9[2,1,3,3,6,5,7,7]
4242; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3]
4243; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3,4,5],ymm6[6,7]
4244; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm6 = mem[0,2,2,3,4,6,6,7]
4245; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3]
4246; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm1[0],ymm6[1],ymm1[2,3,4,5,6],ymm6[7]
4247; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4248; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
4249; AVX2-FP-NEXT:    # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
4250; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
4251; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
4252; AVX2-FP-NEXT:    vinsertf128 $1, %xmm1, %ymm9, %ymm1
4253; AVX2-FP-NEXT:    vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
4254; AVX2-FP-NEXT:    # xmm9 = mem[2,2,3,3]
4255; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1]
4256; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3,4,5],ymm9[6,7]
4257; AVX2-FP-NEXT:    vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
4258; AVX2-FP-NEXT:    # xmm9 = mem[2,2,3,3]
4259; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1]
4260; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4,5,6],ymm9[7]
4261; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm9 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[4],ymm10[4],ymm11[5],ymm10[5]
4262; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm10 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
4263; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2]
4264; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3]
4265; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3],ymm9[4,5,6,7]
4266; AVX2-FP-NEXT:    vmovdqa 32(%r8), %ymm10
4267; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5],ymm9[6,7]
4268; AVX2-FP-NEXT:    vpbroadcastd 48(%r9), %ymm11
4269; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm11[5],ymm9[6,7]
4270; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7]
4271; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7]
4272; AVX2-FP-NEXT:    vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
4273; AVX2-FP-NEXT:    # ymm2 = mem[2,3],ymm2[2,3]
4274; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm10[2,1,3,3,6,5,7,7]
4275; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3]
4276; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5],ymm3[6,7]
4277; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm3 = mem[0,2,2,3,4,6,6,7]
4278; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3]
4279; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6],ymm3[7]
4280; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4281; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
4282; AVX2-FP-NEXT:    # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
4283; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3]
4284; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
4285; AVX2-FP-NEXT:    vinsertf128 $1, %xmm3, %ymm10, %ymm3
4286; AVX2-FP-NEXT:    vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
4287; AVX2-FP-NEXT:    # xmm10 = mem[2,2,3,3]
4288; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1]
4289; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm10[0,1],ymm3[2,3,4,5],ymm10[6,7]
4290; AVX2-FP-NEXT:    vpermilps $250, (%rsp), %xmm10 # 16-byte Folded Reload
4291; AVX2-FP-NEXT:    # xmm10 = mem[2,2,3,3]
4292; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1]
4293; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0],ymm10[1],ymm3[2,3,4,5,6],ymm10[7]
4294; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm10 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[4],ymm12[4],ymm13[5],ymm12[5]
4295; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm11 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5]
4296; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2]
4297; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3]
4298; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7]
4299; AVX2-FP-NEXT:    vmovdqa 64(%r8), %ymm11
4300; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7]
4301; AVX2-FP-NEXT:    vpbroadcastd 80(%r9), %ymm12
4302; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5],ymm10[6,7]
4303; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm4 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7]
4304; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7]
4305; AVX2-FP-NEXT:    vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
4306; AVX2-FP-NEXT:    # ymm4 = mem[2,3],ymm4[2,3]
4307; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm11[2,1,3,3,6,5,7,7]
4308; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3]
4309; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5],ymm5[6,7]
4310; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm5 = mem[0,2,2,3,4,6,6,7]
4311; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3]
4312; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6],ymm5[7]
4313; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4314; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
4315; AVX2-FP-NEXT:    # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3]
4316; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm5 = xmm5[2,3,2,3]
4317; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
4318; AVX2-FP-NEXT:    vinsertf128 $1, %xmm5, %ymm11, %ymm5
4319; AVX2-FP-NEXT:    vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
4320; AVX2-FP-NEXT:    # xmm11 = mem[2,2,3,3]
4321; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1]
4322; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm11[0,1],ymm5[2,3,4,5],ymm11[6,7]
4323; AVX2-FP-NEXT:    vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
4324; AVX2-FP-NEXT:    # xmm11 = mem[2,2,3,3]
4325; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1]
4326; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0],ymm11[1],ymm5[2,3,4,5,6],ymm11[7]
4327; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4328; AVX2-FP-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
4329; AVX2-FP-NEXT:    # ymm11 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
4330; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4331; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm12 = ymm0[0],ymm7[0],ymm0[1],ymm7[1],ymm0[4],ymm7[4],ymm0[5],ymm7[5]
4332; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2]
4333; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3]
4334; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3],ymm11[4,5,6,7]
4335; AVX2-FP-NEXT:    vmovdqa 96(%r8), %ymm12
4336; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7]
4337; AVX2-FP-NEXT:    vpbroadcastd 112(%r9), %ymm13
4338; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5],ymm11[6,7]
4339; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm7[2],ymm0[3],ymm7[3],ymm0[6],ymm7[6],ymm0[7],ymm7[7]
4340; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
4341; AVX2-FP-NEXT:    vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
4342; AVX2-FP-NEXT:    # ymm0 = mem[2,3],ymm0[2,3]
4343; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm7 = ymm12[2,1,3,3,6,5,7,7]
4344; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3]
4345; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3,4,5],ymm7[6,7]
4346; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm7 = mem[0,2,2,3,4,6,6,7]
4347; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3]
4348; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4,5,6],ymm7[7]
4349; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
4350; AVX2-FP-NEXT:    vmovdqa %ymm0, 736(%rax)
4351; AVX2-FP-NEXT:    vmovdqa %ymm11, 672(%rax)
4352; AVX2-FP-NEXT:    vmovaps %ymm5, 640(%rax)
4353; AVX2-FP-NEXT:    vmovdqa %ymm4, 544(%rax)
4354; AVX2-FP-NEXT:    vmovdqa %ymm10, 480(%rax)
4355; AVX2-FP-NEXT:    vmovaps %ymm3, 448(%rax)
4356; AVX2-FP-NEXT:    vmovdqa %ymm2, 352(%rax)
4357; AVX2-FP-NEXT:    vmovdqa %ymm9, 288(%rax)
4358; AVX2-FP-NEXT:    vmovaps %ymm1, 256(%rax)
4359; AVX2-FP-NEXT:    vmovdqa %ymm6, 160(%rax)
4360; AVX2-FP-NEXT:    vmovdqa %ymm8, 96(%rax)
4361; AVX2-FP-NEXT:    vmovaps %ymm15, 64(%rax)
4362; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4363; AVX2-FP-NEXT:    vmovaps %ymm0, 704(%rax)
4364; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4365; AVX2-FP-NEXT:    vmovaps %ymm0, 576(%rax)
4366; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4367; AVX2-FP-NEXT:    vmovaps %ymm0, 512(%rax)
4368; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4369; AVX2-FP-NEXT:    vmovaps %ymm0, 384(%rax)
4370; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4371; AVX2-FP-NEXT:    vmovaps %ymm0, 320(%rax)
4372; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4373; AVX2-FP-NEXT:    vmovaps %ymm0, 192(%rax)
4374; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4375; AVX2-FP-NEXT:    vmovaps %ymm0, 128(%rax)
4376; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4377; AVX2-FP-NEXT:    vmovaps %ymm0, (%rax)
4378; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4379; AVX2-FP-NEXT:    vmovaps %ymm0, 608(%rax)
4380; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4381; AVX2-FP-NEXT:    vmovaps %ymm0, 416(%rax)
4382; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4383; AVX2-FP-NEXT:    vmovaps %ymm0, 224(%rax)
4384; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4385; AVX2-FP-NEXT:    vmovaps %ymm0, 32(%rax)
4386; AVX2-FP-NEXT:    addq $904, %rsp # imm = 0x388
4387; AVX2-FP-NEXT:    vzeroupper
4388; AVX2-FP-NEXT:    retq
4389;
4390; AVX2-FCP-LABEL: store_i32_stride6_vf32:
4391; AVX2-FCP:       # %bb.0:
4392; AVX2-FCP-NEXT:    subq $872, %rsp # imm = 0x368
4393; AVX2-FCP-NEXT:    vmovdqa (%rsi), %xmm2
4394; AVX2-FCP-NEXT:    vmovdqa 32(%rsi), %xmm4
4395; AVX2-FCP-NEXT:    vmovdqa (%rdi), %xmm7
4396; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %xmm1
4397; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm7[2],xmm2[2],xmm7[3],xmm2[3]
4398; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4399; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
4400; AVX2-FCP-NEXT:    vmovdqa (%rcx), %xmm3
; AVX2-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT:    vmovdqa 32(%rcx), %xmm9
; AVX2-FCP-NEXT:    vmovdqa %xmm9, (%rsp) # 16-byte Spill
; AVX2-FCP-NEXT:    vmovdqa 64(%rcx), %xmm6
; AVX2-FCP-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3]
; AVX2-FCP-NEXT:    vmovdqa (%rdx), %xmm5
; AVX2-FCP-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT:    vmovdqa 32(%rdx), %xmm10
; AVX2-FCP-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,3]
; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7]
; AVX2-FCP-NEXT:    vmovdqa (%r8), %xmm8
; AVX2-FCP-NEXT:    vmovdqa 32(%r8), %xmm13
; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm8[0],zero,xmm8[1],zero
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT:    vpbroadcastd 4(%r9), %ymm3
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm9[1,2,2,3]
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm10[1,2,2,3]
; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; AVX2-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm3
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7]
; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm13[0],zero,xmm13[1],zero
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT:    vpbroadcastd 36(%r9), %ymm3
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovdqa 64(%rdx), %xmm3
; AVX2-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm6[1,2,2,3]
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3]
; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
; AVX2-FCP-NEXT:    vmovdqa 64(%rsi), %xmm6
; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %xmm9
; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm9[2],xmm6[2],xmm9[3],xmm6[3]
; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7]
; AVX2-FCP-NEXT:    vmovdqa 64(%r8), %xmm14
; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm14[0],zero,xmm14[1],zero
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT:    vpbroadcastd 68(%r9), %ymm3
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovdqa 96(%rcx), %xmm15
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm15[1,2,2,3]
; AVX2-FCP-NEXT:    vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT:    vmovdqa 96(%rdx), %xmm3
; AVX2-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3]
; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm0[0,1,2,1]
; AVX2-FCP-NEXT:    vmovdqa 96(%rsi), %xmm3
; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %xmm0
; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm11
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7]
; AVX2-FCP-NEXT:    vmovdqa 96(%r8), %xmm11
; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm12 = xmm11[0],zero,xmm11[1],zero
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7]
; AVX2-FCP-NEXT:    vpbroadcastd 100(%r9), %ymm12
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm10[0,1,2],ymm12[3],ymm10[4,5,6,7]
; AVX2-FCP-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vpbroadcastd (%rcx), %xmm10
; AVX2-FCP-NEXT:    vpbroadcastd (%rdx), %xmm12
; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm10[2,3],ymm2[4,5,6,7]
; AVX2-FCP-NEXT:    vpbroadcastq %xmm8, %ymm7
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5],ymm2[6,7]
; AVX2-FCP-NEXT:    vpbroadcastd (%r9), %ymm7
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5],ymm2[6,7]
; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovdqa (%rdx), %ymm7
; AVX2-FCP-NEXT:    vmovdqa (%rcx), %ymm5
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm5[0,1,2,2,4,5,6,6]
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm10 = ymm7[1,1,2,3,5,5,6,7]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3]
; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm2
; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovdqa (%rsi), %ymm10
; AVX2-FCP-NEXT:    vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm2[2],ymm10[2],ymm2[3],ymm10[3],ymm2[6],ymm10[6],ymm2[7],ymm10[7]
; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm2[4,5],ymm8[6,7]
; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm12 = mem[0],zero,mem[1],zero
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3],ymm8[4,5,6,7]
; AVX2-FCP-NEXT:    vpbroadcastd 20(%r9), %ymm12
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm12[3],ymm8[4,5,6,7]
; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vpbroadcastd 32(%rcx), %xmm8
; AVX2-FCP-NEXT:    vpbroadcastd 32(%rdx), %xmm12
; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm8 = xmm12[0],xmm8[0],xmm12[1],xmm8[1]
; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm8[2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT:    vpbroadcastq %xmm13, %ymm4
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7]
; AVX2-FCP-NEXT:    vpbroadcastd 32(%r9), %ymm4
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7]
; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovdqa 32(%rdx), %ymm10
; AVX2-FCP-NEXT:    vmovdqa 32(%rcx), %ymm8
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm8[0,1,2,2,4,5,6,6]
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm10[1,1,2,3,5,5,6,7]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2],ymm1[3],ymm4[4],ymm1[5],ymm4[6],ymm1[7]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovdqa 32(%rsi), %ymm4
; AVX2-FCP-NEXT:    vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7]
; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT:    vpbroadcastd 52(%r9), %ymm4
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vpbroadcastd 64(%rcx), %xmm1
; AVX2-FCP-NEXT:    vpbroadcastd 64(%rdx), %xmm4
; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm9[0],xmm6[0],xmm9[1],xmm6[1]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5,6,7]
; AVX2-FCP-NEXT:    vpbroadcastq %xmm14, %ymm4
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7]
; AVX2-FCP-NEXT:    vpbroadcastd 64(%r9), %ymm4
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7]
; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovdqa 64(%rdx), %ymm6
; AVX2-FCP-NEXT:    vmovdqa 64(%rcx), %ymm4
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm4[0,1,2,2,4,5,6,6]
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm6[1,1,2,3,5,5,6,7]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm9[0],ymm1[1],ymm9[2],ymm1[3],ymm9[4],ymm1[5],ymm9[6],ymm1[7]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %ymm2
; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovdqa 64(%rsi), %ymm12
; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm2[2],ymm12[2],ymm2[3],ymm12[3],ymm2[6],ymm12[6],ymm2[7],ymm12[7]
; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm9 = mem[0],zero,mem[1],zero
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT:    vpbroadcastd 84(%r9), %ymm9
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm9[3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vpbroadcastd %xmm15, %xmm1
; AVX2-FCP-NEXT:    vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1]
; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT:    vpbroadcastq %xmm11, %ymm1
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-FCP-NEXT:    vpbroadcastd 96(%r9), %ymm1
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovdqa 96(%rdx), %ymm3
; AVX2-FCP-NEXT:    vmovdqa 96(%rcx), %ymm2
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm2[0,1,2,2,4,5,6,6]
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm9 = ymm3[1,1,2,3,5,5,6,7]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2],ymm0[3],ymm9[4],ymm0[5],ymm9[6],ymm0[7]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm1
; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovdqa 96(%rsi), %ymm9
; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[6],ymm9[6],ymm1[7],ymm9[7]
; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm13 = mem[0],zero,mem[1],zero
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT:    vpbroadcastd 116(%r9), %ymm13
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT:    vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm13
; AVX2-FCP-NEXT:    vmovdqa (%r8), %ymm0
; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [2,2,3,3,2,2,3,3]
; AVX2-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm1, %ymm14
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5],ymm14[6,7]
; AVX2-FCP-NEXT:    vmovdqa (%r9), %ymm14
; AVX2-FCP-NEXT:    vpermd %ymm14, %ymm1, %ymm11
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm13[0],ymm11[1],ymm13[2,3,4,5,6],ymm11[7]
; AVX2-FCP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX2-FCP-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm11 = ymm11[0],mem[0],ymm11[1],mem[1],ymm11[4],mem[4],ymm11[5],mem[5]
; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} ymm13 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[4],ymm5[4],ymm7[5],ymm5[5]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm13[2,3],ymm11[4,5,6,7]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm0[4,5],ymm11[6,7]
; AVX2-FCP-NEXT:    vpbroadcastd 16(%r9), %ymm13
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5],ymm11[6,7]
; AVX2-FCP-NEXT:    vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm5 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7]
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7]
; AVX2-FCP-NEXT:    vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm5 = mem[2,3],ymm5[2,3]
; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm7 = [6,5,3,3,6,5,7,7]
; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm7, %ymm0
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4,5],ymm0[6,7]
; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm15 = [4,6,2,3,4,6,6,7]
; AVX2-FCP-NEXT:    vpermd %ymm14, %ymm15, %ymm5
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4,5,6],ymm5[7]
; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-FCP-NEXT:    vpunpckhdq (%rsp), %xmm5, %xmm5 # 16-byte Folded Reload
; AVX2-FCP-NEXT:    # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3]
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX2-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm11, %ymm5
; AVX2-FCP-NEXT:    vmovdqa 32(%r8), %ymm11
; AVX2-FCP-NEXT:    vpermd %ymm11, %ymm1, %ymm13
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm13[0,1],ymm5[2,3,4,5],ymm13[6,7]
; AVX2-FCP-NEXT:    vmovdqa 32(%r9), %ymm14
; AVX2-FCP-NEXT:    vpermd %ymm14, %ymm1, %ymm13
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0],ymm13[1],ymm5[2,3,4,5,6],ymm13[7]
; AVX2-FCP-NEXT:    vmovdqu %ymm5, (%rsp) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FCP-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm5 = ymm5[0],mem[0],ymm5[1],mem[1],ymm5[4],mem[4],ymm5[5],mem[5]
; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} ymm13 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[4],ymm8[4],ymm10[5],ymm8[5]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3],ymm5[4,5,6,7]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm11[4,5],ymm5[6,7]
; AVX2-FCP-NEXT:    vpbroadcastd 48(%r9), %ymm13
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3,4],ymm13[5],ymm5[6,7]
; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm5 = ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[6],ymm8[6],ymm10[7],ymm8[7]
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7]
; AVX2-FCP-NEXT:    vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm5 = mem[2,3],ymm5[2,3]
; AVX2-FCP-NEXT:    vpermd %ymm11, %ymm7, %ymm8
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3,4,5],ymm8[6,7]
; AVX2-FCP-NEXT:    vpermd %ymm14, %ymm15, %ymm8
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3,4,5,6],ymm8[7]
; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX2-FCP-NEXT:    vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
; AVX2-FCP-NEXT:    # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3]
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3]
; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX2-FCP-NEXT:    vinserti128 $1, %xmm8, %ymm10, %ymm8
; AVX2-FCP-NEXT:    vmovdqa 64(%r8), %ymm10
; AVX2-FCP-NEXT:    vpermd %ymm10, %ymm1, %ymm11
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3,4,5],ymm11[6,7]
; AVX2-FCP-NEXT:    vmovdqa 64(%r9), %ymm11
; AVX2-FCP-NEXT:    vpermd %ymm11, %ymm1, %ymm14
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0],ymm14[1],ymm8[2,3,4,5,6],ymm14[7]
; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} ymm12 = ymm14[0],ymm12[0],ymm14[1],ymm12[1],ymm14[4],ymm12[4],ymm14[5],ymm12[5]
; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} ymm14 = ymm6[0],ymm4[0],ymm6[1],ymm4[1],ymm6[4],ymm4[4],ymm6[5],ymm4[5]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm14[2,3],ymm12[4,5,6,7]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm10[4,5],ymm12[6,7]
; AVX2-FCP-NEXT:    vpbroadcastd 80(%r9), %ymm14
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5],ymm12[6,7]
; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm4 = ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[6],ymm4[6],ymm6[7],ymm4[7]
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7]
; AVX2-FCP-NEXT:    vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm4 = mem[2,3],ymm4[2,3]
; AVX2-FCP-NEXT:    vpermd %ymm10, %ymm7, %ymm6
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3,4,5],ymm6[6,7]
; AVX2-FCP-NEXT:    vpermd %ymm11, %ymm15, %ymm6
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4,5,6],ymm6[7]
; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX2-FCP-NEXT:    vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
; AVX2-FCP-NEXT:    # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3]
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX2-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm10, %ymm6
; AVX2-FCP-NEXT:    vmovdqa 96(%r8), %ymm10
; AVX2-FCP-NEXT:    vpermd %ymm10, %ymm1, %ymm11
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm11[0,1],ymm6[2,3,4,5],ymm11[6,7]
; AVX2-FCP-NEXT:    vmovdqa 96(%r9), %ymm11
; AVX2-FCP-NEXT:    vpermd %ymm11, %ymm1, %ymm1
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2,3,4,5,6],ymm1[7]
; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[4],ymm9[4],ymm6[5],ymm9[5]
; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} ymm9 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm9[2,3],ymm6[4,5,6,7]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5],ymm6[6,7]
; AVX2-FCP-NEXT:    vpbroadcastd 112(%r9), %ymm9
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm9[5],ymm6[6,7]
; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7]
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7]
; AVX2-FCP-NEXT:    vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm2 = mem[2,3],ymm2[2,3]
; AVX2-FCP-NEXT:    vpermd %ymm10, %ymm7, %ymm3
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5],ymm3[6,7]
; AVX2-FCP-NEXT:    vpermd %ymm11, %ymm15, %ymm3
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6],ymm3[7]
; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT:    vmovdqa %ymm2, 736(%rax)
; AVX2-FCP-NEXT:    vmovdqa %ymm6, 672(%rax)
; AVX2-FCP-NEXT:    vmovdqa %ymm1, 640(%rax)
; AVX2-FCP-NEXT:    vmovdqa %ymm4, 544(%rax)
; AVX2-FCP-NEXT:    vmovdqa %ymm12, 480(%rax)
; AVX2-FCP-NEXT:    vmovdqa %ymm8, 448(%rax)
; AVX2-FCP-NEXT:    vmovdqa %ymm5, 352(%rax)
; AVX2-FCP-NEXT:    vmovdqa %ymm13, 288(%rax)
; AVX2-FCP-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm1, 256(%rax)
; AVX2-FCP-NEXT:    vmovdqa %ymm0, 160(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 96(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 64(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 704(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 576(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 512(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 384(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 320(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 192(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 128(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, (%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 608(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 416(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 224(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 32(%rax)
; AVX2-FCP-NEXT:    addq $872, %rsp # imm = 0x368
; AVX2-FCP-NEXT:    vzeroupper
; AVX2-FCP-NEXT:    retq
;
; AVX512-LABEL: store_i32_stride6_vf32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    vmovdqa64 64(%rdi), %zmm5
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm17
; AVX512-NEXT:    vmovdqa64 (%rsi), %zmm18
; AVX512-NEXT:    vmovdqa64 64(%rsi), %zmm7
; AVX512-NEXT:    vmovdqa64 (%rdx), %zmm1
; AVX512-NEXT:    vmovdqa64 64(%rdx), %zmm0
; AVX512-NEXT:    vmovdqa64 (%rcx), %zmm6
; AVX512-NEXT:    vmovdqa64 64(%rcx), %zmm10
; AVX512-NEXT:    vmovdqa64 (%r8), %zmm3
; AVX512-NEXT:    vmovdqa64 64(%r8), %zmm8
; AVX512-NEXT:    vmovdqa64 (%r9), %zmm4
; AVX512-NEXT:    vmovdqa64 64(%r9), %zmm9
; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
; AVX512-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
; AVX512-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm11
; AVX512-NEXT:    vpermt2d %zmm7, %zmm12, %zmm11
; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
; AVX512-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm13
; AVX512-NEXT:    vpermt2d %zmm7, %zmm14, %zmm13
; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
; AVX512-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm15
; AVX512-NEXT:    vpermt2d %zmm7, %zmm16, %zmm15
; AVX512-NEXT:    vpermi2d %zmm18, %zmm17, %zmm12
; AVX512-NEXT:    vpermi2d %zmm18, %zmm17, %zmm14
; AVX512-NEXT:    vpermi2d %zmm18, %zmm17, %zmm16
; AVX512-NEXT:    vpunpckhdq {{.*#+}} zmm19 = zmm17[2],zmm18[2],zmm17[3],zmm18[3],zmm17[6],zmm18[6],zmm17[7],zmm18[7],zmm17[10],zmm18[10],zmm17[11],zmm18[11],zmm17[14],zmm18[14],zmm17[15],zmm18[15]
; AVX512-NEXT:    vpermt2d %zmm18, %zmm2, %zmm17
; AVX512-NEXT:    vmovdqa64 (%rdx), %ymm18
; AVX512-NEXT:    vmovdqa64 64(%rdx), %ymm20
; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm21 = [3,11,0,8,7,15,4,12]
; AVX512-NEXT:    vpermt2d (%rcx), %ymm21, %ymm18
; AVX512-NEXT:    movb $36, %dl
; AVX512-NEXT:    kmovw %edx, %k1
; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm18[0,1,0,1,2,3,6,7]
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15]
; AVX512-NEXT:    vpermt2d %zmm3, %zmm18, %zmm17
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm22 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15]
; AVX512-NEXT:    vpermt2d %zmm4, %zmm22, %zmm17
; AVX512-NEXT:    vpermi2d %zmm7, %zmm5, %zmm2
; AVX512-NEXT:    vpermt2d 64(%rcx), %ymm21, %ymm20
; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm20[0,1,0,1,2,3,6,7]
; AVX512-NEXT:    vpermt2d %zmm8, %zmm18, %zmm2
; AVX512-NEXT:    vpermt2d %zmm9, %zmm22, %zmm2
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28]
; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm20
; AVX512-NEXT:    vpermt2d %zmm10, %zmm18, %zmm20
; AVX512-NEXT:    vmovdqa64 %zmm20, %zmm11 {%k1}
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm20 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15]
; AVX512-NEXT:    vpermt2d %zmm8, %zmm20, %zmm11
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm21 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15]
; AVX512-NEXT:    vpermt2d %zmm9, %zmm21, %zmm11
; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm22 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26]
; AVX512-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm23
; AVX512-NEXT:    vpermt2d %zmm10, %zmm22, %zmm23
; AVX512-NEXT:    movb $-110, %cl
; AVX512-NEXT:    kmovw %ecx, %k2
; AVX512-NEXT:    vmovdqa64 %zmm23, %zmm13 {%k2}
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm23 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15]
; AVX512-NEXT:    vpermt2d %zmm8, %zmm23, %zmm13
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm24 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15]
; AVX512-NEXT:    vpermt2d %zmm9, %zmm24, %zmm13
; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
; AVX512-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm26
; AVX512-NEXT:    vpermt2d %zmm10, %zmm25, %zmm26
; AVX512-NEXT:    vmovdqa64 %zmm26, %zmm15 {%k2}
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm26 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15]
; AVX512-NEXT:    vpermt2d %zmm8, %zmm26, %zmm15
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm27 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15]
; AVX512-NEXT:    vpermt2d %zmm9, %zmm27, %zmm15
; AVX512-NEXT:    vpermi2d %zmm6, %zmm1, %zmm18
; AVX512-NEXT:    vmovdqa64 %zmm18, %zmm12 {%k1}
; AVX512-NEXT:    vpermt2d %zmm3, %zmm20, %zmm12
; AVX512-NEXT:    vpermt2d %zmm4, %zmm21, %zmm12
; AVX512-NEXT:    vpermi2d %zmm6, %zmm1, %zmm22
; AVX512-NEXT:    vmovdqa64 %zmm22, %zmm14 {%k2}
; AVX512-NEXT:    vpermt2d %zmm3, %zmm23, %zmm14
; AVX512-NEXT:    vpermt2d %zmm4, %zmm24, %zmm14
; AVX512-NEXT:    vpermi2d %zmm6, %zmm1, %zmm25
; AVX512-NEXT:    vmovdqa64 %zmm25, %zmm16 {%k2}
; AVX512-NEXT:    vpermt2d %zmm3, %zmm26, %zmm16
; AVX512-NEXT:    vpermt2d %zmm4, %zmm27, %zmm16
; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22]
; AVX512-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm20
; AVX512-NEXT:    vpermt2d %zmm10, %zmm18, %zmm20
; AVX512-NEXT:    vmovdqa64 (%rdi), %ymm21
; AVX512-NEXT:    vmovdqa64 64(%rdi), %ymm22
; AVX512-NEXT:    vpunpckhdq {{.*#+}} ymm22 = ymm22[2],mem[2],ymm22[3],mem[3],ymm22[6],mem[6],ymm22[7],mem[7]
; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm20 {%k1} = zmm22[2,3,2,3,2,3,2,3]
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm22 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0]
; AVX512-NEXT:    vpermt2d %zmm8, %zmm22, %zmm20
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm23 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23]
; AVX512-NEXT:    vpermt2d %zmm9, %zmm23, %zmm20
; AVX512-NEXT:    vpermi2d %zmm6, %zmm1, %zmm18
; AVX512-NEXT:    vpunpckhdq {{.*#+}} ymm21 = ymm21[2],mem[2],ymm21[3],mem[3],ymm21[6],mem[6],ymm21[7],mem[7]
; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm21[2,3,2,3,2,3,2,3]
; AVX512-NEXT:    vpermt2d %zmm3, %zmm22, %zmm18
; AVX512-NEXT:    vpermt2d %zmm4, %zmm23, %zmm18
; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30]
; AVX512-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT:    vpermt2d %zmm10, %zmm21, %zmm0
; AVX512-NEXT:    vpunpckhdq {{.*#+}} zmm5 = zmm5[2],zmm7[2],zmm5[3],zmm7[3],zmm5[6],zmm7[6],zmm5[7],zmm7[7],zmm5[10],zmm7[10],zmm5[11],zmm7[11],zmm5[14],zmm7[14],zmm5[15],zmm7[15]
; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm5[6,7,6,7,6,7,6,7]
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0]
; AVX512-NEXT:    vpermt2d %zmm8, %zmm5, %zmm0
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31]
; AVX512-NEXT:    vpermt2d %zmm9, %zmm7, %zmm0
; AVX512-NEXT:    vpermt2d %zmm6, %zmm21, %zmm1
; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm19[6,7,6,7,6,7,6,7]
; AVX512-NEXT:    vpermt2d %zmm3, %zmm5, %zmm1
; AVX512-NEXT:    vpermt2d %zmm4, %zmm7, %zmm1
; AVX512-NEXT:    vmovdqa64 %zmm16, (%rax)
; AVX512-NEXT:    vmovdqa64 %zmm14, 192(%rax)
; AVX512-NEXT:    vmovdqa64 %zmm1, 320(%rax)
; AVX512-NEXT:    vmovdqa64 %zmm12, 256(%rax)
; AVX512-NEXT:    vmovdqa64 %zmm15, 384(%rax)
; AVX512-NEXT:    vmovdqa64 %zmm13, 576(%rax)
; AVX512-NEXT:    vmovdqa64 %zmm0, 704(%rax)
; AVX512-NEXT:    vmovdqa64 %zmm11, 640(%rax)
; AVX512-NEXT:    vmovdqa64 %zmm18, 128(%rax)
; AVX512-NEXT:    vmovdqa64 %zmm2, 448(%rax)
; AVX512-NEXT:    vmovdqa64 %zmm20, 512(%rax)
; AVX512-NEXT:    vmovdqa64 %zmm17, 64(%rax)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512-FCP-LABEL: store_i32_stride6_vf32:
; AVX512-FCP:       # %bb.0:
; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm1
; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm0
; AVX512-FCP-NEXT:    vmovdqa64 (%rsi), %zmm11
; AVX512-FCP-NEXT:    vmovdqa64 64(%rsi), %zmm13
; AVX512-FCP-NEXT:    vmovdqa64 (%rdx), %zmm4
; AVX512-FCP-NEXT:    vmovdqa64 64(%rdx), %zmm7
; AVX512-FCP-NEXT:    vmovdqa64 (%rcx), %zmm18
; AVX512-FCP-NEXT:    vmovdqa64 64(%rcx), %zmm24
; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm26 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
; AVX512-FCP-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, %zmm14
; AVX512-FCP-NEXT:    vpermt2d %zmm18, %zmm26, %zmm14
; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
; AVX512-FCP-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2
; AVX512-FCP-NEXT:    vpermt2d %zmm11, %zmm19, %zmm2
; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31]
; AVX512-FCP-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm20
; AVX512-FCP-NEXT:    vpermt2d %zmm13, %zmm12, %zmm20
; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30]
; AVX512-FCP-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm3
; AVX512-FCP-NEXT:    vpermt2d %zmm24, %zmm5, %zmm3
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm17 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28]
; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm22
; AVX512-FCP-NEXT:    vpermt2d %zmm24, %zmm17, %zmm22
; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
; AVX512-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm6
; AVX512-FCP-NEXT:    vpermt2d %zmm13, %zmm8, %zmm6
; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26]
; AVX512-FCP-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm23
; AVX512-FCP-NEXT:    vpermt2d %zmm24, %zmm21, %zmm23
; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
; AVX512-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm9
; AVX512-FCP-NEXT:    vpermt2d %zmm13, %zmm10, %zmm9
; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22]
; AVX512-FCP-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm15
; AVX512-FCP-NEXT:    vpermt2d %zmm24, %zmm16, %zmm15
; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm25
; AVX512-FCP-NEXT:    vpermt2d %zmm24, %zmm26, %zmm7
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm26 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0]
; AVX512-FCP-NEXT:    vpermt2d %zmm24, %zmm26, %zmm25
; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm24 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23]
; AVX512-FCP-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT:    vpermi2d %zmm18, %zmm4, %zmm5
; AVX512-FCP-NEXT:    vpermi2d %zmm18, %zmm4, %zmm17
; AVX512-FCP-NEXT:    vpermi2d %zmm18, %zmm4, %zmm21
; AVX512-FCP-NEXT:    vpermi2d %zmm18, %zmm4, %zmm16
; AVX512-FCP-NEXT:    vpermt2d %zmm18, %zmm26, %zmm4
; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm26
; AVX512-FCP-NEXT:    vpermt2d %zmm13, %zmm24, %zmm26
; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm18
; AVX512-FCP-NEXT:    vpermt2d %zmm13, %zmm19, %zmm0
; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
; AVX512-FCP-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT:    vpermt2d %zmm13, %zmm19, %zmm18
; AVX512-FCP-NEXT:    vpermi2d %zmm11, %zmm1, %zmm12
; AVX512-FCP-NEXT:    vpermi2d %zmm11, %zmm1, %zmm8
; AVX512-FCP-NEXT:    vpermi2d %zmm11, %zmm1, %zmm10
; AVX512-FCP-NEXT:    vpermi2d %zmm11, %zmm1, %zmm24
; AVX512-FCP-NEXT:    vpermt2d %zmm11, %zmm19, %zmm1
; AVX512-FCP-NEXT:    movb $-110, %al
; AVX512-FCP-NEXT:    kmovw %eax, %k2
; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, %zmm2 {%k2}
; AVX512-FCP-NEXT:    vmovdqa64 (%r8), %zmm11
; AVX512-FCP-NEXT:    movb $36, %al
; AVX512-FCP-NEXT:    kmovw %eax, %k1
; AVX512-FCP-NEXT:    vmovdqa64 %zmm20, %zmm3 {%k1}
; AVX512-FCP-NEXT:    vmovdqa64 64(%r8), %zmm13
; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, %zmm6 {%k1}
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15]
; AVX512-FCP-NEXT:    vpermt2d %zmm11, %zmm14, %zmm2
; AVX512-FCP-NEXT:    vmovdqa64 %zmm23, %zmm9 {%k2}
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm19 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0]
; AVX512-FCP-NEXT:    vpermt2d %zmm13, %zmm19, %zmm3
; AVX512-FCP-NEXT:    vmovdqa64 %zmm26, %zmm15 {%k1}
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm20 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15]
; AVX512-FCP-NEXT:    vpermt2d %zmm13, %zmm20, %zmm6
; AVX512-FCP-NEXT:    vmovdqa64 %zmm25, %zmm18 {%k1}
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm22 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15]
; AVX512-FCP-NEXT:    vpermt2d %zmm13, %zmm22, %zmm9
; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm0 {%k2}
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0]
; AVX512-FCP-NEXT:    vpermt2d %zmm13, %zmm7, %zmm15
; AVX512-FCP-NEXT:    vpermt2d %zmm13, %zmm14, %zmm0
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm14 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15]
; AVX512-FCP-NEXT:    vpermt2d %zmm13, %zmm14, %zmm18
; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, %zmm5 {%k1}
; AVX512-FCP-NEXT:    vpermt2d %zmm11, %zmm19, %zmm5
; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, %zmm8 {%k1}
; AVX512-FCP-NEXT:    vpermt2d %zmm11, %zmm20, %zmm8
; AVX512-FCP-NEXT:    vmovdqa64 %zmm21, %zmm10 {%k2}
; AVX512-FCP-NEXT:    vmovdqa64 (%r9), %zmm12
; AVX512-FCP-NEXT:    vpermt2d %zmm11, %zmm22, %zmm10
; AVX512-FCP-NEXT:    vmovdqa64 64(%r9), %zmm13
; AVX512-FCP-NEXT:    vmovdqa64 %zmm24, %zmm16 {%k1}
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm17 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15]
; AVX512-FCP-NEXT:    vpermt2d %zmm12, %zmm17, %zmm2
; AVX512-FCP-NEXT:    vpermt2d %zmm11, %zmm7, %zmm16
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31]
; AVX512-FCP-NEXT:    vpermt2d %zmm13, %zmm7, %zmm3
; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, %zmm1 {%k1}
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15]
; AVX512-FCP-NEXT:    vpermt2d %zmm13, %zmm4, %zmm6
; AVX512-FCP-NEXT:    vpermt2d %zmm11, %zmm14, %zmm1
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm11 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15]
; AVX512-FCP-NEXT:    vpermt2d %zmm13, %zmm11, %zmm9
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm14 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23]
; AVX512-FCP-NEXT:    vpermt2d %zmm13, %zmm14, %zmm15
; AVX512-FCP-NEXT:    vpermt2d %zmm13, %zmm17, %zmm0
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm17 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15]
; AVX512-FCP-NEXT:    vpermt2d %zmm13, %zmm17, %zmm18
; AVX512-FCP-NEXT:    vpermt2d %zmm12, %zmm7, %zmm5
; AVX512-FCP-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
; AVX512-FCP-NEXT:    vpermt2d %zmm12, %zmm11, %zmm10
; AVX512-FCP-NEXT:    vpermt2d %zmm12, %zmm14, %zmm16
; AVX512-FCP-NEXT:    vpermt2d %zmm12, %zmm17, %zmm1
; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, 64(%rax)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, 128(%rax)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, 192(%rax)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, 256(%rax)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, 320(%rax)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 384(%rax)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm18, 448(%rax)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, 512(%rax)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, 576(%rax)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, 640(%rax)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, 704(%rax)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, (%rax)
; AVX512-FCP-NEXT:    vzeroupper
; AVX512-FCP-NEXT:    retq
;
; AVX512DQ-LABEL: store_i32_stride6_vf32:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT:    vmovdqa64 64(%rdi), %zmm5
; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %zmm17
; AVX512DQ-NEXT:    vmovdqa64 (%rsi), %zmm18
; AVX512DQ-NEXT:    vmovdqa64 64(%rsi), %zmm7
; AVX512DQ-NEXT:    vmovdqa64 (%rdx), %zmm1
; AVX512DQ-NEXT:    vmovdqa64 64(%rdx), %zmm0
; AVX512DQ-NEXT:    vmovdqa64 (%rcx), %zmm6
; AVX512DQ-NEXT:    vmovdqa64 64(%rcx), %zmm10
; AVX512DQ-NEXT:    vmovdqa64 (%r8), %zmm3
; AVX512DQ-NEXT:    vmovdqa64 64(%r8), %zmm8
; AVX512DQ-NEXT:    vmovdqa64 (%r9), %zmm4
; AVX512DQ-NEXT:    vmovdqa64 64(%r9), %zmm9
; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
; AVX512DQ-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
; AVX512DQ-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm11
; AVX512DQ-NEXT:    vpermt2d %zmm7, %zmm12, %zmm11
; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
; AVX512DQ-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm13
; AVX512DQ-NEXT:    vpermt2d %zmm7, %zmm14, %zmm13
; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
; AVX512DQ-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm15
; AVX512DQ-NEXT:    vpermt2d %zmm7, %zmm16, %zmm15
; AVX512DQ-NEXT:    vpermi2d %zmm18, %zmm17, %zmm12
; AVX512DQ-NEXT:    vpermi2d %zmm18, %zmm17, %zmm14
; AVX512DQ-NEXT:    vpermi2d %zmm18, %zmm17, %zmm16
; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} zmm19 = zmm17[2],zmm18[2],zmm17[3],zmm18[3],zmm17[6],zmm18[6],zmm17[7],zmm18[7],zmm17[10],zmm18[10],zmm17[11],zmm18[11],zmm17[14],zmm18[14],zmm17[15],zmm18[15]
; AVX512DQ-NEXT:    vpermt2d %zmm18, %zmm2, %zmm17
; AVX512DQ-NEXT:    vmovdqa64 (%rdx), %ymm18
; AVX512DQ-NEXT:    vmovdqa64 64(%rdx), %ymm20
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm21 = [3,11,0,8,7,15,4,12]
; AVX512DQ-NEXT:    vpermt2d (%rcx), %ymm21, %ymm18
; AVX512DQ-NEXT:    movb $36, %dl
; AVX512DQ-NEXT:    kmovw %edx, %k1
; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm18[0,1,0,1,2,3,6,7]
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15]
; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm18, %zmm17
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm22 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15]
; AVX512DQ-NEXT:    vpermt2d %zmm4, %zmm22, %zmm17
; AVX512DQ-NEXT:    vpermi2d %zmm7, %zmm5, %zmm2
; AVX512DQ-NEXT:    vpermt2d 64(%rcx), %ymm21, %ymm20
; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm20[0,1,0,1,2,3,6,7]
; AVX512DQ-NEXT:    vpermt2d %zmm8, %zmm18, %zmm2
; AVX512DQ-NEXT:    vpermt2d %zmm9, %zmm22, %zmm2
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28]
; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm20
; AVX512DQ-NEXT:    vpermt2d %zmm10, %zmm18, %zmm20
; AVX512DQ-NEXT:    vmovdqa64 %zmm20, %zmm11 {%k1}
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm20 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15]
; AVX512DQ-NEXT:    vpermt2d %zmm8, %zmm20, %zmm11
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm21 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15]
; AVX512DQ-NEXT:    vpermt2d %zmm9, %zmm21, %zmm11
; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm22 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26]
; AVX512DQ-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm23
; AVX512DQ-NEXT:    vpermt2d %zmm10, %zmm22, %zmm23
; AVX512DQ-NEXT:    movb $-110, %cl
; AVX512DQ-NEXT:    kmovw %ecx, %k2
; AVX512DQ-NEXT:    vmovdqa64 %zmm23, %zmm13 {%k2}
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm23 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15]
; AVX512DQ-NEXT:    vpermt2d %zmm8, %zmm23, %zmm13
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm24 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15]
; AVX512DQ-NEXT:    vpermt2d %zmm9, %zmm24, %zmm13
; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
; AVX512DQ-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm26
; AVX512DQ-NEXT:    vpermt2d %zmm10, %zmm25, %zmm26
; AVX512DQ-NEXT:    vmovdqa64 %zmm26, %zmm15 {%k2}
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm26 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15]
; AVX512DQ-NEXT:    vpermt2d %zmm8, %zmm26, %zmm15
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm27 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15]
; AVX512DQ-NEXT:    vpermt2d %zmm9, %zmm27, %zmm15
; AVX512DQ-NEXT:    vpermi2d %zmm6, %zmm1, %zmm18
; AVX512DQ-NEXT:    vmovdqa64 %zmm18, %zmm12 {%k1}
; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm20, %zmm12
; AVX512DQ-NEXT:    vpermt2d %zmm4, %zmm21, %zmm12
; AVX512DQ-NEXT:    vpermi2d %zmm6, %zmm1, %zmm22
; AVX512DQ-NEXT:    vmovdqa64 %zmm22, %zmm14 {%k2}
; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm23, %zmm14
; AVX512DQ-NEXT:    vpermt2d %zmm4, %zmm24, %zmm14
; AVX512DQ-NEXT:    vpermi2d %zmm6, %zmm1, %zmm25
; AVX512DQ-NEXT:    vmovdqa64 %zmm25, %zmm16 {%k2}
; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm26, %zmm16
; AVX512DQ-NEXT:    vpermt2d %zmm4, %zmm27, %zmm16
; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22]
; AVX512DQ-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm20
; AVX512DQ-NEXT:    vpermt2d %zmm10, %zmm18, %zmm20
; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %ymm21
; AVX512DQ-NEXT:    vmovdqa64 64(%rdi), %ymm22
; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} ymm22 = ymm22[2],mem[2],ymm22[3],mem[3],ymm22[6],mem[6],ymm22[7],mem[7]
; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm20 {%k1} = zmm22[2,3,2,3,2,3,2,3]
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm22 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0]
; AVX512DQ-NEXT:    vpermt2d %zmm8, %zmm22, %zmm20
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm23 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23]
; AVX512DQ-NEXT:    vpermt2d %zmm9, %zmm23, %zmm20
; AVX512DQ-NEXT:    vpermi2d %zmm6, %zmm1, %zmm18
; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} ymm21 = ymm21[2],mem[2],ymm21[3],mem[3],ymm21[6],mem[6],ymm21[7],mem[7]
; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm21[2,3,2,3,2,3,2,3]
; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm22, %zmm18
; AVX512DQ-NEXT:    vpermt2d %zmm4, %zmm23, %zmm18
; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30]
; AVX512DQ-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT:    vpermt2d %zmm10, %zmm21, %zmm0
; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} zmm5 = zmm5[2],zmm7[2],zmm5[3],zmm7[3],zmm5[6],zmm7[6],zmm5[7],zmm7[7],zmm5[10],zmm7[10],zmm5[11],zmm7[11],zmm5[14],zmm7[14],zmm5[15],zmm7[15]
; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm5[6,7,6,7,6,7,6,7]
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0]
; AVX512DQ-NEXT:    vpermt2d %zmm8, %zmm5, %zmm0
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31]
; AVX512DQ-NEXT:    vpermt2d %zmm9, %zmm7, %zmm0
; AVX512DQ-NEXT:    vpermt2d %zmm6, %zmm21, %zmm1
; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm19[6,7,6,7,6,7,6,7]
; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm5, %zmm1
; AVX512DQ-NEXT:    vpermt2d %zmm4, %zmm7, %zmm1
; AVX512DQ-NEXT:    vmovdqa64 %zmm16, (%rax)
; AVX512DQ-NEXT:    vmovdqa64 %zmm14, 192(%rax)
; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 320(%rax)
; AVX512DQ-NEXT:    vmovdqa64 %zmm12, 256(%rax)
; AVX512DQ-NEXT:    vmovdqa64 %zmm15, 384(%rax)
; AVX512DQ-NEXT:    vmovdqa64 %zmm13, 576(%rax)
; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 704(%rax)
; AVX512DQ-NEXT:    vmovdqa64 %zmm11, 640(%rax)
; AVX512DQ-NEXT:    vmovdqa64 %zmm18, 128(%rax)
; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 448(%rax)
; AVX512DQ-NEXT:    vmovdqa64 %zmm20, 512(%rax)
; AVX512DQ-NEXT:    vmovdqa64 %zmm17, 64(%rax)
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512DQ-FCP-LABEL: store_i32_stride6_vf32:
; AVX512DQ-FCP:       # %bb.0:
; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm1
; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm0
; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rsi), %zmm11
; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rsi), %zmm13
; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdx), %zmm4
; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdx), %zmm7
; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rcx), %zmm18
; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rcx), %zmm24
; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm26 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
; AVX512DQ-FCP-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, %zmm14
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm18, %zmm26, %zmm14
; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
; AVX512DQ-FCP-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm11, %zmm19, %zmm2
; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31]
; AVX512DQ-FCP-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm20
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm13, %zmm12, %zmm20
; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30]
; AVX512DQ-FCP-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm3
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm24, %zmm5, %zmm3
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm17 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28]
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm22
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm24, %zmm17, %zmm22
; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
; AVX512DQ-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm6
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm13, %zmm8, %zmm6
; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26]
; AVX512DQ-FCP-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm23
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm24, %zmm21, %zmm23
; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
; AVX512DQ-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm9
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm13, %zmm10, %zmm9
; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22]
; AVX512DQ-FCP-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm15
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm24, %zmm16, %zmm15
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm25
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm24, %zmm26, %zmm7
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm26 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0]
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm24, %zmm26, %zmm25
; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm24 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23]
; AVX512DQ-FCP-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm18, %zmm4, %zmm5
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm18, %zmm4, %zmm17
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm18, %zmm4, %zmm21
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm18, %zmm4, %zmm16
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm18, %zmm26, %zmm4
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm26
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm13, %zmm24, %zmm26
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm18
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm13, %zmm19, %zmm0
; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
; AVX512DQ-FCP-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm13, %zmm19, %zmm18
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm11, %zmm1, %zmm12
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm11, %zmm1, %zmm8
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm11, %zmm1, %zmm10
; AVX512DQ-FCP-NEXT:    vpermi2d %zmm11, %zmm1, %zmm24
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm11, %zmm19, %zmm1
; AVX512DQ-FCP-NEXT:    movb $-110, %al
; AVX512DQ-FCP-NEXT:    kmovw %eax, %k2
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, %zmm2 {%k2}
; AVX512DQ-FCP-NEXT:    vmovdqa64 (%r8), %zmm11
; AVX512DQ-FCP-NEXT:    movb $36, %al
; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm20, %zmm3 {%k1}
; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%r8), %zmm13
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, %zmm6 {%k1}
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15]
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm11, %zmm14, %zmm2
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm23, %zmm9 {%k2}
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm19 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0]
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm13, %zmm19, %zmm3
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm26, %zmm15 {%k1}
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm20 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15]
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm13, %zmm20, %zmm6
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm25, %zmm18 {%k1}
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm22 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15]
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm13, %zmm22, %zmm9
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm0 {%k2}
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0]
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm13, %zmm7, %zmm15
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm13, %zmm14, %zmm0
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm14 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15]
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm13, %zmm14, %zmm18
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, %zmm5 {%k1}
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm11, %zmm19, %zmm5
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, %zmm8 {%k1}
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm11, %zmm20, %zmm8
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, %zmm10 {%k2}
; AVX512DQ-FCP-NEXT:    vmovdqa64 (%r9), %zmm12
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm11, %zmm22, %zmm10
; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%r9), %zmm13
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm24, %zmm16 {%k1}
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm17 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15]
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm12, %zmm17, %zmm2
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm11, %zmm7, %zmm16
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31]
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm13, %zmm7, %zmm3
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, %zmm1 {%k1}
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15]
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm13, %zmm4, %zmm6
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm11, %zmm14, %zmm1
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm11 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15]
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm13, %zmm11, %zmm9
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm14 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23]
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm13, %zmm14, %zmm15
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm13, %zmm17, %zmm0
; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm17 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15]
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm13, %zmm17, %zmm18
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm12, %zmm7, %zmm5
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm12, %zmm11, %zmm10
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm12, %zmm14, %zmm16
; AVX512DQ-FCP-NEXT:    vpermt2d %zmm12, %zmm17, %zmm1
; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 64(%rax)
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, 128(%rax)
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, 192(%rax)
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, 256(%rax)
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, 320(%rax)
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, 384(%rax)
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm18, 448(%rax)
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, 512(%rax)
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, 576(%rax)
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, 640(%rax)
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, 704(%rax)
; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, (%rax)
; AVX512DQ-FCP-NEXT:    vzeroupper
; AVX512DQ-FCP-NEXT:    retq
;
; AVX512BW-LABEL: store_i32_stride6_vf32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm5
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm17
; AVX512BW-NEXT:    vmovdqa64 (%rsi), %zmm18
; AVX512BW-NEXT:    vmovdqa64 64(%rsi), %zmm7
; AVX512BW-NEXT:    vmovdqa64 (%rdx), %zmm1
; AVX512BW-NEXT:    vmovdqa64 64(%rdx), %zmm0
; AVX512BW-NEXT:    vmovdqa64 (%rcx), %zmm6
; AVX512BW-NEXT:    vmovdqa64 64(%rcx), %zmm10
; AVX512BW-NEXT:    vmovdqa64 (%r8), %zmm3
; AVX512BW-NEXT:    vmovdqa64 64(%r8), %zmm8
; AVX512BW-NEXT:    vmovdqa64 (%r9), %zmm4
; AVX512BW-NEXT:    vmovdqa64 64(%r9), %zmm9
; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
; AVX512BW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
; AVX512BW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm11
; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm12, %zmm11
; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
; AVX512BW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm13
; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm14, %zmm13
; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
; AVX512BW-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT:    vmovdqa64 %zmm5, %zmm15
; AVX512BW-NEXT:    vpermt2d %zmm7, %zmm16, %zmm15
; AVX512BW-NEXT:    vpermi2d %zmm18, %zmm17, %zmm12
; AVX512BW-NEXT:    vpermi2d %zmm18, %zmm17, %zmm14
; AVX512BW-NEXT:    vpermi2d %zmm18, %zmm17, %zmm16
; AVX512BW-NEXT:    vpunpckhdq {{.*#+}} zmm19 = zmm17[2],zmm18[2],zmm17[3],zmm18[3],zmm17[6],zmm18[6],zmm17[7],zmm18[7],zmm17[10],zmm18[10],zmm17[11],zmm18[11],zmm17[14],zmm18[14],zmm17[15],zmm18[15]
; AVX512BW-NEXT:    vpermt2d %zmm18, %zmm2, %zmm17
; AVX512BW-NEXT:    vmovdqa64 (%rdx), %ymm18
; AVX512BW-NEXT:    vmovdqa64 64(%rdx), %ymm20
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm21 = [3,11,0,8,7,15,4,12]
; AVX512BW-NEXT:    vpermt2d (%rcx), %ymm21, %ymm18
; AVX512BW-NEXT:    movb $36, %dl
; AVX512BW-NEXT:    kmovd %edx, %k1
; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm18[0,1,0,1,2,3,6,7]
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15]
; AVX512BW-NEXT:    vpermt2d %zmm3, %zmm18, %zmm17
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm22 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15]
; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm22, %zmm17
; AVX512BW-NEXT:    vpermi2d %zmm7, %zmm5, %zmm2
; AVX512BW-NEXT:    vpermt2d 64(%rcx), %ymm21, %ymm20
; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm20[0,1,0,1,2,3,6,7]
; AVX512BW-NEXT:    vpermt2d %zmm8, %zmm18, %zmm2
; AVX512BW-NEXT:    vpermt2d %zmm9, %zmm22, %zmm2
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28]
; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm20
; AVX512BW-NEXT:    vpermt2d %zmm10, %zmm18, %zmm20
; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm11 {%k1}
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm20 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15]
; AVX512BW-NEXT:    vpermt2d %zmm8, %zmm20, %zmm11
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm21 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15]
; AVX512BW-NEXT:    vpermt2d %zmm9, %zmm21, %zmm11
; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm22 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26]
; AVX512BW-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm23
; AVX512BW-NEXT:    vpermt2d %zmm10, %zmm22, %zmm23
; AVX512BW-NEXT:    movb $-110, %cl
; AVX512BW-NEXT:    kmovd %ecx, %k2
; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm13 {%k2}
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm23 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15]
; AVX512BW-NEXT:    vpermt2d %zmm8, %zmm23, %zmm13
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm24 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15]
; AVX512BW-NEXT:    vpermt2d %zmm9, %zmm24, %zmm13
; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
; AVX512BW-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm26
; AVX512BW-NEXT:    vpermt2d %zmm10, %zmm25, %zmm26
; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm15 {%k2}
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm26 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15]
; AVX512BW-NEXT:    vpermt2d %zmm8, %zmm26, %zmm15
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm27 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15]
; AVX512BW-NEXT:    vpermt2d %zmm9, %zmm27, %zmm15
; AVX512BW-NEXT:    vpermi2d %zmm6, %zmm1, %zmm18
5385; AVX512BW-NEXT:    vmovdqa64 %zmm18, %zmm12 {%k1}
5386; AVX512BW-NEXT:    vpermt2d %zmm3, %zmm20, %zmm12
5387; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm21, %zmm12
5388; AVX512BW-NEXT:    vpermi2d %zmm6, %zmm1, %zmm22
5389; AVX512BW-NEXT:    vmovdqa64 %zmm22, %zmm14 {%k2}
5390; AVX512BW-NEXT:    vpermt2d %zmm3, %zmm23, %zmm14
5391; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm24, %zmm14
5392; AVX512BW-NEXT:    vpermi2d %zmm6, %zmm1, %zmm25
5393; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm16 {%k2}
5394; AVX512BW-NEXT:    vpermt2d %zmm3, %zmm26, %zmm16
5395; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm27, %zmm16
5396; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22]
5397; AVX512BW-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
5398; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm20
5399; AVX512BW-NEXT:    vpermt2d %zmm10, %zmm18, %zmm20
5400; AVX512BW-NEXT:    vmovdqa64 (%rdi), %ymm21
5401; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %ymm22
5402; AVX512BW-NEXT:    vpunpckhdq {{.*#+}} ymm22 = ymm22[2],mem[2],ymm22[3],mem[3],ymm22[6],mem[6],ymm22[7],mem[7]
5403; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm20 {%k1} = zmm22[2,3,2,3,2,3,2,3]
5404; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm22 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0]
5405; AVX512BW-NEXT:    vpermt2d %zmm8, %zmm22, %zmm20
5406; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm23 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23]
5407; AVX512BW-NEXT:    vpermt2d %zmm9, %zmm23, %zmm20
5408; AVX512BW-NEXT:    vpermi2d %zmm6, %zmm1, %zmm18
5409; AVX512BW-NEXT:    vpunpckhdq {{.*#+}} ymm21 = ymm21[2],mem[2],ymm21[3],mem[3],ymm21[6],mem[6],ymm21[7],mem[7]
5410; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm21[2,3,2,3,2,3,2,3]
5411; AVX512BW-NEXT:    vpermt2d %zmm3, %zmm22, %zmm18
5412; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm23, %zmm18
5413; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30]
5414; AVX512BW-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
5415; AVX512BW-NEXT:    vpermt2d %zmm10, %zmm21, %zmm0
5416; AVX512BW-NEXT:    vpunpckhdq {{.*#+}} zmm5 = zmm5[2],zmm7[2],zmm5[3],zmm7[3],zmm5[6],zmm7[6],zmm5[7],zmm7[7],zmm5[10],zmm7[10],zmm5[11],zmm7[11],zmm5[14],zmm7[14],zmm5[15],zmm7[15]
5417; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm5[6,7,6,7,6,7,6,7]
5418; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0]
5419; AVX512BW-NEXT:    vpermt2d %zmm8, %zmm5, %zmm0
5420; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31]
5421; AVX512BW-NEXT:    vpermt2d %zmm9, %zmm7, %zmm0
5422; AVX512BW-NEXT:    vpermt2d %zmm6, %zmm21, %zmm1
5423; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm19[6,7,6,7,6,7,6,7]
5424; AVX512BW-NEXT:    vpermt2d %zmm3, %zmm5, %zmm1
5425; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm7, %zmm1
5426; AVX512BW-NEXT:    vmovdqa64 %zmm16, (%rax)
5427; AVX512BW-NEXT:    vmovdqa64 %zmm14, 192(%rax)
5428; AVX512BW-NEXT:    vmovdqa64 %zmm1, 320(%rax)
5429; AVX512BW-NEXT:    vmovdqa64 %zmm12, 256(%rax)
5430; AVX512BW-NEXT:    vmovdqa64 %zmm15, 384(%rax)
5431; AVX512BW-NEXT:    vmovdqa64 %zmm13, 576(%rax)
5432; AVX512BW-NEXT:    vmovdqa64 %zmm0, 704(%rax)
5433; AVX512BW-NEXT:    vmovdqa64 %zmm11, 640(%rax)
5434; AVX512BW-NEXT:    vmovdqa64 %zmm18, 128(%rax)
5435; AVX512BW-NEXT:    vmovdqa64 %zmm2, 448(%rax)
5436; AVX512BW-NEXT:    vmovdqa64 %zmm20, 512(%rax)
5437; AVX512BW-NEXT:    vmovdqa64 %zmm17, 64(%rax)
5438; AVX512BW-NEXT:    vzeroupper
5439; AVX512BW-NEXT:    retq
5440;
5441; AVX512BW-FCP-LABEL: store_i32_stride6_vf32:
5442; AVX512BW-FCP:       # %bb.0:
5443; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm1
5444; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm0
5445; AVX512BW-FCP-NEXT:    vmovdqa64 (%rsi), %zmm11
5446; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rsi), %zmm13
5447; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdx), %zmm4
5448; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdx), %zmm7
5449; AVX512BW-FCP-NEXT:    vmovdqa64 (%rcx), %zmm18
5450; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rcx), %zmm24
5451; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm26 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
5452; AVX512BW-FCP-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3]
5453; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm14
5454; AVX512BW-FCP-NEXT:    vpermt2d %zmm18, %zmm26, %zmm14
5455; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
5456; AVX512BW-FCP-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
5457; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2
5458; AVX512BW-FCP-NEXT:    vpermt2d %zmm11, %zmm19, %zmm2
5459; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31]
5460; AVX512BW-FCP-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
5461; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm20
5462; AVX512BW-FCP-NEXT:    vpermt2d %zmm13, %zmm12, %zmm20
5463; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30]
5464; AVX512BW-FCP-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
5465; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm3
5466; AVX512BW-FCP-NEXT:    vpermt2d %zmm24, %zmm5, %zmm3
5467; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm17 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28]
5468; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm22
5469; AVX512BW-FCP-NEXT:    vpermt2d %zmm24, %zmm17, %zmm22
5470; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
5471; AVX512BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
5472; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm6
5473; AVX512BW-FCP-NEXT:    vpermt2d %zmm13, %zmm8, %zmm6
5474; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26]
5475; AVX512BW-FCP-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
5476; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm23
5477; AVX512BW-FCP-NEXT:    vpermt2d %zmm24, %zmm21, %zmm23
5478; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
5479; AVX512BW-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
5480; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm9
5481; AVX512BW-FCP-NEXT:    vpermt2d %zmm13, %zmm10, %zmm9
5482; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22]
5483; AVX512BW-FCP-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
5484; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm15
5485; AVX512BW-FCP-NEXT:    vpermt2d %zmm24, %zmm16, %zmm15
5486; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm25
5487; AVX512BW-FCP-NEXT:    vpermt2d %zmm24, %zmm26, %zmm7
5488; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm26 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0]
5489; AVX512BW-FCP-NEXT:    vpermt2d %zmm24, %zmm26, %zmm25
5490; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm24 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23]
5491; AVX512BW-FCP-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
5492; AVX512BW-FCP-NEXT:    vpermi2d %zmm18, %zmm4, %zmm5
5493; AVX512BW-FCP-NEXT:    vpermi2d %zmm18, %zmm4, %zmm17
5494; AVX512BW-FCP-NEXT:    vpermi2d %zmm18, %zmm4, %zmm21
5495; AVX512BW-FCP-NEXT:    vpermi2d %zmm18, %zmm4, %zmm16
5496; AVX512BW-FCP-NEXT:    vpermt2d %zmm18, %zmm26, %zmm4
5497; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm26
5498; AVX512BW-FCP-NEXT:    vpermt2d %zmm13, %zmm24, %zmm26
5499; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm18
5500; AVX512BW-FCP-NEXT:    vpermt2d %zmm13, %zmm19, %zmm0
5501; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
5502; AVX512BW-FCP-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
5503; AVX512BW-FCP-NEXT:    vpermt2d %zmm13, %zmm19, %zmm18
5504; AVX512BW-FCP-NEXT:    vpermi2d %zmm11, %zmm1, %zmm12
5505; AVX512BW-FCP-NEXT:    vpermi2d %zmm11, %zmm1, %zmm8
5506; AVX512BW-FCP-NEXT:    vpermi2d %zmm11, %zmm1, %zmm10
5507; AVX512BW-FCP-NEXT:    vpermi2d %zmm11, %zmm1, %zmm24
5508; AVX512BW-FCP-NEXT:    vpermt2d %zmm11, %zmm19, %zmm1
5509; AVX512BW-FCP-NEXT:    movb $-110, %al
5510; AVX512BW-FCP-NEXT:    kmovd %eax, %k2
5511; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm2 {%k2}
5512; AVX512BW-FCP-NEXT:    vmovdqa64 (%r8), %zmm11
5513; AVX512BW-FCP-NEXT:    movb $36, %al
5514; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
5515; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm20, %zmm3 {%k1}
5516; AVX512BW-FCP-NEXT:    vmovdqa64 64(%r8), %zmm13
5517; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm6 {%k1}
5518; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15]
5519; AVX512BW-FCP-NEXT:    vpermt2d %zmm11, %zmm14, %zmm2
5520; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm23, %zmm9 {%k2}
5521; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm19 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0]
5522; AVX512BW-FCP-NEXT:    vpermt2d %zmm13, %zmm19, %zmm3
5523; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm15 {%k1}
5524; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm20 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15]
5525; AVX512BW-FCP-NEXT:    vpermt2d %zmm13, %zmm20, %zmm6
5526; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm25, %zmm18 {%k1}
5527; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm22 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15]
5528; AVX512BW-FCP-NEXT:    vpermt2d %zmm13, %zmm22, %zmm9
5529; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm0 {%k2}
5530; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0]
5531; AVX512BW-FCP-NEXT:    vpermt2d %zmm13, %zmm7, %zmm15
5532; AVX512BW-FCP-NEXT:    vpermt2d %zmm13, %zmm14, %zmm0
5533; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm14 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15]
5534; AVX512BW-FCP-NEXT:    vpermt2d %zmm13, %zmm14, %zmm18
5535; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm5 {%k1}
5536; AVX512BW-FCP-NEXT:    vpermt2d %zmm11, %zmm19, %zmm5
5537; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm8 {%k1}
5538; AVX512BW-FCP-NEXT:    vpermt2d %zmm11, %zmm20, %zmm8
5539; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm21, %zmm10 {%k2}
5540; AVX512BW-FCP-NEXT:    vmovdqa64 (%r9), %zmm12
5541; AVX512BW-FCP-NEXT:    vpermt2d %zmm11, %zmm22, %zmm10
5542; AVX512BW-FCP-NEXT:    vmovdqa64 64(%r9), %zmm13
5543; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm24, %zmm16 {%k1}
5544; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm17 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15]
5545; AVX512BW-FCP-NEXT:    vpermt2d %zmm12, %zmm17, %zmm2
5546; AVX512BW-FCP-NEXT:    vpermt2d %zmm11, %zmm7, %zmm16
5547; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31]
5548; AVX512BW-FCP-NEXT:    vpermt2d %zmm13, %zmm7, %zmm3
5549; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm1 {%k1}
5550; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15]
5551; AVX512BW-FCP-NEXT:    vpermt2d %zmm13, %zmm4, %zmm6
5552; AVX512BW-FCP-NEXT:    vpermt2d %zmm11, %zmm14, %zmm1
5553; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm11 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15]
5554; AVX512BW-FCP-NEXT:    vpermt2d %zmm13, %zmm11, %zmm9
5555; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm14 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23]
5556; AVX512BW-FCP-NEXT:    vpermt2d %zmm13, %zmm14, %zmm15
5557; AVX512BW-FCP-NEXT:    vpermt2d %zmm13, %zmm17, %zmm0
5558; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm17 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15]
5559; AVX512BW-FCP-NEXT:    vpermt2d %zmm13, %zmm17, %zmm18
5560; AVX512BW-FCP-NEXT:    vpermt2d %zmm12, %zmm7, %zmm5
5561; AVX512BW-FCP-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
5562; AVX512BW-FCP-NEXT:    vpermt2d %zmm12, %zmm11, %zmm10
5563; AVX512BW-FCP-NEXT:    vpermt2d %zmm12, %zmm14, %zmm16
5564; AVX512BW-FCP-NEXT:    vpermt2d %zmm12, %zmm17, %zmm1
5565; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
5566; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, 64(%rax)
5567; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm16, 128(%rax)
5568; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, 192(%rax)
5569; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, 256(%rax)
5570; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, 320(%rax)
5571; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, 384(%rax)
5572; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm18, 448(%rax)
5573; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, 512(%rax)
5574; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, 576(%rax)
5575; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, 640(%rax)
5576; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, 704(%rax)
5577; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, (%rax)
5578; AVX512BW-FCP-NEXT:    vzeroupper
5579; AVX512BW-FCP-NEXT:    retq
5580;
5581; AVX512DQ-BW-LABEL: store_i32_stride6_vf32:
5582; AVX512DQ-BW:       # %bb.0:
5583; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
5584; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm5
5585; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm17
5586; AVX512DQ-BW-NEXT:    vmovdqa64 (%rsi), %zmm18
5587; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rsi), %zmm7
5588; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdx), %zmm1
5589; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdx), %zmm0
5590; AVX512DQ-BW-NEXT:    vmovdqa64 (%rcx), %zmm6
5591; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rcx), %zmm10
5592; AVX512DQ-BW-NEXT:    vmovdqa64 (%r8), %zmm3
5593; AVX512DQ-BW-NEXT:    vmovdqa64 64(%r8), %zmm8
5594; AVX512DQ-BW-NEXT:    vmovdqa64 (%r9), %zmm4
5595; AVX512DQ-BW-NEXT:    vmovdqa64 64(%r9), %zmm9
5596; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
5597; AVX512DQ-BW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
5598; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
5599; AVX512DQ-BW-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
5600; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, %zmm11
5601; AVX512DQ-BW-NEXT:    vpermt2d %zmm7, %zmm12, %zmm11
5602; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
5603; AVX512DQ-BW-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3]
5604; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, %zmm13
5605; AVX512DQ-BW-NEXT:    vpermt2d %zmm7, %zmm14, %zmm13
5606; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
5607; AVX512DQ-BW-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
5608; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, %zmm15
5609; AVX512DQ-BW-NEXT:    vpermt2d %zmm7, %zmm16, %zmm15
5610; AVX512DQ-BW-NEXT:    vpermi2d %zmm18, %zmm17, %zmm12
5611; AVX512DQ-BW-NEXT:    vpermi2d %zmm18, %zmm17, %zmm14
5612; AVX512DQ-BW-NEXT:    vpermi2d %zmm18, %zmm17, %zmm16
5613; AVX512DQ-BW-NEXT:    vpunpckhdq {{.*#+}} zmm19 = zmm17[2],zmm18[2],zmm17[3],zmm18[3],zmm17[6],zmm18[6],zmm17[7],zmm18[7],zmm17[10],zmm18[10],zmm17[11],zmm18[11],zmm17[14],zmm18[14],zmm17[15],zmm18[15]
5614; AVX512DQ-BW-NEXT:    vpermt2d %zmm18, %zmm2, %zmm17
5615; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdx), %ymm18
5616; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdx), %ymm20
5617; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm21 = [3,11,0,8,7,15,4,12]
5618; AVX512DQ-BW-NEXT:    vpermt2d (%rcx), %ymm21, %ymm18
5619; AVX512DQ-BW-NEXT:    movb $36, %dl
5620; AVX512DQ-BW-NEXT:    kmovd %edx, %k1
5621; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm18[0,1,0,1,2,3,6,7]
5622; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15]
5623; AVX512DQ-BW-NEXT:    vpermt2d %zmm3, %zmm18, %zmm17
5624; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm22 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15]
5625; AVX512DQ-BW-NEXT:    vpermt2d %zmm4, %zmm22, %zmm17
5626; AVX512DQ-BW-NEXT:    vpermi2d %zmm7, %zmm5, %zmm2
5627; AVX512DQ-BW-NEXT:    vpermt2d 64(%rcx), %ymm21, %ymm20
5628; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm20[0,1,0,1,2,3,6,7]
5629; AVX512DQ-BW-NEXT:    vpermt2d %zmm8, %zmm18, %zmm2
5630; AVX512DQ-BW-NEXT:    vpermt2d %zmm9, %zmm22, %zmm2
5631; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28]
5632; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm20
5633; AVX512DQ-BW-NEXT:    vpermt2d %zmm10, %zmm18, %zmm20
5634; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm20, %zmm11 {%k1}
5635; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm20 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15]
5636; AVX512DQ-BW-NEXT:    vpermt2d %zmm8, %zmm20, %zmm11
5637; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm21 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15]
5638; AVX512DQ-BW-NEXT:    vpermt2d %zmm9, %zmm21, %zmm11
5639; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm22 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26]
5640; AVX512DQ-BW-NEXT:    # zmm22 = mem[0,1,2,3,0,1,2,3]
5641; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm23
5642; AVX512DQ-BW-NEXT:    vpermt2d %zmm10, %zmm22, %zmm23
5643; AVX512DQ-BW-NEXT:    movb $-110, %cl
5644; AVX512DQ-BW-NEXT:    kmovd %ecx, %k2
5645; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm23, %zmm13 {%k2}
5646; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm23 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15]
5647; AVX512DQ-BW-NEXT:    vpermt2d %zmm8, %zmm23, %zmm13
5648; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm24 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15]
5649; AVX512DQ-BW-NEXT:    vpermt2d %zmm9, %zmm24, %zmm13
5650; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm25 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
5651; AVX512DQ-BW-NEXT:    # zmm25 = mem[0,1,2,3,0,1,2,3]
5652; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm26
5653; AVX512DQ-BW-NEXT:    vpermt2d %zmm10, %zmm25, %zmm26
5654; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm26, %zmm15 {%k2}
5655; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm26 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15]
5656; AVX512DQ-BW-NEXT:    vpermt2d %zmm8, %zmm26, %zmm15
5657; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm27 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15]
5658; AVX512DQ-BW-NEXT:    vpermt2d %zmm9, %zmm27, %zmm15
5659; AVX512DQ-BW-NEXT:    vpermi2d %zmm6, %zmm1, %zmm18
5660; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm18, %zmm12 {%k1}
5661; AVX512DQ-BW-NEXT:    vpermt2d %zmm3, %zmm20, %zmm12
5662; AVX512DQ-BW-NEXT:    vpermt2d %zmm4, %zmm21, %zmm12
5663; AVX512DQ-BW-NEXT:    vpermi2d %zmm6, %zmm1, %zmm22
5664; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm22, %zmm14 {%k2}
5665; AVX512DQ-BW-NEXT:    vpermt2d %zmm3, %zmm23, %zmm14
5666; AVX512DQ-BW-NEXT:    vpermt2d %zmm4, %zmm24, %zmm14
5667; AVX512DQ-BW-NEXT:    vpermi2d %zmm6, %zmm1, %zmm25
5668; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm25, %zmm16 {%k2}
5669; AVX512DQ-BW-NEXT:    vpermt2d %zmm3, %zmm26, %zmm16
5670; AVX512DQ-BW-NEXT:    vpermt2d %zmm4, %zmm27, %zmm16
5671; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm18 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22]
5672; AVX512DQ-BW-NEXT:    # zmm18 = mem[0,1,2,3,0,1,2,3]
5673; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm20
5674; AVX512DQ-BW-NEXT:    vpermt2d %zmm10, %zmm18, %zmm20
5675; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %ymm21
5676; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %ymm22
5677; AVX512DQ-BW-NEXT:    vpunpckhdq {{.*#+}} ymm22 = ymm22[2],mem[2],ymm22[3],mem[3],ymm22[6],mem[6],ymm22[7],mem[7]
5678; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm20 {%k1} = zmm22[2,3,2,3,2,3,2,3]
5679; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm22 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0]
5680; AVX512DQ-BW-NEXT:    vpermt2d %zmm8, %zmm22, %zmm20
5681; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm23 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23]
5682; AVX512DQ-BW-NEXT:    vpermt2d %zmm9, %zmm23, %zmm20
5683; AVX512DQ-BW-NEXT:    vpermi2d %zmm6, %zmm1, %zmm18
5684; AVX512DQ-BW-NEXT:    vpunpckhdq {{.*#+}} ymm21 = ymm21[2],mem[2],ymm21[3],mem[3],ymm21[6],mem[6],ymm21[7],mem[7]
5685; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm21[2,3,2,3,2,3,2,3]
5686; AVX512DQ-BW-NEXT:    vpermt2d %zmm3, %zmm22, %zmm18
5687; AVX512DQ-BW-NEXT:    vpermt2d %zmm4, %zmm23, %zmm18
5688; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30]
5689; AVX512DQ-BW-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
5690; AVX512DQ-BW-NEXT:    vpermt2d %zmm10, %zmm21, %zmm0
5691; AVX512DQ-BW-NEXT:    vpunpckhdq {{.*#+}} zmm5 = zmm5[2],zmm7[2],zmm5[3],zmm7[3],zmm5[6],zmm7[6],zmm5[7],zmm7[7],zmm5[10],zmm7[10],zmm5[11],zmm7[11],zmm5[14],zmm7[14],zmm5[15],zmm7[15]
5692; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm5[6,7,6,7,6,7,6,7]
5693; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0]
5694; AVX512DQ-BW-NEXT:    vpermt2d %zmm8, %zmm5, %zmm0
5695; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31]
5696; AVX512DQ-BW-NEXT:    vpermt2d %zmm9, %zmm7, %zmm0
5697; AVX512DQ-BW-NEXT:    vpermt2d %zmm6, %zmm21, %zmm1
5698; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm19[6,7,6,7,6,7,6,7]
5699; AVX512DQ-BW-NEXT:    vpermt2d %zmm3, %zmm5, %zmm1
5700; AVX512DQ-BW-NEXT:    vpermt2d %zmm4, %zmm7, %zmm1
5701; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm16, (%rax)
5702; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm14, 192(%rax)
5703; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, 320(%rax)
5704; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm12, 256(%rax)
5705; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm15, 384(%rax)
5706; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm13, 576(%rax)
5707; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, 704(%rax)
5708; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm11, 640(%rax)
5709; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm18, 128(%rax)
5710; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, 448(%rax)
5711; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm20, 512(%rax)
5712; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm17, 64(%rax)
5713; AVX512DQ-BW-NEXT:    vzeroupper
5714; AVX512DQ-BW-NEXT:    retq
5715;
5716; AVX512DQ-BW-FCP-LABEL: store_i32_stride6_vf32:
5717; AVX512DQ-BW-FCP:       # %bb.0:
5718; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm1
5719; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm0
5720; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rsi), %zmm11
5721; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rsi), %zmm13
5722; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdx), %zmm4
5723; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdx), %zmm7
5724; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rcx), %zmm18
5725; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rcx), %zmm24
5726; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm26 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
5727; AVX512DQ-BW-FCP-NEXT:    # zmm26 = mem[0,1,2,3,0,1,2,3]
5728; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm14
5729; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm18, %zmm26, %zmm14
5730; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
5731; AVX512DQ-BW-FCP-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
5732; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2
5733; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm11, %zmm19, %zmm2
5734; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm12 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31]
5735; AVX512DQ-BW-FCP-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
5736; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm20
5737; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm13, %zmm12, %zmm20
5738; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30]
5739; AVX512DQ-BW-FCP-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
5740; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm3
5741; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm24, %zmm5, %zmm3
5742; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm17 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28]
5743; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm22
5744; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm24, %zmm17, %zmm22
5745; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
5746; AVX512DQ-BW-FCP-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
5747; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm6
5748; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm13, %zmm8, %zmm6
5749; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26]
5750; AVX512DQ-BW-FCP-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
5751; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm23
5752; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm24, %zmm21, %zmm23
5753; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
5754; AVX512DQ-BW-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
5755; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm9
5756; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm13, %zmm10, %zmm9
5757; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm16 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22]
5758; AVX512DQ-BW-FCP-NEXT:    # zmm16 = mem[0,1,2,3,0,1,2,3]
5759; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm15
5760; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm24, %zmm16, %zmm15
5761; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm25
5762; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm24, %zmm26, %zmm7
5763; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm26 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0]
5764; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm24, %zmm26, %zmm25
5765; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm24 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23]
5766; AVX512DQ-BW-FCP-NEXT:    # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
5767; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm18, %zmm4, %zmm5
5768; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm18, %zmm4, %zmm17
5769; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm18, %zmm4, %zmm21
5770; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm18, %zmm4, %zmm16
5771; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm18, %zmm26, %zmm4
5772; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm26
5773; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm13, %zmm24, %zmm26
5774; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm18
5775; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm13, %zmm19, %zmm0
5776; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm19 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
5777; AVX512DQ-BW-FCP-NEXT:    # zmm19 = mem[0,1,2,3,0,1,2,3]
5778; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm13, %zmm19, %zmm18
5779; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm11, %zmm1, %zmm12
5780; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm11, %zmm1, %zmm8
5781; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm11, %zmm1, %zmm10
5782; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm11, %zmm1, %zmm24
5783; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm11, %zmm19, %zmm1
5784; AVX512DQ-BW-FCP-NEXT:    movb $-110, %al
5785; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k2
5786; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm2 {%k2}
5787; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%r8), %zmm11
5788; AVX512DQ-BW-FCP-NEXT:    movb $36, %al
5789; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
5790; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm20, %zmm3 {%k1}
5791; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%r8), %zmm13
5792; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm6 {%k1}
5793; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15]
5794; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm11, %zmm14, %zmm2
5795; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm23, %zmm9 {%k2}
5796; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm19 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0]
5797; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm13, %zmm19, %zmm3
5798; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm26, %zmm15 {%k1}
5799; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm20 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15]
5800; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm13, %zmm20, %zmm6
5801; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm25, %zmm18 {%k1}
5802; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm22 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15]
5803; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm13, %zmm22, %zmm9
5804; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm0 {%k2}
5805; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0]
5806; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm13, %zmm7, %zmm15
5807; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm13, %zmm14, %zmm0
5808; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm14 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15]
5809; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm13, %zmm14, %zmm18
5810; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm5 {%k1}
5811; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm11, %zmm19, %zmm5
5812; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm8 {%k1}
5813; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm11, %zmm20, %zmm8
5814; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm21, %zmm10 {%k2}
5815; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%r9), %zmm12
5816; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm11, %zmm22, %zmm10
5817; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%r9), %zmm13
5818; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm24, %zmm16 {%k1}
5819; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm17 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15]
5820; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm12, %zmm17, %zmm2
5821; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm11, %zmm7, %zmm16
5822; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31]
5823; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm13, %zmm7, %zmm3
5824; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm1 {%k1}
5825; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15]
5826; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm13, %zmm4, %zmm6
5827; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm11, %zmm14, %zmm1
5828; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm11 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15]
5829; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm13, %zmm11, %zmm9
5830; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm14 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23]
5831; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm13, %zmm14, %zmm15
5832; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm13, %zmm17, %zmm0
5833; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm17 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15]
5834; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm13, %zmm17, %zmm18
5835; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm12, %zmm7, %zmm5
5836; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm12, %zmm4, %zmm8
5837; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm12, %zmm11, %zmm10
5838; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm12, %zmm14, %zmm16
5839; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm12, %zmm17, %zmm1
5840; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
5841; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, 64(%rax)
5842; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm16, 128(%rax)
5843; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, 192(%rax)
5844; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, 256(%rax)
5845; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, 320(%rax)
5846; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, 384(%rax)
5847; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm18, 448(%rax)
5848; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, 512(%rax)
5849; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, 576(%rax)
5850; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, 640(%rax)
5851; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, 704(%rax)
5852; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, (%rax)
5853; AVX512DQ-BW-FCP-NEXT:    vzeroupper
5854; AVX512DQ-BW-FCP-NEXT:    retq
5855  %in.vec0 = load <32 x i32>, ptr %in.vecptr0, align 64
5856  %in.vec1 = load <32 x i32>, ptr %in.vecptr1, align 64
5857  %in.vec2 = load <32 x i32>, ptr %in.vecptr2, align 64
5858  %in.vec3 = load <32 x i32>, ptr %in.vecptr3, align 64
5859  %in.vec4 = load <32 x i32>, ptr %in.vecptr4, align 64
5860  %in.vec5 = load <32 x i32>, ptr %in.vecptr5, align 64
5861  %1 = shufflevector <32 x i32> %in.vec0, <32 x i32> %in.vec1, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
5862  %2 = shufflevector <32 x i32> %in.vec2, <32 x i32> %in.vec3, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
5863  %3 = shufflevector <32 x i32> %in.vec4, <32 x i32> %in.vec5, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
5864  %4 = shufflevector <64 x i32> %1, <64 x i32> %2, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
5865  %5 = shufflevector <64 x i32> %3, <64 x i32> poison, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
5866  %6 = shufflevector <128 x i32> %4, <128 x i32> %5, <192 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191>
5867  %interleaved.vec = shufflevector <192 x i32> %6, <192 x i32> poison, <192 x i32> <i32 0, i32 32, i32 64, i32 96, i32 128, i32 160, i32 1, i32 33, i32 65, i32 97, i32 129, i32 161, i32 2, i32 34, i32 66, i32 98, i32 130, i32 162, i32 3, i32 35, i32 67, i32 99, i32 131, i32 163, i32 4, i32 36, i32 68, i32 100, i32 132, i32 164, i32 5, i32 37, i32 69, i32 101, i32 133, i32 165, i32 6, i32 38, i32 70, i32 102, i32 134, i32 166, i32 7, i32 39, i32 71, i32 103, i32 135, i32 167, i32 8, i32 40, i32 72, i32 104, i32 136, i32 168, i32 9, i32 41, i32 73, i32 105, i32 137, i32 169, i32 10, i32 42, i32 74, i32 106, i32 138, i32 170, i32 11, i32 43, i32 75, i32 107, i32 139, i32 171, i32 12, i32 44, i32 76, i32 108, i32 140, i32 172, i32 13, i32 45, i32 77, i32 109, i32 141, i32 173, i32 14, i32 46, i32 78, i32 110, i32 142, i32 174, i32 15, i32 47, i32 79, i32 111, i32 143, i32 175, i32 16, i32 48, i32 80, i32 112, i32 144, i32 176, i32 17, i32 49, i32 81, i32 113, i32 145, i32 177, i32 18, i32 50, i32 82, i32 114, i32 146, i32 178, i32 19, i32 51, i32 83, i32 115, i32 147, i32 179, i32 20, i32 52, i32 84, i32 116, i32 148, i32 180, i32 21, i32 53, i32 85, i32 117, i32 149, i32 181, i32 22, i32 54, i32 86, i32 118, i32 150, i32 182, i32 23, i32 55, i32 87, i32 119, i32 151, i32 183, i32 24, i32 56, i32 88, i32 120, i32 152, i32 184, i32 25, i32 57, i32 89, i32 121, i32 153, i32 185, i32 26, i32 58, i32 90, i32 122, i32 154, i32 186, i32 27, i32 59, i32 91, i32 123, i32 155, i32 187, i32 28, i32 60, i32 92, i32 124, i32 156, i32 188, i32 29, i32 61, i32 93, i32 125, i32 157, i32 189, i32 30, i32 62, i32 94, i32 126, i32 158, i32 190, i32 31, i32 63, i32 95, i32 127, i32 159, i32 191>
5868  store <192 x i32> %interleaved.vec, ptr %out.vec, align 64
5869  ret void
5870}
5871
5872define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind {
5873; SSE-LABEL: store_i32_stride6_vf64:
5874; SSE:       # %bb.0:
5875; SSE-NEXT:    subq $1224, %rsp # imm = 0x4C8
5876; SSE-NEXT:    movaps (%rdi), %xmm9
5877; SSE-NEXT:    movaps 16(%rdi), %xmm10
5878; SSE-NEXT:    movaps (%rsi), %xmm2
5879; SSE-NEXT:    movaps 16(%rsi), %xmm0
5880; SSE-NEXT:    movaps (%rdx), %xmm11
5881; SSE-NEXT:    movaps 16(%rdx), %xmm12
5882; SSE-NEXT:    movaps (%rcx), %xmm4
5883; SSE-NEXT:    movaps 16(%rcx), %xmm1
5884; SSE-NEXT:    movaps (%r8), %xmm6
5885; SSE-NEXT:    movaps 16(%r8), %xmm3
5886; SSE-NEXT:    movaps (%r9), %xmm7
5887; SSE-NEXT:    movaps 16(%r9), %xmm5
5888; SSE-NEXT:    movaps %xmm11, %xmm13
5889; SSE-NEXT:    unpcklps {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1]
5890; SSE-NEXT:    movaps %xmm9, %xmm8
5891; SSE-NEXT:    unpcklps {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1]
5892; SSE-NEXT:    movaps %xmm7, %xmm14
5893; SSE-NEXT:    movlhps {{.*#+}} xmm14 = xmm14[0],xmm6[0]
5894; SSE-NEXT:    shufps {{.*#+}} xmm14 = xmm14[2,0],xmm8[2,3]
5895; SSE-NEXT:    movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5896; SSE-NEXT:    movlhps {{.*#+}} xmm8 = xmm8[0],xmm13[0]
5897; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5898; SSE-NEXT:    movaps %xmm6, %xmm8
5899; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[1,1],xmm7[1,1]
5900; SSE-NEXT:    shufps {{.*#+}} xmm13 = xmm13[2,3],xmm8[0,2]
5901; SSE-NEXT:    movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5902; SSE-NEXT:    movaps %xmm4, %xmm8
5903; SSE-NEXT:    unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm11[1]
5904; SSE-NEXT:    unpckhps {{.*#+}} xmm9 = xmm9[2],xmm2[2],xmm9[3],xmm2[3]
5905; SSE-NEXT:    movaps %xmm6, %xmm2
5906; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm7[1]
5907; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm9[2,3]
5908; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5909; SSE-NEXT:    shufps {{.*#+}} xmm9 = xmm9[0,1],xmm8[2,0]
5910; SSE-NEXT:    movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5911; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[3,3],xmm7[3,3]
5912; SSE-NEXT:    unpckhps {{.*#+}} xmm11 = xmm11[2],xmm4[2],xmm11[3],xmm4[3]
5913; SSE-NEXT:    shufps {{.*#+}} xmm11 = xmm11[2,3],xmm6[0,2]
5914; SSE-NEXT:    movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5915; SSE-NEXT:    movaps %xmm12, %xmm4
5916; SSE-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
5917; SSE-NEXT:    movaps %xmm10, %xmm2
5918; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
5919; SSE-NEXT:    movaps %xmm5, %xmm6
5920; SSE-NEXT:    movlhps {{.*#+}} xmm6 = xmm6[0],xmm3[0]
5921; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[2,3]
5922; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5923; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0]
5924; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5925; SSE-NEXT:    movaps %xmm3, %xmm2
5926; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm5[1,1]
5927; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[2,3],xmm2[0,2]
5928; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5929; SSE-NEXT:    movaps %xmm1, %xmm2
5930; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm12[1]
5931; SSE-NEXT:    unpckhps {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3]
5932; SSE-NEXT:    movaps %xmm3, %xmm0
5933; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1]
5934; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm10[2,3]
5935; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5936; SSE-NEXT:    shufps {{.*#+}} xmm10 = xmm10[0,1],xmm2[2,0]
5937; SSE-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5938; SSE-NEXT:    movaps 32(%rdi), %xmm6
5939; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3],xmm5[3,3]
5940; SSE-NEXT:    movaps 32(%rdx), %xmm5
5941; SSE-NEXT:    unpckhps {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3]
5942; SSE-NEXT:    movaps 32(%rcx), %xmm0
5943; SSE-NEXT:    shufps {{.*#+}} xmm12 = xmm12[2,3],xmm3[0,2]
5944; SSE-NEXT:    movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5945; SSE-NEXT:    movaps %xmm5, %xmm7
5946; SSE-NEXT:    unpcklps {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
5947; SSE-NEXT:    movaps 32(%rsi), %xmm1
5948; SSE-NEXT:    movaps %xmm6, %xmm4
5949; SSE-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
5950; SSE-NEXT:    movaps 32(%r8), %xmm2
5951; SSE-NEXT:    movaps 32(%r9), %xmm3
5952; SSE-NEXT:    movaps %xmm3, %xmm8
5953; SSE-NEXT:    movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0]
5954; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[2,3]
5955; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5956; SSE-NEXT:    movlhps {{.*#+}} xmm4 = xmm4[0],xmm7[0]
5957; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5958; SSE-NEXT:    movaps %xmm2, %xmm4
5959; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1]
5960; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[2,3],xmm4[0,2]
5961; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5962; SSE-NEXT:    unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3]
5963; SSE-NEXT:    movaps %xmm0, %xmm1
5964; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1]
5965; SSE-NEXT:    movaps %xmm2, %xmm4
5966; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
5967; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,3]
5968; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5969; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0]
5970; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5971; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3]
5972; SSE-NEXT:    unpckhps {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3]
5973; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,3],xmm2[0,2]
5974; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5975; SSE-NEXT:    movaps 48(%rdx), %xmm6
5976; SSE-NEXT:    movaps 48(%rcx), %xmm0
5977; SSE-NEXT:    movaps %xmm6, %xmm5
5978; SSE-NEXT:    unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
5979; SSE-NEXT:    movaps 48(%rdi), %xmm7
5980; SSE-NEXT:    movaps 48(%rsi), %xmm1
5981; SSE-NEXT:    movaps %xmm7, %xmm4
5982; SSE-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
5983; SSE-NEXT:    movaps 48(%r8), %xmm2
5984; SSE-NEXT:    movaps 48(%r9), %xmm3
5985; SSE-NEXT:    movaps %xmm3, %xmm8
5986; SSE-NEXT:    movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0]
5987; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[2,3]
5988; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5989; SSE-NEXT:    movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
5990; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5991; SSE-NEXT:    movaps %xmm2, %xmm4
5992; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1]
5993; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2]
5994; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5995; SSE-NEXT:    unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3]
5996; SSE-NEXT:    movaps %xmm0, %xmm1
5997; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1]
5998; SSE-NEXT:    movaps %xmm2, %xmm4
5999; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
6000; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3]
6001; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6002; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0]
6003; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6004; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3]
6005; SSE-NEXT:    unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3]
6006; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2]
6007; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6008; SSE-NEXT:    movaps 64(%rdx), %xmm6
6009; SSE-NEXT:    movaps 64(%rcx), %xmm0
6010; SSE-NEXT:    movaps %xmm6, %xmm5
6011; SSE-NEXT:    unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
6012; SSE-NEXT:    movaps 64(%rdi), %xmm7
6013; SSE-NEXT:    movaps 64(%rsi), %xmm1
6014; SSE-NEXT:    movaps %xmm7, %xmm4
6015; SSE-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
6016; SSE-NEXT:    movaps 64(%r8), %xmm2
6017; SSE-NEXT:    movaps 64(%r9), %xmm3
6018; SSE-NEXT:    movaps %xmm3, %xmm8
6019; SSE-NEXT:    movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0]
6020; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[2,3]
6021; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6022; SSE-NEXT:    movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
6023; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6024; SSE-NEXT:    movaps %xmm2, %xmm4
6025; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1]
6026; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2]
6027; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6028; SSE-NEXT:    unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3]
6029; SSE-NEXT:    movaps %xmm0, %xmm1
6030; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1]
6031; SSE-NEXT:    movaps %xmm2, %xmm4
6032; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
6033; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3]
6034; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6035; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0]
6036; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6037; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3]
6038; SSE-NEXT:    unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3]
6039; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2]
6040; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6041; SSE-NEXT:    movaps 80(%rdx), %xmm6
6042; SSE-NEXT:    movaps 80(%rcx), %xmm0
6043; SSE-NEXT:    movaps %xmm6, %xmm5
6044; SSE-NEXT:    unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
6045; SSE-NEXT:    movaps 80(%rdi), %xmm7
6046; SSE-NEXT:    movaps 80(%rsi), %xmm1
6047; SSE-NEXT:    movaps %xmm7, %xmm4
6048; SSE-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
6049; SSE-NEXT:    movaps 80(%r8), %xmm2
6050; SSE-NEXT:    movaps 80(%r9), %xmm3
6051; SSE-NEXT:    movaps %xmm3, %xmm8
6052; SSE-NEXT:    movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0]
6053; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[2,3]
6054; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6055; SSE-NEXT:    movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
6056; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6057; SSE-NEXT:    movaps %xmm2, %xmm4
6058; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1]
6059; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2]
6060; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6061; SSE-NEXT:    unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3]
6062; SSE-NEXT:    movaps %xmm0, %xmm1
6063; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1]
6064; SSE-NEXT:    movaps %xmm2, %xmm4
6065; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
6066; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3]
6067; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6068; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0]
6069; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6070; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3]
6071; SSE-NEXT:    unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3]
6072; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2]
6073; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6074; SSE-NEXT:    movaps 96(%rdx), %xmm6
6075; SSE-NEXT:    movaps 96(%rcx), %xmm0
6076; SSE-NEXT:    movaps %xmm6, %xmm5
6077; SSE-NEXT:    unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
6078; SSE-NEXT:    movaps 96(%rdi), %xmm7
6079; SSE-NEXT:    movaps 96(%rsi), %xmm1
6080; SSE-NEXT:    movaps %xmm7, %xmm4
6081; SSE-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
6082; SSE-NEXT:    movaps 96(%r8), %xmm2
6083; SSE-NEXT:    movaps 96(%r9), %xmm3
6084; SSE-NEXT:    movaps %xmm3, %xmm8
6085; SSE-NEXT:    movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0]
6086; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[2,3]
6087; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6088; SSE-NEXT:    movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
6089; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6090; SSE-NEXT:    movaps %xmm2, %xmm4
6091; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1]
6092; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2]
6093; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6094; SSE-NEXT:    unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3]
6095; SSE-NEXT:    movaps %xmm0, %xmm1
6096; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1]
6097; SSE-NEXT:    movaps %xmm2, %xmm4
6098; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
6099; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3]
6100; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6101; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0]
6102; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6103; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3]
6104; SSE-NEXT:    unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3]
6105; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2]
6106; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6107; SSE-NEXT:    movaps 112(%rdx), %xmm6
6108; SSE-NEXT:    movaps 112(%rcx), %xmm0
6109; SSE-NEXT:    movaps %xmm6, %xmm5
6110; SSE-NEXT:    unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
6111; SSE-NEXT:    movaps 112(%rdi), %xmm7
6112; SSE-NEXT:    movaps 112(%rsi), %xmm1
6113; SSE-NEXT:    movaps %xmm7, %xmm4
6114; SSE-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
6115; SSE-NEXT:    movaps 112(%r8), %xmm2
6116; SSE-NEXT:    movaps 112(%r9), %xmm3
6117; SSE-NEXT:    movaps %xmm3, %xmm8
6118; SSE-NEXT:    movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0]
6119; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[2,3]
6120; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6121; SSE-NEXT:    movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
6122; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6123; SSE-NEXT:    movaps %xmm2, %xmm4
6124; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1]
6125; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2]
6126; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6127; SSE-NEXT:    unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3]
6128; SSE-NEXT:    movaps %xmm0, %xmm1
6129; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1]
6130; SSE-NEXT:    movaps %xmm2, %xmm4
6131; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
6132; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3]
6133; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6134; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0]
6135; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6136; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3]
6137; SSE-NEXT:    unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3]
6138; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2]
6139; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6140; SSE-NEXT:    movaps 128(%rdx), %xmm6
6141; SSE-NEXT:    movaps 128(%rcx), %xmm0
6142; SSE-NEXT:    movaps %xmm6, %xmm5
6143; SSE-NEXT:    unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
6144; SSE-NEXT:    movaps 128(%rdi), %xmm7
6145; SSE-NEXT:    movaps 128(%rsi), %xmm1
6146; SSE-NEXT:    movaps %xmm7, %xmm4
6147; SSE-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
6148; SSE-NEXT:    movaps 128(%r8), %xmm2
6149; SSE-NEXT:    movaps 128(%r9), %xmm3
6150; SSE-NEXT:    movaps %xmm3, %xmm8
6151; SSE-NEXT:    movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0]
6152; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[2,3]
6153; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6154; SSE-NEXT:    movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
6155; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6156; SSE-NEXT:    movaps %xmm2, %xmm4
6157; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1]
6158; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2]
6159; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6160; SSE-NEXT:    unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3]
6161; SSE-NEXT:    movaps %xmm0, %xmm1
6162; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1]
6163; SSE-NEXT:    movaps %xmm2, %xmm4
6164; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
6165; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3]
6166; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6167; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0]
6168; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6169; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3]
6170; SSE-NEXT:    unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3]
6171; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2]
6172; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6173; SSE-NEXT:    movaps 144(%rdx), %xmm6
6174; SSE-NEXT:    movaps 144(%rcx), %xmm0
6175; SSE-NEXT:    movaps %xmm6, %xmm5
6176; SSE-NEXT:    unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
6177; SSE-NEXT:    movaps 144(%rdi), %xmm7
6178; SSE-NEXT:    movaps 144(%rsi), %xmm1
6179; SSE-NEXT:    movaps %xmm7, %xmm4
6180; SSE-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
6181; SSE-NEXT:    movaps 144(%r8), %xmm2
6182; SSE-NEXT:    movaps 144(%r9), %xmm3
6183; SSE-NEXT:    movaps %xmm3, %xmm8
6184; SSE-NEXT:    movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0]
6185; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[2,3]
6186; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6187; SSE-NEXT:    movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
6188; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6189; SSE-NEXT:    movaps %xmm2, %xmm4
6190; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1]
6191; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2]
6192; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6193; SSE-NEXT:    unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3]
6194; SSE-NEXT:    movaps %xmm0, %xmm1
6195; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1]
6196; SSE-NEXT:    movaps %xmm2, %xmm4
6197; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
6198; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3]
6199; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6200; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0]
6201; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6202; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3]
6203; SSE-NEXT:    unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3]
6204; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2]
6205; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6206; SSE-NEXT:    movaps 160(%rdx), %xmm6
6207; SSE-NEXT:    movaps 160(%rcx), %xmm0
6208; SSE-NEXT:    movaps %xmm6, %xmm5
6209; SSE-NEXT:    unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
6210; SSE-NEXT:    movaps 160(%rdi), %xmm7
6211; SSE-NEXT:    movaps 160(%rsi), %xmm1
6212; SSE-NEXT:    movaps %xmm7, %xmm4
6213; SSE-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
6214; SSE-NEXT:    movaps 160(%r8), %xmm2
6215; SSE-NEXT:    movaps 160(%r9), %xmm3
6216; SSE-NEXT:    movaps %xmm3, %xmm8
6217; SSE-NEXT:    movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0]
6218; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[2,3]
6219; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6220; SSE-NEXT:    movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
6221; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6222; SSE-NEXT:    movaps %xmm2, %xmm4
6223; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1]
6224; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2]
6225; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6226; SSE-NEXT:    unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3]
6227; SSE-NEXT:    movaps %xmm0, %xmm1
6228; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1]
6229; SSE-NEXT:    movaps %xmm2, %xmm4
6230; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
6231; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3]
6232; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6233; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0]
6234; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6235; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3]
6236; SSE-NEXT:    unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3]
6237; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2]
6238; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6239; SSE-NEXT:    movaps 176(%rdx), %xmm6
6240; SSE-NEXT:    movaps 176(%rcx), %xmm0
6241; SSE-NEXT:    movaps %xmm6, %xmm5
6242; SSE-NEXT:    unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
6243; SSE-NEXT:    movaps 176(%rdi), %xmm7
6244; SSE-NEXT:    movaps 176(%rsi), %xmm1
6245; SSE-NEXT:    movaps %xmm7, %xmm4
6246; SSE-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
6247; SSE-NEXT:    movaps 176(%r8), %xmm2
6248; SSE-NEXT:    movaps 176(%r9), %xmm3
6249; SSE-NEXT:    movaps %xmm3, %xmm8
6250; SSE-NEXT:    movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0]
6251; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[2,3]
6252; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6253; SSE-NEXT:    movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
6254; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6255; SSE-NEXT:    movaps %xmm2, %xmm4
6256; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1]
6257; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2]
6258; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6259; SSE-NEXT:    unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3]
6260; SSE-NEXT:    movaps %xmm0, %xmm1
6261; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1]
6262; SSE-NEXT:    movaps %xmm2, %xmm4
6263; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
6264; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3]
6265; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6266; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0]
6267; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6268; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3]
6269; SSE-NEXT:    unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3]
6270; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2]
6271; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6272; SSE-NEXT:    movaps 192(%rdx), %xmm6
6273; SSE-NEXT:    movaps 192(%rcx), %xmm0
6274; SSE-NEXT:    movaps %xmm6, %xmm5
6275; SSE-NEXT:    unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
6276; SSE-NEXT:    movaps 192(%rdi), %xmm7
6277; SSE-NEXT:    movaps 192(%rsi), %xmm1
6278; SSE-NEXT:    movaps %xmm7, %xmm4
6279; SSE-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
6280; SSE-NEXT:    movaps 192(%r8), %xmm2
6281; SSE-NEXT:    movaps 192(%r9), %xmm3
6282; SSE-NEXT:    movaps %xmm3, %xmm8
6283; SSE-NEXT:    movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0]
6284; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[2,3]
6285; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6286; SSE-NEXT:    movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
6287; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6288; SSE-NEXT:    movaps %xmm2, %xmm4
6289; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1]
6290; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2]
6291; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6292; SSE-NEXT:    unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3]
6293; SSE-NEXT:    movaps %xmm0, %xmm1
6294; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1]
6295; SSE-NEXT:    movaps %xmm2, %xmm4
6296; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
6297; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3]
6298; SSE-NEXT:    movaps %xmm4, (%rsp) # 16-byte Spill
6299; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0]
6300; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6301; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3]
6302; SSE-NEXT:    unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3]
6303; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2]
6304; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6305; SSE-NEXT:    movaps 208(%rdx), %xmm6
6306; SSE-NEXT:    movaps 208(%rcx), %xmm0
6307; SSE-NEXT:    movaps %xmm6, %xmm5
6308; SSE-NEXT:    unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
6309; SSE-NEXT:    movaps 208(%rdi), %xmm7
6310; SSE-NEXT:    movaps 208(%rsi), %xmm1
6311; SSE-NEXT:    movaps %xmm7, %xmm4
6312; SSE-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
6313; SSE-NEXT:    movaps 208(%r8), %xmm2
6314; SSE-NEXT:    movaps 208(%r9), %xmm3
6315; SSE-NEXT:    movaps %xmm3, %xmm8
6316; SSE-NEXT:    movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0]
6317; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[2,3]
6318; SSE-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6319; SSE-NEXT:    movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
6320; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6321; SSE-NEXT:    movaps %xmm2, %xmm4
6322; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1]
6323; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2]
6324; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6325; SSE-NEXT:    unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3]
6326; SSE-NEXT:    movaps %xmm0, %xmm1
6327; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1]
6328; SSE-NEXT:    movaps %xmm2, %xmm4
6329; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
6330; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3]
6331; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6332; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0]
6333; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6334; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3]
6335; SSE-NEXT:    unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3]
6336; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2]
6337; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6338; SSE-NEXT:    movaps 224(%rdx), %xmm9
6339; SSE-NEXT:    movaps 224(%rcx), %xmm0
6340; SSE-NEXT:    movaps %xmm9, %xmm14
6341; SSE-NEXT:    unpcklps {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1]
6342; SSE-NEXT:    movaps 224(%rdi), %xmm11
6343; SSE-NEXT:    movaps 224(%rsi), %xmm1
6344; SSE-NEXT:    movaps %xmm11, %xmm13
6345; SSE-NEXT:    unpcklps {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1]
6346; SSE-NEXT:    movaps 224(%r8), %xmm2
6347; SSE-NEXT:    movaps 224(%r9), %xmm3
6348; SSE-NEXT:    movaps %xmm3, %xmm15
6349; SSE-NEXT:    movlhps {{.*#+}} xmm15 = xmm15[0],xmm2[0]
6350; SSE-NEXT:    shufps {{.*#+}} xmm15 = xmm15[2,0],xmm13[2,3]
6351; SSE-NEXT:    movlhps {{.*#+}} xmm13 = xmm13[0],xmm14[0]
6352; SSE-NEXT:    movaps %xmm2, %xmm4
6353; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1]
6354; SSE-NEXT:    shufps {{.*#+}} xmm14 = xmm14[2,3],xmm4[0,2]
6355; SSE-NEXT:    unpckhps {{.*#+}} xmm11 = xmm11[2],xmm1[2],xmm11[3],xmm1[3]
6356; SSE-NEXT:    movaps %xmm0, %xmm1
6357; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm9[1]
6358; SSE-NEXT:    movaps %xmm2, %xmm8
6359; SSE-NEXT:    unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm3[1]
6360; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[0,2],xmm11[2,3]
6361; SSE-NEXT:    shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[2,0]
6362; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3]
6363; SSE-NEXT:    unpckhps {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3]
6364; SSE-NEXT:    shufps {{.*#+}} xmm9 = xmm9[2,3],xmm2[0,2]
6365; SSE-NEXT:    movaps 240(%rdx), %xmm3
6366; SSE-NEXT:    movaps 240(%rcx), %xmm12
6367; SSE-NEXT:    movaps %xmm3, %xmm5
6368; SSE-NEXT:    unpcklps {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1]
6369; SSE-NEXT:    movaps 240(%rdi), %xmm2
6370; SSE-NEXT:    movaps 240(%rsi), %xmm10
6371; SSE-NEXT:    movaps %xmm2, %xmm4
6372; SSE-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1]
6373; SSE-NEXT:    movaps 240(%r8), %xmm1
6374; SSE-NEXT:    movaps 240(%r9), %xmm7
6375; SSE-NEXT:    movaps %xmm7, %xmm6
6376; SSE-NEXT:    movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0]
6377; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[2,0],xmm4[2,3]
6378; SSE-NEXT:    movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
6379; SSE-NEXT:    movaps %xmm1, %xmm0
6380; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm7[1,1]
6381; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,3],xmm0[0,2]
6382; SSE-NEXT:    unpckhps {{.*#+}} xmm2 = xmm2[2],xmm10[2],xmm2[3],xmm10[3]
6383; SSE-NEXT:    movaps %xmm12, %xmm0
6384; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
6385; SSE-NEXT:    movaps %xmm1, %xmm10
6386; SSE-NEXT:    unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm7[1]
6387; SSE-NEXT:    shufps {{.*#+}} xmm10 = xmm10[0,2],xmm2[2,3]
6388; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0]
6389; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3],xmm7[3,3]
6390; SSE-NEXT:    unpckhps {{.*#+}} xmm3 = xmm3[2],xmm12[2],xmm3[3],xmm12[3]
6391; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[2,3],xmm1[0,2]
6392; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
6393; SSE-NEXT:    movaps %xmm3, 1520(%rax)
6394; SSE-NEXT:    movaps %xmm10, 1504(%rax)
6395; SSE-NEXT:    movaps %xmm2, 1488(%rax)
6396; SSE-NEXT:    movaps %xmm5, 1472(%rax)
6397; SSE-NEXT:    movaps %xmm6, 1456(%rax)
6398; SSE-NEXT:    movaps %xmm4, 1440(%rax)
6399; SSE-NEXT:    movaps %xmm9, 1424(%rax)
6400; SSE-NEXT:    movaps %xmm8, 1408(%rax)
6401; SSE-NEXT:    movaps %xmm11, 1392(%rax)
6402; SSE-NEXT:    movaps %xmm14, 1376(%rax)
6403; SSE-NEXT:    movaps %xmm15, 1360(%rax)
6404; SSE-NEXT:    movaps %xmm13, 1344(%rax)
6405; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6406; SSE-NEXT:    movaps %xmm0, 1328(%rax)
6407; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6408; SSE-NEXT:    movaps %xmm0, 1312(%rax)
6409; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6410; SSE-NEXT:    movaps %xmm0, 1296(%rax)
6411; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6412; SSE-NEXT:    movaps %xmm0, 1280(%rax)
6413; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6414; SSE-NEXT:    movaps %xmm0, 1264(%rax)
6415; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6416; SSE-NEXT:    movaps %xmm0, 1248(%rax)
6417; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6418; SSE-NEXT:    movaps %xmm0, 1232(%rax)
6419; SSE-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
6420; SSE-NEXT:    movaps %xmm0, 1216(%rax)
6421; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6422; SSE-NEXT:    movaps %xmm0, 1200(%rax)
6423; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6424; SSE-NEXT:    movaps %xmm0, 1184(%rax)
6425; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6426; SSE-NEXT:    movaps %xmm0, 1168(%rax)
6427; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6428; SSE-NEXT:    movaps %xmm0, 1152(%rax)
6429; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6430; SSE-NEXT:    movaps %xmm0, 1136(%rax)
6431; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6432; SSE-NEXT:    movaps %xmm0, 1120(%rax)
6433; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6434; SSE-NEXT:    movaps %xmm0, 1104(%rax)
6435; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6436; SSE-NEXT:    movaps %xmm0, 1088(%rax)
6437; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6438; SSE-NEXT:    movaps %xmm0, 1072(%rax)
6439; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6440; SSE-NEXT:    movaps %xmm0, 1056(%rax)
6441; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6442; SSE-NEXT:    movaps %xmm0, 1040(%rax)
6443; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6444; SSE-NEXT:    movaps %xmm0, 1024(%rax)
6445; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6446; SSE-NEXT:    movaps %xmm0, 1008(%rax)
6447; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6448; SSE-NEXT:    movaps %xmm0, 992(%rax)
6449; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6450; SSE-NEXT:    movaps %xmm0, 976(%rax)
6451; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6452; SSE-NEXT:    movaps %xmm0, 960(%rax)
6453; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6454; SSE-NEXT:    movaps %xmm0, 944(%rax)
6455; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6456; SSE-NEXT:    movaps %xmm0, 928(%rax)
6457; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6458; SSE-NEXT:    movaps %xmm0, 912(%rax)
6459; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6460; SSE-NEXT:    movaps %xmm0, 896(%rax)
6461; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6462; SSE-NEXT:    movaps %xmm0, 880(%rax)
6463; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6464; SSE-NEXT:    movaps %xmm0, 864(%rax)
6465; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6466; SSE-NEXT:    movaps %xmm0, 848(%rax)
6467; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6468; SSE-NEXT:    movaps %xmm0, 832(%rax)
6469; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6470; SSE-NEXT:    movaps %xmm0, 816(%rax)
6471; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6472; SSE-NEXT:    movaps %xmm0, 800(%rax)
6473; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6474; SSE-NEXT:    movaps %xmm0, 784(%rax)
6475; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6476; SSE-NEXT:    movaps %xmm0, 768(%rax)
6477; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6478; SSE-NEXT:    movaps %xmm0, 752(%rax)
6479; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6480; SSE-NEXT:    movaps %xmm0, 736(%rax)
6481; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6482; SSE-NEXT:    movaps %xmm0, 720(%rax)
6483; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6484; SSE-NEXT:    movaps %xmm0, 704(%rax)
6485; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6486; SSE-NEXT:    movaps %xmm0, 688(%rax)
6487; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6488; SSE-NEXT:    movaps %xmm0, 672(%rax)
6489; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6490; SSE-NEXT:    movaps %xmm0, 656(%rax)
6491; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6492; SSE-NEXT:    movaps %xmm0, 640(%rax)
6493; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6494; SSE-NEXT:    movaps %xmm0, 624(%rax)
6495; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6496; SSE-NEXT:    movaps %xmm0, 608(%rax)
6497; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6498; SSE-NEXT:    movaps %xmm0, 592(%rax)
6499; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6500; SSE-NEXT:    movaps %xmm0, 576(%rax)
6501; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6502; SSE-NEXT:    movaps %xmm0, 560(%rax)
6503; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6504; SSE-NEXT:    movaps %xmm0, 544(%rax)
6505; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6506; SSE-NEXT:    movaps %xmm0, 528(%rax)
6507; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6508; SSE-NEXT:    movaps %xmm0, 512(%rax)
6509; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6510; SSE-NEXT:    movaps %xmm0, 496(%rax)
6511; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6512; SSE-NEXT:    movaps %xmm0, 480(%rax)
6513; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6514; SSE-NEXT:    movaps %xmm0, 464(%rax)
6515; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6516; SSE-NEXT:    movaps %xmm0, 448(%rax)
6517; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6518; SSE-NEXT:    movaps %xmm0, 432(%rax)
6519; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6520; SSE-NEXT:    movaps %xmm0, 416(%rax)
6521; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6522; SSE-NEXT:    movaps %xmm0, 400(%rax)
6523; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6524; SSE-NEXT:    movaps %xmm0, 384(%rax)
6525; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6526; SSE-NEXT:    movaps %xmm0, 368(%rax)
6527; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6528; SSE-NEXT:    movaps %xmm0, 352(%rax)
6529; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6530; SSE-NEXT:    movaps %xmm0, 336(%rax)
6531; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6532; SSE-NEXT:    movaps %xmm0, 320(%rax)
6533; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6534; SSE-NEXT:    movaps %xmm0, 304(%rax)
6535; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6536; SSE-NEXT:    movaps %xmm0, 288(%rax)
6537; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6538; SSE-NEXT:    movaps %xmm0, 272(%rax)
6539; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6540; SSE-NEXT:    movaps %xmm0, 256(%rax)
6541; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6542; SSE-NEXT:    movaps %xmm0, 240(%rax)
6543; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6544; SSE-NEXT:    movaps %xmm0, 224(%rax)
6545; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6546; SSE-NEXT:    movaps %xmm0, 208(%rax)
6547; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6548; SSE-NEXT:    movaps %xmm0, 192(%rax)
6549; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6550; SSE-NEXT:    movaps %xmm0, 176(%rax)
6551; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6552; SSE-NEXT:    movaps %xmm0, 160(%rax)
6553; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6554; SSE-NEXT:    movaps %xmm0, 144(%rax)
6555; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6556; SSE-NEXT:    movaps %xmm0, 128(%rax)
6557; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6558; SSE-NEXT:    movaps %xmm0, 112(%rax)
6559; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6560; SSE-NEXT:    movaps %xmm0, 96(%rax)
6561; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6562; SSE-NEXT:    movaps %xmm0, 80(%rax)
6563; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6564; SSE-NEXT:    movaps %xmm0, 64(%rax)
6565; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6566; SSE-NEXT:    movaps %xmm0, 48(%rax)
6567; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6568; SSE-NEXT:    movaps %xmm0, 32(%rax)
6569; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6570; SSE-NEXT:    movaps %xmm0, 16(%rax)
6571; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6572; SSE-NEXT:    movaps %xmm0, (%rax)
6573; SSE-NEXT:    addq $1224, %rsp # imm = 0x4C8
6574; SSE-NEXT:    retq
6575;
6576; AVX-LABEL: store_i32_stride6_vf64:
6577; AVX:       # %bb.0:
6578; AVX-NEXT:    subq $2504, %rsp # imm = 0x9C8
6579; AVX-NEXT:    vmovaps (%rdi), %ymm8
6580; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6581; AVX-NEXT:    vmovaps (%rsi), %ymm9
6582; AVX-NEXT:    vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6583; AVX-NEXT:    vmovaps (%rdx), %ymm4
6584; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6585; AVX-NEXT:    vmovaps (%rcx), %ymm15
6586; AVX-NEXT:    vmovaps (%r8), %ymm6
6587; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6588; AVX-NEXT:    vmovaps (%rcx), %xmm1
6589; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6590; AVX-NEXT:    vmovaps 32(%rcx), %xmm2
6591; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6592; AVX-NEXT:    vmovaps (%rdx), %xmm0
6593; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6594; AVX-NEXT:    vmovaps 32(%rdx), %xmm3
6595; AVX-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6596; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2]
6597; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
6598; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
6599; AVX-NEXT:    vmovaps (%rsi), %xmm1
6600; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6601; AVX-NEXT:    vmovaps (%rdi), %xmm7
6602; AVX-NEXT:    vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6603; AVX-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm1[2],xmm7[3],xmm1[3]
6604; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6605; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
6606; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
6607; AVX-NEXT:    vbroadcastss 4(%r8), %xmm1
6608; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
6609; AVX-NEXT:    vbroadcastss 4(%r9), %ymm1
6610; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
6611; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6612; AVX-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5]
6613; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
6614; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm15[0],ymm4[0],ymm15[2],ymm4[2]
6615; AVX-NEXT:    vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6616; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
6617; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
6618; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
6619; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5],ymm0[6,7]
6620; AVX-NEXT:    vbroadcastss 16(%r9), %ymm1
6621; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
6622; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6623; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm3[1,2],xmm2[1,2]
6624; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
6625; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
6626; AVX-NEXT:    vmovaps 32(%rsi), %xmm1
6627; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6628; AVX-NEXT:    vmovaps 32(%rdi), %xmm2
6629; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6630; AVX-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
6631; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6632; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
6633; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
6634; AVX-NEXT:    vbroadcastss 36(%r8), %xmm1
6635; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
6636; AVX-NEXT:    vbroadcastss 36(%r9), %ymm1
6637; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
6638; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6639; AVX-NEXT:    vmovaps 32(%rdi), %ymm1
6640; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6641; AVX-NEXT:    vmovaps 32(%rsi), %ymm0
6642; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6643; AVX-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
6644; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
6645; AVX-NEXT:    vmovaps 32(%rdx), %ymm2
6646; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6647; AVX-NEXT:    vmovaps 32(%rcx), %ymm1
6648; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6649; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
6650; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
6651; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
6652; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
6653; AVX-NEXT:    vmovaps 32(%r8), %ymm1
6654; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6655; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
6656; AVX-NEXT:    vbroadcastss 48(%r9), %ymm1
6657; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
6658; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6659; AVX-NEXT:    vmovaps 64(%rcx), %xmm1
6660; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6661; AVX-NEXT:    vmovaps 64(%rdx), %xmm0
6662; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6663; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2]
6664; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
6665; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
6666; AVX-NEXT:    vmovaps 64(%rsi), %xmm1
6667; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6668; AVX-NEXT:    vmovaps 64(%rdi), %xmm2
6669; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6670; AVX-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
6671; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6672; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
6673; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
6674; AVX-NEXT:    vbroadcastss 68(%r8), %xmm1
6675; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
6676; AVX-NEXT:    vbroadcastss 68(%r9), %ymm1
6677; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
6678; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6679; AVX-NEXT:    vmovaps 64(%rdi), %ymm1
6680; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6681; AVX-NEXT:    vmovaps 64(%rsi), %ymm0
6682; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6683; AVX-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
6684; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
6685; AVX-NEXT:    vmovaps 64(%rdx), %ymm2
6686; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6687; AVX-NEXT:    vmovaps 64(%rcx), %ymm1
6688; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6689; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
6690; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
6691; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
6692; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
6693; AVX-NEXT:    vmovaps 64(%r8), %ymm1
6694; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6695; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
6696; AVX-NEXT:    vbroadcastss 80(%r9), %ymm1
6697; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
6698; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6699; AVX-NEXT:    vmovaps 96(%rcx), %xmm1
6700; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6701; AVX-NEXT:    vmovaps 96(%rdx), %xmm0
6702; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6703; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2]
6704; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
6705; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
6706; AVX-NEXT:    vmovaps 96(%rsi), %xmm1
6707; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6708; AVX-NEXT:    vmovaps 96(%rdi), %xmm2
6709; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6710; AVX-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
6711; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6712; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
6713; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
6714; AVX-NEXT:    vbroadcastss 100(%r8), %xmm1
6715; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
6716; AVX-NEXT:    vbroadcastss 100(%r9), %ymm1
6717; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
6718; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6719; AVX-NEXT:    vmovaps 96(%rdi), %ymm1
6720; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6721; AVX-NEXT:    vmovaps 96(%rsi), %ymm0
6722; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6723; AVX-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
6724; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
6725; AVX-NEXT:    vmovaps 96(%rdx), %ymm2
6726; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6727; AVX-NEXT:    vmovaps 96(%rcx), %ymm1
6728; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6729; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
6730; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
6731; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
6732; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
6733; AVX-NEXT:    vmovaps 96(%r8), %ymm1
6734; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6735; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
6736; AVX-NEXT:    vbroadcastss 112(%r9), %ymm1
6737; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
6738; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6739; AVX-NEXT:    vmovaps 128(%rcx), %xmm1
6740; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6741; AVX-NEXT:    vmovaps 128(%rdx), %xmm0
6742; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6743; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2]
6744; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
6745; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
6746; AVX-NEXT:    vmovaps 128(%rsi), %xmm1
6747; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6748; AVX-NEXT:    vmovaps 128(%rdi), %xmm2
6749; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6750; AVX-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
6751; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6752; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
6753; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
6754; AVX-NEXT:    vbroadcastss 132(%r8), %xmm1
6755; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
6756; AVX-NEXT:    vbroadcastss 132(%r9), %ymm1
6757; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
6758; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6759; AVX-NEXT:    vmovaps 128(%rdi), %ymm0
6760; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6761; AVX-NEXT:    vmovaps 128(%rsi), %ymm11
6762; AVX-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[1],ymm11[1],ymm0[4],ymm11[4],ymm0[5],ymm11[5]
6763; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
6764; AVX-NEXT:    vmovaps 128(%rdx), %ymm1
6765; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6766; AVX-NEXT:    vmovaps 128(%rcx), %ymm2
6767; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6768; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
6769; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
6770; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
6771; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
6772; AVX-NEXT:    vmovaps 128(%r8), %ymm1
6773; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6774; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
6775; AVX-NEXT:    vbroadcastss 144(%r9), %ymm1
6776; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
6777; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6778; AVX-NEXT:    vmovaps 160(%rcx), %xmm0
6779; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6780; AVX-NEXT:    vmovaps 160(%rdx), %xmm1
6781; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6782; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[1,2],xmm0[1,2]
6783; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
6784; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
6785; AVX-NEXT:    vmovaps 160(%rsi), %xmm7
6786; AVX-NEXT:    vmovaps 160(%rdi), %xmm6
6787; AVX-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
6788; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6789; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
6790; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
6791; AVX-NEXT:    vbroadcastss 164(%r8), %xmm1
6792; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
6793; AVX-NEXT:    vbroadcastss 164(%r9), %ymm1
6794; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
6795; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6796; AVX-NEXT:    vmovaps 160(%rdi), %ymm1
6797; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6798; AVX-NEXT:    vmovaps 160(%rsi), %ymm0
6799; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6800; AVX-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
6801; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
6802; AVX-NEXT:    vmovaps 160(%rdx), %ymm1
6803; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6804; AVX-NEXT:    vmovaps 160(%rcx), %ymm8
6805; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm8[0],ymm1[0],ymm8[2],ymm1[2]
6806; AVX-NEXT:    vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6807; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
6808; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
6809; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
6810; AVX-NEXT:    vmovaps 160(%r8), %ymm1
6811; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6812; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
6813; AVX-NEXT:    vbroadcastss 176(%r9), %ymm1
6814; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
6815; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6816; AVX-NEXT:    vmovaps 192(%rcx), %xmm1
6817; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6818; AVX-NEXT:    vmovaps 192(%rdx), %xmm0
6819; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6820; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2]
6821; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
6822; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
6823; AVX-NEXT:    vmovaps 192(%rsi), %xmm3
6824; AVX-NEXT:    vmovaps 192(%rdi), %xmm2
6825; AVX-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
6826; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6827; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
6828; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
6829; AVX-NEXT:    vbroadcastss 196(%r8), %xmm1
6830; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
6831; AVX-NEXT:    vbroadcastss 196(%r9), %ymm1
6832; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
6833; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6834; AVX-NEXT:    vmovaps 192(%rdi), %ymm1
6835; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6836; AVX-NEXT:    vmovaps 192(%rsi), %ymm0
6837; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6838; AVX-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
6839; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
6840; AVX-NEXT:    vmovaps 192(%rdx), %ymm4
6841; AVX-NEXT:    vmovaps 192(%rcx), %ymm1
6842; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6843; AVX-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2]
6844; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6845; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
6846; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
6847; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
6848; AVX-NEXT:    vmovaps 192(%r8), %ymm1
6849; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6850; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
6851; AVX-NEXT:    vbroadcastss 208(%r9), %ymm1
6852; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
6853; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6854; AVX-NEXT:    vmovaps 224(%rcx), %xmm1
6855; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6856; AVX-NEXT:    vmovaps 224(%rdx), %xmm0
6857; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6858; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2]
6859; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
6860; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
6861; AVX-NEXT:    vmovaps 224(%rsi), %xmm1
6862; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6863; AVX-NEXT:    vmovaps 224(%rdi), %xmm5
6864; AVX-NEXT:    vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6865; AVX-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm1[2],xmm5[3],xmm1[3]
6866; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6867; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
6868; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
6869; AVX-NEXT:    vbroadcastss 228(%r8), %xmm1
6870; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
6871; AVX-NEXT:    vbroadcastss 228(%r9), %ymm1
6872; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
6873; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6874; AVX-NEXT:    vmovaps 224(%rdi), %ymm9
6875; AVX-NEXT:    vmovaps 224(%rsi), %ymm0
6876; AVX-NEXT:    vunpcklps {{.*#+}} ymm14 = ymm9[0],ymm0[0],ymm9[1],ymm0[1],ymm9[4],ymm0[4],ymm9[5],ymm0[5]
6877; AVX-NEXT:    vperm2f128 {{.*#+}} ymm14 = ymm14[2,3,2,3]
6878; AVX-NEXT:    vmovaps 224(%rdx), %ymm5
6879; AVX-NEXT:    vmovaps 224(%rcx), %ymm1
6880; AVX-NEXT:    vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm5[0],ymm1[2],ymm5[2]
6881; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6882; AVX-NEXT:    vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6883; AVX-NEXT:    vextractf128 $1, %ymm13, %xmm13
6884; AVX-NEXT:    vshufps {{.*#+}} xmm13 = xmm13[0,1,2,0]
6885; AVX-NEXT:    vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3],ymm14[4,5,6,7]
6886; AVX-NEXT:    vmovaps 224(%r8), %ymm10
6887; AVX-NEXT:    vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6888; AVX-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm10[4,5],ymm13[6,7]
6889; AVX-NEXT:    vbroadcastss 240(%r9), %ymm14
6890; AVX-NEXT:    vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm14[5],ymm13[6,7]
6891; AVX-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6892; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
6893; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm14 # 32-byte Folded Reload
6894; AVX-NEXT:    # ymm14 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7]
6895; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
6896; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm10[1,2],ymm15[1,2],ymm10[5,6],ymm15[5,6]
6897; AVX-NEXT:    vperm2f128 {{.*#+}} ymm13 = ymm13[2,3,2,3]
6898; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm13[0,2,1,3,4,6,5,7]
6899; AVX-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5],ymm13[6,7]
6900; AVX-NEXT:    vbroadcastss 20(%r8), %xmm15
6901; AVX-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3],ymm13[4,5,6,7]
6902; AVX-NEXT:    vbroadcastss 20(%r9), %ymm15
6903; AVX-NEXT:    vblendps {{.*#+}} ymm12 = ymm13[0,1,2],ymm15[3],ymm13[4,5,6,7]
6904; AVX-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6905; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
6906; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload
6907; AVX-NEXT:    # ymm13 = ymm12[2],mem[2],ymm12[3],mem[3],ymm12[6],mem[6],ymm12[7],mem[7]
6908; AVX-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6909; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
6910; AVX-NEXT:    vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm15 # 32-byte Folded Reload
6911; AVX-NEXT:    # ymm15 = ymm12[1,2],mem[1,2],ymm12[5,6],mem[5,6]
6912; AVX-NEXT:    vperm2f128 {{.*#+}} ymm15 = ymm15[2,3,2,3]
6913; AVX-NEXT:    vshufps {{.*#+}} ymm15 = ymm15[0,2,1,3,4,6,5,7]
6914; AVX-NEXT:    vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm13[4,5],ymm15[6,7]
6915; AVX-NEXT:    vbroadcastss 52(%r8), %xmm12
6916; AVX-NEXT:    vblendps {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3],ymm15[4,5,6,7]
6917; AVX-NEXT:    vbroadcastss 52(%r9), %ymm15
6918; AVX-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3],ymm12[4,5,6,7]
6919; AVX-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6920; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
6921; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload
6922; AVX-NEXT:    # ymm13 = ymm12[2],mem[2],ymm12[3],mem[3],ymm12[6],mem[6],ymm12[7],mem[7]
6923; AVX-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6924; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
6925; AVX-NEXT:    vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
6926; AVX-NEXT:    # ymm12 = ymm12[1,2],mem[1,2],ymm12[5,6],mem[5,6]
6927; AVX-NEXT:    vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,2,3]
6928; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm12[0,2,1,3,4,6,5,7]
6929; AVX-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5],ymm12[6,7]
6930; AVX-NEXT:    vbroadcastss 84(%r8), %xmm15
6931; AVX-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm15[2,3],ymm12[4,5,6,7]
6932; AVX-NEXT:    vbroadcastss 84(%r9), %ymm15
6933; AVX-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3],ymm12[4,5,6,7]
6934; AVX-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6935; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
6936; AVX-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
6937; AVX-NEXT:    # xmm12 = xmm12[0],mem[0],xmm12[1],mem[1]
6938; AVX-NEXT:    vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
6939; AVX-NEXT:    # xmm15 = mem[0,0,0,0]
6940; AVX-NEXT:    vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
6941; AVX-NEXT:    # xmm13 = mem[0,0,0,0]
6942; AVX-NEXT:    vunpcklps {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1]
6943; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm15
6944; AVX-NEXT:    vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3],ymm15[4,5,6,7]
6945; AVX-NEXT:    vinsertf128 $1, 96(%r8), %ymm12, %ymm12
6946; AVX-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5],ymm13[6,7]
6947; AVX-NEXT:    vbroadcastss 96(%r9), %ymm13
6948; AVX-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm13[5],ymm12[6,7]
6949; AVX-NEXT:    vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6950; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
6951; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
6952; AVX-NEXT:    # ymm12 = ymm12[2],mem[2],ymm12[3],mem[3],ymm12[6],mem[6],ymm12[7],mem[7]
6953; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
6954; AVX-NEXT:    vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
6955; AVX-NEXT:    # ymm13 = ymm13[1,2],mem[1,2],ymm13[5,6],mem[5,6]
6956; AVX-NEXT:    vperm2f128 {{.*#+}} ymm13 = ymm13[2,3,2,3]
6957; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm13[0,2,1,3,4,6,5,7]
6958; AVX-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm12[4,5],ymm13[6,7]
6959; AVX-NEXT:    vbroadcastss 116(%r8), %xmm15
6960; AVX-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3],ymm13[4,5,6,7]
6961; AVX-NEXT:    vbroadcastss 116(%r9), %ymm15
6962; AVX-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3],ymm13[4,5,6,7]
6963; AVX-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6964; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
6965; AVX-NEXT:    vunpckhps {{.*#+}} ymm13 = ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[6],ymm11[6],ymm13[7],ymm11[7]
6966; AVX-NEXT:    vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6967; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
6968; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
6969; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm11[1,2],ymm15[1,2],ymm11[5,6],ymm15[5,6]
6970; AVX-NEXT:    vperm2f128 {{.*#+}} ymm11 = ymm11[2,3,2,3]
6971; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm11[0,2,1,3,4,6,5,7]
6972; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5],ymm11[6,7]
6973; AVX-NEXT:    vbroadcastss 148(%r8), %xmm13
6974; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm13[2,3],ymm11[4,5,6,7]
6975; AVX-NEXT:    vbroadcastss 148(%r9), %ymm13
6976; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm13[3],ymm11[4,5,6,7]
6977; AVX-NEXT:    vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6978; AVX-NEXT:    vunpcklps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
6979; AVX-NEXT:    vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
6980; AVX-NEXT:    # xmm7 = mem[0,0,0,0]
6981; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
6982; AVX-NEXT:    vshufps {{.*#+}} xmm11 = xmm13[0,0,0,0]
6983; AVX-NEXT:    vunpcklps {{.*#+}} xmm7 = xmm11[0],xmm7[0],xmm11[1],xmm7[1]
6984; AVX-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm11
6985; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm11[0,1],ymm7[2,3],ymm11[4,5,6,7]
6986; AVX-NEXT:    vinsertf128 $1, 160(%r8), %ymm6, %ymm6
6987; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5],ymm7[6,7]
6988; AVX-NEXT:    vbroadcastss 160(%r9), %ymm7
6989; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7]
6990; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6991; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6992; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload
6993; AVX-NEXT:    # ymm7 = ymm6[2],mem[2],ymm6[3],mem[3],ymm6[6],mem[6],ymm6[7],mem[7]
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[1,2],ymm8[1,2],ymm6[5,6],ymm8[5,6]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm6[2,3,2,3]
; AVX-NEXT:    vshufps {{.*#+}} ymm6 = ymm6[0,2,1,3,4,6,5,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7]
; AVX-NEXT:    vbroadcastss 180(%r8), %xmm8
; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3],ymm6[4,5,6,7]
; AVX-NEXT:    vbroadcastss 180(%r9), %ymm8
; AVX-NEXT:    vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3],ymm6[4,5,6,7]
; AVX-NEXT:    vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX-NEXT:    vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; AVX-NEXT:    # xmm3 = mem[0,0,0,0]
; AVX-NEXT:    vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
; AVX-NEXT:    # xmm6 = mem[0,0,0,0]
; AVX-NEXT:    vunpcklps {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm6
; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3],ymm6[4,5,6,7]
; AVX-NEXT:    vinsertf128 $1, 192(%r8), %ymm2, %ymm2
; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7]
; AVX-NEXT:    vbroadcastss 192(%r9), %ymm3
; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload
; AVX-NEXT:    # ymm6 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7]
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm4[1,2],ymm2[1,2],ymm4[5,6],ymm2[5,6]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3]
; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[0,2,1,3,4,6,5,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7]
; AVX-NEXT:    vbroadcastss 212(%r8), %xmm4
; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7]
; AVX-NEXT:    vbroadcastss 212(%r9), %ymm4
; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7]
; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vunpckhps {{.*#+}} ymm3 = ymm9[2],ymm0[2],ymm9[3],ymm0[3],ymm9[6],ymm0[6],ymm9[7],ymm0[7]
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm5[1,2],ymm1[1,2],ymm5[5,6],ymm1[5,6]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7]
; AVX-NEXT:    vmovaps %ymm3, %ymm5
; AVX-NEXT:    vbroadcastss 244(%r8), %xmm1
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX-NEXT:    vbroadcastss 244(%r9), %ymm1
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vbroadcastss (%rcx), %xmm0
; AVX-NEXT:    vbroadcastss (%rdx), %xmm1
; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm3
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7]
; AVX-NEXT:    vinsertf128 $1, (%r8), %ymm1, %ymm1
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX-NEXT:    vbroadcastss (%r9), %ymm1
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = mem[2,1,3,3]
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX-NEXT:    vmovaps (%r9), %xmm1
; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm1[0,2,2,3]
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7]
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm10[3,0],ymm0[7,4],ymm10[7,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3]
; AVX-NEXT:    vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
; AVX-NEXT:    # ymm1 = mem[2,3,2,3]
; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3]
; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7]
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vbroadcastss 32(%rcx), %xmm0
; AVX-NEXT:    vbroadcastss 32(%rdx), %xmm1
; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm3
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7]
; AVX-NEXT:    vinsertf128 $1, 32(%r8), %ymm1, %ymm1
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX-NEXT:    vbroadcastss 32(%r9), %ymm1
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = mem[2,1,3,3]
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX-NEXT:    vmovaps 32(%r9), %xmm1
; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm1[0,2,2,3]
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7]
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX-NEXT:    # ymm0 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
; AVX-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX-NEXT:    # ymm0 = mem[2,3],ymm0[2,3]
; AVX-NEXT:    vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
; AVX-NEXT:    # ymm1 = mem[2,3,2,3]
; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3]
; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7]
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vbroadcastss 64(%rcx), %xmm0
; AVX-NEXT:    vbroadcastss 64(%rdx), %xmm1
; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm3
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7]
; AVX-NEXT:    vinsertf128 $1, 64(%r8), %ymm1, %ymm1
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX-NEXT:    vbroadcastss 64(%r9), %ymm1
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
; AVX-NEXT:    # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX-NEXT:    vpermilps {{.*#+}} xmm3 = mem[2,1,3,3]
; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm3
; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5],ymm3[6,7]
; AVX-NEXT:    vmovaps 64(%r9), %xmm3
; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm3[0,2,2,3]
; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm3[1],ymm1[2,3,4,5,6],ymm3[7]
; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
; AVX-NEXT:    # ymm3 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7]
; AVX-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
; AVX-NEXT:    # ymm3 = mem[2,3],ymm3[2,3]
; AVX-NEXT:    vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
; AVX-NEXT:    # ymm4 = mem[2,3,2,3]
; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5],ymm4[6,7]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm4 = mem[2,3,2,3]
; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0],ymm4[1],ymm3[2,3,4,5,6],ymm4[7]
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
; AVX-NEXT:    # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3]
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
; AVX-NEXT:    vpermilps {{.*#+}} xmm4 = mem[2,1,3,3]
; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm4, %ymm4
; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5],ymm4[6,7]
; AVX-NEXT:    vmovaps 96(%r9), %xmm4
; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm4[0,2,2,3]
; AVX-NEXT:    vinsertf128 $1, %xmm4, %ymm8, %ymm4
; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4,5,6],ymm4[7]
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
; AVX-NEXT:    # ymm4 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3]
; AVX-NEXT:    vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
; AVX-NEXT:    # ymm8 = mem[2,3,2,3]
; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm8[2,1,3,3,6,5,7,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3,4,5],ymm8[6,7]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm8 = mem[2,3,2,3]
; AVX-NEXT:    vshufps {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0],ymm8[1],ymm4[2,3,4,5,6],ymm8[7]
; AVX-NEXT:    vbroadcastss 128(%rcx), %xmm8
; AVX-NEXT:    vbroadcastss 128(%rdx), %xmm9
; AVX-NEXT:    vunpcklps {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload
; AVX-NEXT:    # xmm9 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm11
; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3],ymm11[4,5,6,7]
; AVX-NEXT:    vinsertf128 $1, 128(%r8), %ymm9, %ymm9
; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7]
; AVX-NEXT:    vbroadcastss 128(%r9), %ymm9
; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7]
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload
; AVX-NEXT:    # xmm9 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX-NEXT:    vshufps {{.*#+}} xmm9 = xmm9[2,3,2,3]
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm9
; AVX-NEXT:    vpermilps {{.*#+}} xmm11 = mem[2,1,3,3]
; AVX-NEXT:    vinsertf128 $1, %xmm11, %ymm11, %ymm11
; AVX-NEXT:    vblendps {{.*#+}} ymm9 = ymm11[0,1],ymm9[2,3,4,5],ymm11[6,7]
; AVX-NEXT:    vmovaps 128(%r9), %xmm11
; AVX-NEXT:    vshufps {{.*#+}} xmm12 = xmm11[0,2,2,3]
; AVX-NEXT:    vinsertf128 $1, %xmm11, %ymm12, %ymm11
; AVX-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0],ymm11[1],ymm9[2,3,4,5,6],ymm11[7]
; AVX-NEXT:    vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm11 # 32-byte Folded Reload
; AVX-NEXT:    # ymm11 = ymm15[3,0],mem[3,0],ymm15[7,4],mem[7,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm11[2,0,2,3,6,4,6,7]
; AVX-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload
; AVX-NEXT:    # ymm10 = mem[2,3],ymm11[2,3]
; AVX-NEXT:    vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
; AVX-NEXT:    # ymm11 = mem[2,3,2,3]
; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm11[2,1,3,3,6,5,7,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3,4,5],ymm11[6,7]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm11 = mem[2,3,2,3]
; AVX-NEXT:    vshufps {{.*#+}} ymm11 = ymm11[0,2,2,3,4,6,6,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3,4,5,6],ymm11[7]
; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm11 # 16-byte Folded Reload
; AVX-NEXT:    # xmm11 = xmm13[2],mem[2],xmm13[3],mem[3]
; AVX-NEXT:    vshufps {{.*#+}} xmm11 = xmm11[2,3,2,3]
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vinsertf128 $1, %xmm11, %ymm0, %ymm11
; AVX-NEXT:    vpermilps {{.*#+}} xmm12 = mem[2,1,3,3]
; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm12, %ymm12
; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3,4,5],ymm12[6,7]
; AVX-NEXT:    vmovaps 160(%r9), %xmm12
; AVX-NEXT:    vshufps {{.*#+}} xmm13 = xmm12[0,2,2,3]
; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm13, %ymm12
; AVX-NEXT:    vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4,5,6],ymm12[7]
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
; AVX-NEXT:    # ymm12 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm12[2,0,2,3,6,4,6,7]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm12[2,3]
; AVX-NEXT:    vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
; AVX-NEXT:    # ymm12 = mem[2,3,2,3]
; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm12[2,1,3,3,6,5,7,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm12[0,1],ymm7[2,3,4,5],ymm12[6,7]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm12 = mem[2,3,2,3]
; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm12[0,2,2,3,4,6,6,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm7[0],ymm12[1],ymm7[2,3,4,5,6],ymm12[7]
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload
; AVX-NEXT:    # xmm12 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX-NEXT:    vshufps {{.*#+}} xmm12 = xmm12[2,3,2,3]
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm12
; AVX-NEXT:    vpermilps {{.*#+}} xmm13 = mem[2,1,3,3]
; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm13, %ymm13
; AVX-NEXT:    vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3,4,5],ymm13[6,7]
; AVX-NEXT:    vmovaps 192(%r9), %xmm13
; AVX-NEXT:    vshufps {{.*#+}} xmm14 = xmm13[0,2,2,3]
; AVX-NEXT:    vinsertf128 $1, %xmm13, %ymm14, %ymm13
; AVX-NEXT:    vblendps {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3,4,5,6],ymm13[7]
; AVX-NEXT:    vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload
; AVX-NEXT:    # ymm13 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm6[2,3],ymm13[2,3]
; AVX-NEXT:    vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
; AVX-NEXT:    # ymm13 = mem[2,3,2,3]
; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm13[2,1,3,3,6,5,7,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm13[0,1],ymm2[2,3,4,5],ymm13[6,7]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm13 = mem[2,3,2,3]
; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm13[0,2,2,3,4,6,6,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm13[1],ymm2[2,3,4,5,6],ymm13[7]
; AVX-NEXT:    vbroadcastss 224(%rcx), %xmm13
; AVX-NEXT:    vbroadcastss 224(%rdx), %xmm14
; AVX-NEXT:    vunpcklps {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload
; AVX-NEXT:    # xmm14 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm0, %ymm15
; AVX-NEXT:    vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3],ymm15[4,5,6,7]
; AVX-NEXT:    vinsertf128 $1, 224(%r8), %ymm14, %ymm14
; AVX-NEXT:    vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7]
; AVX-NEXT:    vbroadcastss 224(%r9), %ymm14
; AVX-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5],ymm13[6,7]
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload
; AVX-NEXT:    # xmm14 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX-NEXT:    vshufps {{.*#+}} xmm14 = xmm14[2,3,2,3]
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vinsertf128 $1, %xmm14, %ymm0, %ymm14
; AVX-NEXT:    vpermilps {{.*#+}} xmm15 = mem[2,1,3,3]
; AVX-NEXT:    vinsertf128 $1, %xmm15, %ymm15, %ymm15
; AVX-NEXT:    vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5],ymm15[6,7]
; AVX-NEXT:    vmovaps 224(%r9), %xmm15
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm15[0,2,2,3]
; AVX-NEXT:    vinsertf128 $1, %xmm15, %ymm0, %ymm0
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm14[0],ymm0[1],ymm14[2,3,4,5,6],ymm0[7]
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT:    vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm14 # 32-byte Folded Reload
; AVX-NEXT:    # ymm14 = ymm6[3,0],mem[3,0],ymm6[7,4],mem[7,4]
; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm14[2,3]
; AVX-NEXT:    vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
; AVX-NEXT:    # ymm14 = mem[2,3,2,3]
; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm14[2,1,3,3,6,5,7,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm14[0,1],ymm5[2,3,4,5],ymm14[6,7]
; AVX-NEXT:    vperm2f128 {{.*#+}} ymm14 = mem[2,3,2,3]
; AVX-NEXT:    vshufps {{.*#+}} ymm14 = ymm14[0,2,2,3,4,6,6,7]
; AVX-NEXT:    vblendps {{.*#+}} ymm5 = ymm5[0],ymm14[1],ymm5[2,3,4,5,6],ymm14[7]
; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT:    vmovaps %ymm5, 1504(%rax)
; AVX-NEXT:    vmovaps %ymm0, 1408(%rax)
; AVX-NEXT:    vmovaps %ymm13, 1344(%rax)
; AVX-NEXT:    vmovaps %ymm2, 1312(%rax)
; AVX-NEXT:    vmovaps %ymm12, 1216(%rax)
; AVX-NEXT:    vmovaps %ymm7, 1120(%rax)
; AVX-NEXT:    vmovaps %ymm11, 1024(%rax)
; AVX-NEXT:    vmovaps %ymm10, 928(%rax)
; AVX-NEXT:    vmovaps %ymm9, 832(%rax)
; AVX-NEXT:    vmovaps %ymm8, 768(%rax)
; AVX-NEXT:    vmovaps %ymm4, 736(%rax)
; AVX-NEXT:    vmovaps %ymm3, 640(%rax)
; AVX-NEXT:    vmovaps %ymm1, 544(%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 448(%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 384(%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 352(%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 256(%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 192(%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 160(%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 64(%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, (%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 1472(%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 1280(%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 1152(%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 1088(%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 960(%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 896(%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 704(%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 576(%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 512(%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 320(%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 128(%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 1440(%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 1376(%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 1248(%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 1184(%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 1056(%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 992(%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 864(%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 800(%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 672(%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 608(%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 480(%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 416(%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 288(%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 224(%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 96(%rax)
; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT:    vmovaps %ymm0, 32(%rax)
; AVX-NEXT:    addq $2504, %rsp # imm = 0x9C8
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX2-LABEL: store_i32_stride6_vf64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    subq $2504, %rsp # imm = 0x9C8
; AVX2-NEXT:    vmovdqa (%rsi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovdqa 32(%rsi), %xmm1
; AVX2-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovdqa (%rdi), %xmm3
; AVX2-NEXT:    vmovdqa 32(%rdi), %xmm10
; AVX2-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm4
; AVX2-NEXT:    vmovdqa (%rcx), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovdqa 32(%rcx), %xmm8
; AVX2-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovdqa 64(%rcx), %xmm7
; AVX2-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[1,2,2,3]
; AVX2-NEXT:    vmovdqa (%rdx), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovdqa 32(%rdx), %xmm9
; AVX2-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[1,2,2,3]
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5],ymm5[6,7]
; AVX2-NEXT:    vmovdqa (%r8), %xmm2
; AVX2-NEXT:    vmovdqa 32(%r8), %xmm6
; AVX2-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero
; AVX2-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7]
; AVX2-NEXT:    vpbroadcastd 4(%r9), %ymm5
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7]
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm10[2],xmm1[2],xmm10[3],xmm1[3]
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm8[1,2,2,3]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm9[1,2,2,3]
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm5
; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7]
; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm6[0],zero,xmm6[1],zero
; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7]
; AVX2-NEXT:    vpbroadcastd 36(%r9), %ymm5
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7]
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovdqa 64(%rdx), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm7[1,2,2,3]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[1,2,2,3]
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm4[0,1,2,1]
; AVX2-NEXT:    vmovdqa 64(%rsi), %xmm4
; AVX2-NEXT:    vmovdqa 64(%rdi), %xmm5
; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm7
; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7]
; AVX2-NEXT:    vmovdqa 64(%r8), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero
; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7]
; AVX2-NEXT:    vpbroadcastd 68(%r9), %ymm7
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm7[3],ymm6[4,5,6,7]
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovdqa 96(%rcx), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[1,2,2,3]
; AVX2-NEXT:    vmovdqa 96(%rdx), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpshufd {{.*#+}} xmm7 = xmm0[1,2,2,3]
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm6[0,1,2,1]
; AVX2-NEXT:    vmovdqa 96(%rsi), %xmm6
; AVX2-NEXT:    vmovdqa 96(%rdi), %xmm7
; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm9
; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7]
; AVX2-NEXT:    vmovdqa 96(%r8), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm9 = xmm0[0],zero,xmm0[1],zero
; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7]
; AVX2-NEXT:    vpbroadcastd 100(%r9), %ymm9
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm9[3],ymm8[4,5,6,7]
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovdqa 128(%rcx), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpshufd {{.*#+}} xmm8 = xmm0[1,2,2,3]
; AVX2-NEXT:    vmovdqa 128(%rdx), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpshufd {{.*#+}} xmm9 = xmm0[1,2,2,3]
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
; AVX2-NEXT:    vpermq {{.*#+}} ymm10 = ymm8[0,1,2,1]
; AVX2-NEXT:    vmovdqa 128(%rsi), %xmm8
; AVX2-NEXT:    vmovdqa 128(%rdi), %xmm9
; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm9[2],xmm8[2],xmm9[3],xmm8[3]
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm11
; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7]
; AVX2-NEXT:    vmovdqa 128(%r8), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm11 = xmm0[0],zero,xmm0[1],zero
; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7]
; AVX2-NEXT:    vpbroadcastd 132(%r9), %ymm11
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5,6,7]
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovdqa 160(%rcx), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpshufd {{.*#+}} xmm10 = xmm0[1,2,2,3]
; AVX2-NEXT:    vmovdqa 160(%rdx), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpshufd {{.*#+}} xmm11 = xmm0[1,2,2,3]
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
; AVX2-NEXT:    vpermq {{.*#+}} ymm12 = ymm10[0,1,2,1]
; AVX2-NEXT:    vmovdqa 160(%rsi), %xmm10
; AVX2-NEXT:    vmovdqa 160(%rdi), %xmm11
; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm11[2],xmm10[2],xmm11[3],xmm10[3]
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm13
; AVX2-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5],ymm12[6,7]
; AVX2-NEXT:    vmovdqa 160(%r8), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm13 = xmm0[0],zero,xmm0[1],zero
; AVX2-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7]
; AVX2-NEXT:    vpbroadcastd 164(%r9), %ymm13
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm13[3],ymm12[4,5,6,7]
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovdqa 192(%rcx), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpshufd {{.*#+}} xmm12 = xmm0[1,2,2,3]
; AVX2-NEXT:    vmovdqa 192(%rdx), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpshufd {{.*#+}} xmm13 = xmm0[1,2,2,3]
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
; AVX2-NEXT:    vpermq {{.*#+}} ymm14 = ymm12[0,1,2,1]
; AVX2-NEXT:    vmovdqa 192(%rsi), %xmm12
; AVX2-NEXT:    vmovdqa 192(%rdi), %xmm13
; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm13[2],xmm12[2],xmm13[3],xmm12[3]
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm15
; AVX2-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7]
; AVX2-NEXT:    vmovdqa 192(%r8), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm15 = xmm0[0],zero,xmm0[1],zero
; AVX2-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7]
; AVX2-NEXT:    vpbroadcastd 196(%r9), %ymm15
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7]
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovdqa 224(%rcx), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpshufd {{.*#+}} xmm14 = xmm0[1,2,2,3]
; AVX2-NEXT:    vmovdqa 224(%rdx), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpshufd {{.*#+}} xmm15 = xmm0[1,2,2,3]
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm14[0,1,2,1]
; AVX2-NEXT:    vmovdqa 224(%rsi), %xmm14
; AVX2-NEXT:    vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovdqa 224(%rdi), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm14 = xmm0[2],xmm14[2],xmm0[3],xmm14[3]
; AVX2-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm14[4,5],ymm1[6,7]
; AVX2-NEXT:    vmovdqa 224(%r8), %xmm14
; AVX2-NEXT:    vmovdqa %xmm14, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7]
; AVX2-NEXT:    vpbroadcastd 228(%r9), %ymm14
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3],ymm0[4,5,6,7]
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vpbroadcastd (%rcx), %xmm0
; AVX2-NEXT:    vpbroadcastd (%rdx), %xmm14
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1]
; AVX2-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm1 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm1 = xmm3[0],mem[0],xmm3[1],mem[1]
; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
; AVX2-NEXT:    vpbroadcastq %xmm2, %ymm1
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-NEXT:    vmovdqa (%r9), %xmm1
; AVX2-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpbroadcastd %xmm1, %ymm1
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovdqa (%rdx), %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovdqa (%rcx), %ymm1
; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,2,4,5,6,6]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm14 = ymm0[1,1,2,3,5,5,6,7]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm14[0],ymm2[1],ymm14[2],ymm2[3],ymm14[4],ymm2[5],ymm14[6],ymm2[7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3]
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovdqa (%rsi), %ymm1
; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7]
; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm14 = mem[0],zero,mem[1],zero
; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm14[2,3],ymm2[4,5,6,7]
; AVX2-NEXT:    vpbroadcastd 20(%r9), %ymm14
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm14[3],ymm2[4,5,6,7]
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vbroadcastss 32(%rcx), %xmm2
; AVX2-NEXT:    vbroadcastss 32(%rdx), %xmm14
; AVX2-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm14[0],xmm2[0],xmm14[1],xmm2[1]
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm3 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1]
; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7]
; AVX2-NEXT:    vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload
; AVX2-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7]
; AVX2-NEXT:    vmovaps 32(%r9), %xmm0
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vbroadcastss %xmm0, %ymm3
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovdqa 32(%rdx), %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovdqa 32(%rcx), %ymm3
; AVX2-NEXT:    vpshufd {{.*#+}} ymm14 = ymm3[0,1,2,2,4,5,6,6]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm15 = ymm0[1,1,2,3,5,5,6,7]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovdqa 32(%rsi), %ymm1
; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7]
; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero
; AVX2-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7]
; AVX2-NEXT:    vpbroadcastd 52(%r9), %ymm15
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7]
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vpbroadcastd 64(%rcx), %xmm14
; AVX2-NEXT:    vpbroadcastd 64(%rdx), %xmm15
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm14[2,3],ymm4[4,5,6,7]
; AVX2-NEXT:    vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload
; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7]
; AVX2-NEXT:    vmovdqa 64(%r9), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpbroadcastd %xmm0, %ymm5
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7]
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovdqa 64(%rdx), %ymm5
; AVX2-NEXT:    vmovdqa 64(%rcx), %ymm4
; AVX2-NEXT:    vpshufd {{.*#+}} ymm14 = ymm4[0,1,2,2,4,5,6,6]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm15 = ymm5[1,1,2,3,5,5,6,7]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovdqa 64(%rsi), %ymm1
; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7]
; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero
; AVX2-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7]
; AVX2-NEXT:    vpbroadcastd 84(%r9), %ymm15
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7]
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
; AVX2-NEXT:    vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm14[2,3],ymm6[4,5,6,7]
; AVX2-NEXT:    vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload
; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7]
; AVX2-NEXT:    vmovdqa 96(%r9), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpbroadcastd %xmm0, %ymm7
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7]
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovdqa 96(%rdx), %ymm7
; AVX2-NEXT:    vmovdqa 96(%rcx), %ymm6
; AVX2-NEXT:    vpshufd {{.*#+}} ymm14 = ymm6[0,1,2,2,4,5,6,6]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm15 = ymm7[1,1,2,3,5,5,6,7]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovdqa 96(%rsi), %ymm1
; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7]
; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero
; AVX2-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7]
; AVX2-NEXT:    vpbroadcastd 116(%r9), %ymm15
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7]
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vpbroadcastd 128(%rcx), %xmm14
; AVX2-NEXT:    vpbroadcastd 128(%rdx), %xmm15
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm14[2,3],ymm8[4,5,6,7]
; AVX2-NEXT:    vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 16-byte Folded Reload
; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7]
; AVX2-NEXT:    vmovdqa 128(%r9), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpbroadcastd %xmm0, %ymm9
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7]
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovdqa 128(%rdx), %ymm9
; AVX2-NEXT:    vmovdqa 128(%rcx), %ymm8
; AVX2-NEXT:    vpshufd {{.*#+}} ymm14 = ymm8[0,1,2,2,4,5,6,6]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm15 = ymm9[1,1,2,3,5,5,6,7]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
; AVX2-NEXT:    vmovdqa 128(%rdi), %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovdqa 128(%rsi), %ymm1
; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7]
; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero
; AVX2-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7]
; AVX2-NEXT:    vpbroadcastd 148(%r9), %ymm15
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7]
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
; AVX2-NEXT:    vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
; AVX2-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,1,2,1]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm14[2,3],ymm10[4,5,6,7]
; AVX2-NEXT:    vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 16-byte Folded Reload
; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7]
; AVX2-NEXT:    vmovdqa 160(%r9), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpbroadcastd %xmm0, %ymm11
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm11[5],ymm10[6,7]
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovdqa 160(%rdx), %ymm11
; AVX2-NEXT:    vmovdqa 160(%rcx), %ymm10
; AVX2-NEXT:    vpshufd {{.*#+}} ymm14 = ymm10[0,1,2,2,4,5,6,6]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm15 = ymm11[1,1,2,3,5,5,6,7]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
; AVX2-NEXT:    vmovdqa 160(%rdi), %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovdqa 160(%rsi), %ymm1
; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7]
; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero
; AVX2-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7]
; AVX2-NEXT:    vpbroadcastd 180(%r9), %ymm15
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7]
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
; AVX2-NEXT:    vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
; AVX2-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,1,2,1]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm14[2,3],ymm12[4,5,6,7]
; AVX2-NEXT:    vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 16-byte Folded Reload
; AVX2-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5],ymm12[6,7]
; AVX2-NEXT:    vmovdqa 192(%r9), %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vpbroadcastd %xmm0, %ymm13
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm13[5],ymm12[6,7]
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovdqa 192(%rdx), %ymm13
; AVX2-NEXT:    vmovdqa 192(%rcx), %ymm12
; AVX2-NEXT:    vpshufd {{.*#+}} ymm14 = ymm12[0,1,2,2,4,5,6,6]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm15 = ymm13[1,1,2,3,5,5,6,7]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
; AVX2-NEXT:    vmovdqa 192(%rdi), %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovdqa 192(%rsi), %ymm1
; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7]
; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero
; AVX2-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7]
; AVX2-NEXT:    vpbroadcastd 212(%r9), %ymm15
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7]
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vbroadcastss 224(%rcx), %xmm14
; AVX2-NEXT:    vbroadcastss 224(%rdx), %xmm15
; AVX2-NEXT:    vunpcklps {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm15 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1]
; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7]
; AVX2-NEXT:    vbroadcastsd (%rsp), %ymm15 # 16-byte Folded Reload
; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7]
; AVX2-NEXT:    vmovaps 224(%r9), %xmm0
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vbroadcastss %xmm0, %ymm15
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7]
; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovdqa 224(%rdx), %ymm14
; AVX2-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovdqa 224(%rcx), %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,2,4,5,6,6]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm15 = ymm14[1,1,2,3,5,5,6,7]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0],ymm0[1],ymm15[2],ymm0[3],ymm15[4],ymm0[5],ymm15[6],ymm0[7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,1,2,3]
; AVX2-NEXT:    vmovdqa 224(%rdi), %ymm14
; AVX2-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovdqa 224(%rsi), %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm14 = ymm14[2],ymm0[2],ymm14[3],ymm0[3],ymm14[6],ymm0[6],ymm14[7],ymm0[7]
; AVX2-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm14[4,5],ymm1[6,7]
; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7]
; AVX2-NEXT:    vpbroadcastd 244(%r9), %ymm15
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3],ymm0[4,5,6,7]
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm14, %ymm0
; AVX2-NEXT:    vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm15 = mem[2,2,3,3]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1]
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3,4,5],ymm15[6,7]
; AVX2-NEXT:    vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm15 = mem[2,2,3,3]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1]
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4,5,6],ymm15[7]
; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT:    vunpcklps {{.*#+}} ymm15 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,1,2,3]
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7]
; AVX2-NEXT:    vmovaps (%r8), %ymm15
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7]
; AVX2-NEXT:    vbroadcastss 16(%r9), %ymm14
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7]
; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7]
; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT:    # ymm0 = mem[2,3],ymm0[2,3]
; AVX2-NEXT:    vshufps {{.*#+}} ymm1 = ymm15[2,1,3,3,6,5,7,7]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX2-NEXT:    vpermilps {{.*#+}} ymm1 = mem[0,2,2,3,4,6,6,7]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7]
; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT:    vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm1 = mem[2,2,3,3]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX2-NEXT:    vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm1 = mem[2,2,3,3]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7]
; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX2-NEXT:    vmovdqa 32(%r8), %ymm1
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-NEXT:    vpbroadcastd 48(%r9), %ymm14
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7]
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-NEXT:    vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT:    # ymm0 = mem[2,3],ymm0[2,3]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = mem[0,2,2,3,4,6,6,7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7]
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT:    vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm1 = mem[2,2,3,3]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX2-NEXT:    vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm1 = mem[2,2,3,3]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7]
; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm1 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5]
; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX2-NEXT:    vmovdqa 64(%r8), %ymm1
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-NEXT:    vpbroadcastd 80(%r9), %ymm2
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-NEXT:    vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT:    # ymm0 = mem[2,3],ymm0[2,3]
7940; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
7941; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
7942; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
7943; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = mem[0,2,2,3,4,6,6,7]
7944; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
7945; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7]
7946; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7947; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7948; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
7949; AVX2-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
7950; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
7951; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7952; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
7953; AVX2-NEXT:    vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7954; AVX2-NEXT:    # xmm1 = mem[2,2,3,3]
7955; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
7956; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
7957; AVX2-NEXT:    vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7958; AVX2-NEXT:    # xmm1 = mem[2,2,3,3]
7959; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
7960; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7]
7961; AVX2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7962; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7963; AVX2-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
7964; AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
7965; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm1 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5]
7966; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
7967; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
7968; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
7969; AVX2-NEXT:    vmovdqa 96(%r8), %ymm1
7970; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
7971; AVX2-NEXT:    vpbroadcastd 112(%r9), %ymm2
7972; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
7973; AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7974; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7]
7975; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
7976; AVX2-NEXT:    vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
7977; AVX2-NEXT:    # ymm0 = mem[2,3],ymm0[2,3]
7978; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
7979; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
7980; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
7981; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = mem[0,2,2,3,4,6,6,7]
7982; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
7983; AVX2-NEXT:    vpblendd {{.*#+}} ymm6 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7]
7984; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7985; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
7986; AVX2-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
7987; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
7988; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7989; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
7990; AVX2-NEXT:    vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7991; AVX2-NEXT:    # xmm1 = mem[2,2,3,3]
7992; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
7993; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
7994; AVX2-NEXT:    vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7995; AVX2-NEXT:    # xmm1 = mem[2,2,3,3]
7996; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
7997; AVX2-NEXT:    vblendps {{.*#+}} ymm7 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7]
7998; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7999; AVX2-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
8000; AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
8001; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm1 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5]
8002; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
8003; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
8004; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
8005; AVX2-NEXT:    vmovdqa 128(%r8), %ymm1
8006; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
8007; AVX2-NEXT:    vpbroadcastd 144(%r9), %ymm2
8008; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
8009; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7]
8010; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
8011; AVX2-NEXT:    vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
8012; AVX2-NEXT:    # ymm0 = mem[2,3],ymm0[2,3]
8013; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
8014; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
8015; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
8016; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = mem[0,2,2,3,4,6,6,7]
8017; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
8018; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7]
8019; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8020; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
8021; AVX2-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
8022; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
8023; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8024; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
8025; AVX2-NEXT:    vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8026; AVX2-NEXT:    # xmm1 = mem[2,2,3,3]
8027; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
8028; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
8029; AVX2-NEXT:    vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8030; AVX2-NEXT:    # xmm1 = mem[2,2,3,3]
8031; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
8032; AVX2-NEXT:    vblendps {{.*#+}} ymm9 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7]
8033; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8034; AVX2-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
8035; AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
8036; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm1 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[4],ymm10[4],ymm11[5],ymm10[5]
8037; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
8038; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
8039; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
8040; AVX2-NEXT:    vmovdqa 160(%r8), %ymm14
8041; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5],ymm0[6,7]
8042; AVX2-NEXT:    vpbroadcastd 176(%r9), %ymm1
8043; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
8044; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[6],ymm10[6],ymm11[7],ymm10[7]
8045; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
8046; AVX2-NEXT:    vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
8047; AVX2-NEXT:    # ymm0 = mem[2,3],ymm0[2,3]
8048; AVX2-NEXT:    vpshufd {{.*#+}} ymm10 = ymm14[2,1,3,3,6,5,7,7]
8049; AVX2-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3]
8050; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,3,4,5],ymm10[6,7]
8051; AVX2-NEXT:    vpshufd {{.*#+}} ymm10 = mem[0,2,2,3,4,6,6,7]
8052; AVX2-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3]
8053; AVX2-NEXT:    vpblendd {{.*#+}} ymm10 = ymm0[0],ymm10[1],ymm0[2,3,4,5,6],ymm10[7]
8054; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8055; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
8056; AVX2-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
8057; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
8058; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
8059; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm4, %ymm0
8060; AVX2-NEXT:    vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
8061; AVX2-NEXT:    # xmm11 = mem[2,2,3,3]
8062; AVX2-NEXT:    vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1]
8063; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm0[2,3,4,5],ymm11[6,7]
8064; AVX2-NEXT:    vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
8065; AVX2-NEXT:    # xmm11 = mem[2,2,3,3]
8066; AVX2-NEXT:    vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1]
8067; AVX2-NEXT:    vblendps {{.*#+}} ymm11 = ymm0[0],ymm11[1],ymm0[2,3,4,5,6],ymm11[7]
8068; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8069; AVX2-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
8070; AVX2-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
8071; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm14 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[4],ymm12[4],ymm13[5],ymm12[5]
8072; AVX2-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2]
8073; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
8074; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7]
8075; AVX2-NEXT:    vmovdqa 192(%r8), %ymm14
8076; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5],ymm0[6,7]
8077; AVX2-NEXT:    vpbroadcastd 208(%r9), %ymm15
8078; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7]
8079; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm12 = ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[6],ymm12[6],ymm13[7],ymm12[7]
8080; AVX2-NEXT:    vpshufd {{.*#+}} ymm12 = ymm12[2,3,2,3,6,7,6,7]
8081; AVX2-NEXT:    vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
8082; AVX2-NEXT:    # ymm12 = mem[2,3],ymm12[2,3]
8083; AVX2-NEXT:    vpshufd {{.*#+}} ymm13 = ymm14[2,1,3,3,6,5,7,7]
8084; AVX2-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3]
8085; AVX2-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3,4,5],ymm13[6,7]
8086; AVX2-NEXT:    vpshufd {{.*#+}} ymm13 = mem[0,2,2,3,4,6,6,7]
8087; AVX2-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3]
8088; AVX2-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3,4,5,6],ymm13[7]
8089; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8090; AVX2-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm13 # 16-byte Folded Reload
8091; AVX2-NEXT:    # xmm13 = xmm4[2],mem[2],xmm4[3],mem[3]
8092; AVX2-NEXT:    vshufps {{.*#+}} xmm13 = xmm13[2,3,2,3]
8093; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
8094; AVX2-NEXT:    vinsertf128 $1, %xmm13, %ymm4, %ymm13
8095; AVX2-NEXT:    vpermilps $250, (%rsp), %xmm14 # 16-byte Folded Reload
8096; AVX2-NEXT:    # xmm14 = mem[2,2,3,3]
8097; AVX2-NEXT:    vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1]
8098; AVX2-NEXT:    vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5],ymm14[6,7]
8099; AVX2-NEXT:    vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
8100; AVX2-NEXT:    # xmm14 = mem[2,2,3,3]
8101; AVX2-NEXT:    vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1]
8102; AVX2-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4,5,6],ymm14[7]
8103; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8104; AVX2-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
8105; AVX2-NEXT:    # ymm14 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
8106; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8107; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8108; AVX2-NEXT:    vunpcklps {{.*#+}} ymm15 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
8109; AVX2-NEXT:    vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2]
8110; AVX2-NEXT:    vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3]
8111; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7]
8112; AVX2-NEXT:    vmovaps 224(%r8), %ymm15
8113; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7]
8114; AVX2-NEXT:    vbroadcastss 240(%r9), %ymm4
8115; AVX2-NEXT:    vblendps {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm4[5],ymm14[6,7]
8116; AVX2-NEXT:    vunpckhps {{.*#+}} ymm14 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
8117; AVX2-NEXT:    vshufps {{.*#+}} ymm14 = ymm14[2,3,2,3,6,7,6,7]
8118; AVX2-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
8119; AVX2-NEXT:    # ymm14 = mem[2,3],ymm14[2,3]
8120; AVX2-NEXT:    vshufps {{.*#+}} ymm15 = ymm15[2,1,3,3,6,5,7,7]
8121; AVX2-NEXT:    vpermpd {{.*#+}} ymm15 = ymm15[2,1,2,3]
8122; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5],ymm15[6,7]
8123; AVX2-NEXT:    vpermilps {{.*#+}} ymm15 = mem[0,2,2,3,4,6,6,7]
8124; AVX2-NEXT:    vpermpd {{.*#+}} ymm15 = ymm15[2,1,2,3]
8125; AVX2-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6],ymm15[7]
8126; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
8127; AVX2-NEXT:    vmovaps %ymm14, 1504(%rax)
8128; AVX2-NEXT:    vmovaps %ymm4, 1440(%rax)
8129; AVX2-NEXT:    vmovaps %ymm13, 1408(%rax)
8130; AVX2-NEXT:    vmovdqa %ymm12, 1312(%rax)
8131; AVX2-NEXT:    vmovdqa %ymm2, 1248(%rax)
8132; AVX2-NEXT:    vmovaps %ymm11, 1216(%rax)
8133; AVX2-NEXT:    vmovdqa %ymm10, 1120(%rax)
8134; AVX2-NEXT:    vmovdqa %ymm3, 1056(%rax)
8135; AVX2-NEXT:    vmovaps %ymm9, 1024(%rax)
8136; AVX2-NEXT:    vmovdqa %ymm8, 928(%rax)
8137; AVX2-NEXT:    vmovdqa %ymm5, 864(%rax)
8138; AVX2-NEXT:    vmovaps %ymm7, 832(%rax)
8139; AVX2-NEXT:    vmovdqa %ymm6, 736(%rax)
8140; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8141; AVX2-NEXT:    vmovaps %ymm0, 672(%rax)
8142; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8143; AVX2-NEXT:    vmovaps %ymm0, 640(%rax)
8144; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8145; AVX2-NEXT:    vmovaps %ymm0, 544(%rax)
8146; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8147; AVX2-NEXT:    vmovaps %ymm0, 480(%rax)
8148; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8149; AVX2-NEXT:    vmovaps %ymm0, 448(%rax)
8150; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8151; AVX2-NEXT:    vmovaps %ymm0, 352(%rax)
8152; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8153; AVX2-NEXT:    vmovaps %ymm0, 288(%rax)
8154; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8155; AVX2-NEXT:    vmovaps %ymm0, 256(%rax)
8156; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8157; AVX2-NEXT:    vmovaps %ymm0, 160(%rax)
8158; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8159; AVX2-NEXT:    vmovaps %ymm0, 96(%rax)
8160; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8161; AVX2-NEXT:    vmovaps %ymm0, 64(%rax)
8162; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8163; AVX2-NEXT:    vmovaps %ymm0, 1472(%rax)
8164; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8165; AVX2-NEXT:    vmovaps %ymm0, 1344(%rax)
8166; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8167; AVX2-NEXT:    vmovaps %ymm0, 1280(%rax)
8168; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8169; AVX2-NEXT:    vmovaps %ymm0, 1152(%rax)
8170; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8171; AVX2-NEXT:    vmovaps %ymm0, 1088(%rax)
8172; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8173; AVX2-NEXT:    vmovaps %ymm0, 960(%rax)
8174; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8175; AVX2-NEXT:    vmovaps %ymm0, 896(%rax)
8176; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8177; AVX2-NEXT:    vmovaps %ymm0, 768(%rax)
8178; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8179; AVX2-NEXT:    vmovaps %ymm0, 704(%rax)
8180; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8181; AVX2-NEXT:    vmovaps %ymm0, 576(%rax)
8182; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8183; AVX2-NEXT:    vmovaps %ymm0, 512(%rax)
8184; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8185; AVX2-NEXT:    vmovaps %ymm0, 384(%rax)
8186; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8187; AVX2-NEXT:    vmovaps %ymm0, 320(%rax)
8188; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8189; AVX2-NEXT:    vmovaps %ymm0, 192(%rax)
8190; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8191; AVX2-NEXT:    vmovaps %ymm0, 128(%rax)
8192; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8193; AVX2-NEXT:    vmovaps %ymm0, (%rax)
8194; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8195; AVX2-NEXT:    vmovaps %ymm0, 1376(%rax)
8196; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8197; AVX2-NEXT:    vmovaps %ymm0, 1184(%rax)
8198; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8199; AVX2-NEXT:    vmovaps %ymm0, 992(%rax)
8200; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8201; AVX2-NEXT:    vmovaps %ymm0, 800(%rax)
8202; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8203; AVX2-NEXT:    vmovaps %ymm0, 608(%rax)
8204; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8205; AVX2-NEXT:    vmovaps %ymm0, 416(%rax)
8206; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8207; AVX2-NEXT:    vmovaps %ymm0, 224(%rax)
8208; AVX2-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8209; AVX2-NEXT:    vmovaps %ymm0, 32(%rax)
8210; AVX2-NEXT:    addq $2504, %rsp # imm = 0x9C8
8211; AVX2-NEXT:    vzeroupper
8212; AVX2-NEXT:    retq
;
; AVX2-FP-LABEL: store_i32_stride6_vf64:
; AVX2-FP:       # %bb.0:
; AVX2-FP-NEXT:    subq $2504, %rsp # imm = 0x9C8
; AVX2-FP-NEXT:    vmovdqa (%rsi), %xmm0
; AVX2-FP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT:    vmovdqa 32(%rsi), %xmm1
; AVX2-FP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT:    vmovdqa (%rdi), %xmm3
; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %xmm10
; AVX2-FP-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm4
; AVX2-FP-NEXT:    vmovdqa (%rcx), %xmm0
; AVX2-FP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT:    vmovdqa 32(%rcx), %xmm8
; AVX2-FP-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT:    vmovdqa 64(%rcx), %xmm7
; AVX2-FP-NEXT:    vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[1,2,2,3]
; AVX2-FP-NEXT:    vmovdqa (%rdx), %xmm0
; AVX2-FP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT:    vmovdqa 32(%rdx), %xmm9
; AVX2-FP-NEXT:    vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[1,2,2,3]
; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5],ymm5[6,7]
; AVX2-FP-NEXT:    vmovdqa (%r8), %xmm2
; AVX2-FP-NEXT:    vmovdqa 32(%r8), %xmm6
; AVX2-FP-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero
; AVX2-FP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7]
; AVX2-FP-NEXT:    vpbroadcastd 4(%r9), %ymm5
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7]
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm10[2],xmm1[2],xmm10[3],xmm1[3]
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm8[1,2,2,3]
; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm9[1,2,2,3]
; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm5
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7]
; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm6[0],zero,xmm6[1],zero
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7]
; AVX2-FP-NEXT:    vpbroadcastd 36(%r9), %ymm5
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7]
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovdqa 64(%rdx), %xmm0
; AVX2-FP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm7[1,2,2,3]
; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[1,2,2,3]
; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm4[0,1,2,1]
; AVX2-FP-NEXT:    vmovdqa 64(%rsi), %xmm4
; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %xmm5
; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm7
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7]
; AVX2-FP-NEXT:    vmovdqa 64(%r8), %xmm0
; AVX2-FP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7]
; AVX2-FP-NEXT:    vpbroadcastd 68(%r9), %ymm7
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm7[3],ymm6[4,5,6,7]
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovdqa 96(%rcx), %xmm0
; AVX2-FP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[1,2,2,3]
; AVX2-FP-NEXT:    vmovdqa 96(%rdx), %xmm0
; AVX2-FP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm0[1,2,2,3]
; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm8 = ymm6[0,1,2,1]
; AVX2-FP-NEXT:    vmovdqa 96(%rsi), %xmm6
; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %xmm7
; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm9
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7]
; AVX2-FP-NEXT:    vmovdqa 96(%r8), %xmm0
; AVX2-FP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm9 = xmm0[0],zero,xmm0[1],zero
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7]
; AVX2-FP-NEXT:    vpbroadcastd 100(%r9), %ymm9
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm9[3],ymm8[4,5,6,7]
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovdqa 128(%rcx), %xmm0
; AVX2-FP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm8 = xmm0[1,2,2,3]
; AVX2-FP-NEXT:    vmovdqa 128(%rdx), %xmm0
; AVX2-FP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm9 = xmm0[1,2,2,3]
; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm10 = ymm8[0,1,2,1]
; AVX2-FP-NEXT:    vmovdqa 128(%rsi), %xmm8
; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %xmm9
; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm9[2],xmm8[2],xmm9[3],xmm8[3]
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm11
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7]
; AVX2-FP-NEXT:    vmovdqa 128(%r8), %xmm0
; AVX2-FP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm11 = xmm0[0],zero,xmm0[1],zero
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7]
; AVX2-FP-NEXT:    vpbroadcastd 132(%r9), %ymm11
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5,6,7]
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovdqa 160(%rcx), %xmm0
; AVX2-FP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm10 = xmm0[1,2,2,3]
; AVX2-FP-NEXT:    vmovdqa 160(%rdx), %xmm0
; AVX2-FP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm11 = xmm0[1,2,2,3]
; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm12 = ymm10[0,1,2,1]
; AVX2-FP-NEXT:    vmovdqa 160(%rsi), %xmm10
; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %xmm11
; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm11[2],xmm10[2],xmm11[3],xmm10[3]
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm13
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5],ymm12[6,7]
; AVX2-FP-NEXT:    vmovdqa 160(%r8), %xmm0
; AVX2-FP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm13 = xmm0[0],zero,xmm0[1],zero
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7]
; AVX2-FP-NEXT:    vpbroadcastd 164(%r9), %ymm13
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm13[3],ymm12[4,5,6,7]
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovdqa 192(%rcx), %xmm0
; AVX2-FP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm12 = xmm0[1,2,2,3]
; AVX2-FP-NEXT:    vmovdqa 192(%rdx), %xmm0
; AVX2-FP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm13 = xmm0[1,2,2,3]
; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm14 = ymm12[0,1,2,1]
; AVX2-FP-NEXT:    vmovdqa 192(%rsi), %xmm12
; AVX2-FP-NEXT:    vmovdqa 192(%rdi), %xmm13
; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm13[2],xmm12[2],xmm13[3],xmm12[3]
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm15
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7]
; AVX2-FP-NEXT:    vmovdqa 192(%r8), %xmm0
; AVX2-FP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm15 = xmm0[0],zero,xmm0[1],zero
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7]
; AVX2-FP-NEXT:    vpbroadcastd 196(%r9), %ymm15
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7]
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovdqa 224(%rcx), %xmm0
; AVX2-FP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm14 = xmm0[1,2,2,3]
; AVX2-FP-NEXT:    vmovdqa 224(%rdx), %xmm0
; AVX2-FP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm15 = xmm0[1,2,2,3]
; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm14[0,1,2,1]
; AVX2-FP-NEXT:    vmovdqa 224(%rsi), %xmm14
; AVX2-FP-NEXT:    vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT:    vmovdqa 224(%rdi), %xmm0
; AVX2-FP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} xmm14 = xmm0[2],xmm14[2],xmm0[3],xmm14[3]
; AVX2-FP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm14[4,5],ymm1[6,7]
; AVX2-FP-NEXT:    vmovdqa 224(%r8), %xmm14
; AVX2-FP-NEXT:    vmovdqa %xmm14, (%rsp) # 16-byte Spill
; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT:    vpbroadcastd 228(%r9), %ymm14
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3],ymm0[4,5,6,7]
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpbroadcastd (%rcx), %xmm0
; AVX2-FP-NEXT:    vpbroadcastd (%rdx), %xmm14
; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1]
; AVX2-FP-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm1 # 16-byte Folded Reload
; AVX2-FP-NEXT:    # xmm1 = xmm3[0],mem[0],xmm3[1],mem[1]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT:    vpbroadcastq %xmm2, %ymm1
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-FP-NEXT:    vmovdqa (%r9), %xmm1
; AVX2-FP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT:    vpbroadcastd %xmm1, %ymm1
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovdqa (%rdx), %ymm0
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovdqa (%rcx), %ymm1
; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,2,4,5,6,6]
; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm0[1,1,2,3,5,5,6,7]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm14[0],ymm2[1],ymm14[2],ymm2[3],ymm14[4],ymm2[5],ymm14[6],ymm2[7]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3]
; AVX2-FP-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovdqa (%rsi), %ymm1
; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7]
; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm14 = mem[0],zero,mem[1],zero
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm14[2,3],ymm2[4,5,6,7]
; AVX2-FP-NEXT:    vpbroadcastd 20(%r9), %ymm14
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm14[3],ymm2[4,5,6,7]
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vbroadcastss 32(%rcx), %xmm2
; AVX2-FP-NEXT:    vbroadcastss 32(%rdx), %xmm14
; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm14[0],xmm2[0],xmm14[1],xmm2[1]
; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FP-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
; AVX2-FP-NEXT:    # xmm3 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7]
; AVX2-FP-NEXT:    vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7]
; AVX2-FP-NEXT:    vmovaps 32(%r9), %xmm0
; AVX2-FP-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT:    vbroadcastss %xmm0, %ymm3
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovdqa 32(%rdx), %ymm0
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovdqa 32(%rcx), %ymm3
; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm3[0,1,2,2,4,5,6,6]
; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm0[1,1,2,3,5,5,6,7]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
; AVX2-FP-NEXT:    vmovdqa 32(%rdi), %ymm0
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovdqa 32(%rsi), %ymm1
; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7]
; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7]
; AVX2-FP-NEXT:    vpbroadcastd 52(%r9), %ymm15
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7]
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpbroadcastd 64(%rcx), %xmm14
; AVX2-FP-NEXT:    vpbroadcastd 64(%rdx), %xmm15
; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm14[2,3],ymm4[4,5,6,7]
; AVX2-FP-NEXT:    vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7]
; AVX2-FP-NEXT:    vmovdqa 64(%r9), %xmm0
; AVX2-FP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT:    vpbroadcastd %xmm0, %ymm5
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7]
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovdqa 64(%rdx), %ymm5
; AVX2-FP-NEXT:    vmovdqa 64(%rcx), %ymm4
; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm4[0,1,2,2,4,5,6,6]
; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm5[1,1,2,3,5,5,6,7]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
; AVX2-FP-NEXT:    vmovdqa 64(%rdi), %ymm0
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovdqa 64(%rsi), %ymm1
; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7]
; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7]
; AVX2-FP-NEXT:    vpbroadcastd 84(%r9), %ymm15
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7]
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
; AVX2-FP-NEXT:    vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm14[2,3],ymm6[4,5,6,7]
; AVX2-FP-NEXT:    vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7]
; AVX2-FP-NEXT:    vmovdqa 96(%r9), %xmm0
; AVX2-FP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT:    vpbroadcastd %xmm0, %ymm7
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7]
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovdqa 96(%rdx), %ymm7
; AVX2-FP-NEXT:    vmovdqa 96(%rcx), %ymm6
; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm6[0,1,2,2,4,5,6,6]
; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm7[1,1,2,3,5,5,6,7]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
; AVX2-FP-NEXT:    vmovdqa 96(%rdi), %ymm0
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovdqa 96(%rsi), %ymm1
; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7]
; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7]
; AVX2-FP-NEXT:    vpbroadcastd 116(%r9), %ymm15
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7]
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpbroadcastd 128(%rcx), %xmm14
; AVX2-FP-NEXT:    vpbroadcastd 128(%rdx), %xmm15
; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm14[2,3],ymm8[4,5,6,7]
; AVX2-FP-NEXT:    vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 16-byte Folded Reload
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7]
; AVX2-FP-NEXT:    vmovdqa 128(%r9), %xmm0
; AVX2-FP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT:    vpbroadcastd %xmm0, %ymm9
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7]
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovdqa 128(%rdx), %ymm9
; AVX2-FP-NEXT:    vmovdqa 128(%rcx), %ymm8
; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm8[0,1,2,2,4,5,6,6]
; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm9[1,1,2,3,5,5,6,7]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
; AVX2-FP-NEXT:    vmovdqa 128(%rdi), %ymm0
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovdqa 128(%rsi), %ymm1
; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7]
; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7]
; AVX2-FP-NEXT:    vpbroadcastd 148(%r9), %ymm15
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7]
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
; AVX2-FP-NEXT:    vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,1,2,1]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm14[2,3],ymm10[4,5,6,7]
; AVX2-FP-NEXT:    vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 16-byte Folded Reload
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7]
; AVX2-FP-NEXT:    vmovdqa 160(%r9), %xmm0
; AVX2-FP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT:    vpbroadcastd %xmm0, %ymm11
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm11[5],ymm10[6,7]
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovdqa 160(%rdx), %ymm11
; AVX2-FP-NEXT:    vmovdqa 160(%rcx), %ymm10
; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm10[0,1,2,2,4,5,6,6]
; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm11[1,1,2,3,5,5,6,7]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
; AVX2-FP-NEXT:    vmovdqa 160(%rdi), %ymm0
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovdqa 160(%rsi), %ymm1
; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7]
; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7]
; AVX2-FP-NEXT:    vpbroadcastd 180(%r9), %ymm15
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7]
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
; AVX2-FP-NEXT:    vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm12 = ymm12[0,1,2,1]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm14[2,3],ymm12[4,5,6,7]
; AVX2-FP-NEXT:    vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 16-byte Folded Reload
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5],ymm12[6,7]
; AVX2-FP-NEXT:    vmovdqa 192(%r9), %xmm0
; AVX2-FP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT:    vpbroadcastd %xmm0, %ymm13
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm13[5],ymm12[6,7]
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovdqa 192(%rdx), %ymm13
; AVX2-FP-NEXT:    vmovdqa 192(%rcx), %ymm12
; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm14 = ymm12[0,1,2,2,4,5,6,6]
; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm13[1,1,2,3,5,5,6,7]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4],ymm14[5],ymm15[6],ymm14[7]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
; AVX2-FP-NEXT:    vmovdqa 192(%rdi), %ymm0
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovdqa 192(%rsi), %ymm1
; AVX2-FP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7]
; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7]
; AVX2-FP-NEXT:    vpbroadcastd 212(%r9), %ymm15
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7]
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vbroadcastss 224(%rcx), %xmm14
; AVX2-FP-NEXT:    vbroadcastss 224(%rdx), %xmm15
; AVX2-FP-NEXT:    vunpcklps {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FP-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload
; AVX2-FP-NEXT:    # xmm15 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7]
; AVX2-FP-NEXT:    vbroadcastsd (%rsp), %ymm15 # 16-byte Folded Reload
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7]
; AVX2-FP-NEXT:    vmovaps 224(%r9), %xmm0
; AVX2-FP-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT:    vbroadcastss %xmm0, %ymm15
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovdqa 224(%rdx), %ymm14
; AVX2-FP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovdqa 224(%rcx), %ymm0
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,2,4,5,6,6]
; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm14[1,1,2,3,5,5,6,7]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm15[0],ymm0[1],ymm15[2],ymm0[3],ymm15[4],ymm0[5],ymm15[6],ymm0[7]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,1,2,3]
; AVX2-FP-NEXT:    vmovdqa 224(%rdi), %ymm14
; AVX2-FP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovdqa 224(%rsi), %ymm0
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm14 = ymm14[2],ymm0[2],ymm14[3],ymm0[3],ymm14[6],ymm0[6],ymm14[7],ymm0[7]
; AVX2-FP-NEXT:    vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm14[4,5],ymm1[6,7]
; AVX2-FP-NEXT:    vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT:    vpbroadcastd 244(%r9), %ymm15
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3],ymm0[4,5,6,7]
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FP-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX2-FP-NEXT:    vinsertf128 $1, %xmm0, %ymm14, %ymm0
; AVX2-FP-NEXT:    vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
; AVX2-FP-NEXT:    # xmm15 = mem[2,2,3,3]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3,4,5],ymm15[6,7]
; AVX2-FP-NEXT:    vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
; AVX2-FP-NEXT:    # xmm15 = mem[2,2,3,3]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4,5,6],ymm15[7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm15 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,1,2,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT:    vmovaps (%r8), %ymm15
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7]
; AVX2-FP-NEXT:    vbroadcastss 16(%r9), %ymm14
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-FP-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm0 = mem[2,3],ymm0[2,3]
; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm1 = ymm15[2,1,3,3,6,5,7,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT:    vpermilps {{.*#+}} ymm1 = mem[0,2,2,3,4,6,6,7]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FP-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX2-FP-NEXT:    vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-FP-NEXT:    # xmm1 = mem[2,2,3,3]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT:    vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-FP-NEXT:    # xmm1 = mem[2,2,3,3]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT:    vmovdqa 32(%r8), %ymm1
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-FP-NEXT:    vpbroadcastd 48(%r9), %ymm14
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7]
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7]
; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-FP-NEXT:    vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm0 = mem[2,3],ymm0[2,3]
; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm1 = mem[0,2,2,3,4,6,6,7]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7]
; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FP-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX2-FP-NEXT:    vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-FP-NEXT:    # xmm1 = mem[2,2,3,3]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX2-FP-NEXT:    vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-FP-NEXT:    # xmm1 = mem[2,2,3,3]
; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7]
; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm1 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5]
; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
8747; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
8748; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
8749; AVX2-FP-NEXT:    vmovdqa 64(%r8), %ymm1
8750; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
8751; AVX2-FP-NEXT:    vpbroadcastd 80(%r9), %ymm2
8752; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
8753; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8754; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7]
8755; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
8756; AVX2-FP-NEXT:    vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
8757; AVX2-FP-NEXT:    # ymm0 = mem[2,3],ymm0[2,3]
8758; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
8759; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
8760; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
8761; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm1 = mem[0,2,2,3,4,6,6,7]
8762; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
8763; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7]
8764; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8765; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8766; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
8767; AVX2-FP-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
8768; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
8769; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8770; AVX2-FP-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
8771; AVX2-FP-NEXT:    vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8772; AVX2-FP-NEXT:    # xmm1 = mem[2,2,3,3]
8773; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
8774; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
8775; AVX2-FP-NEXT:    vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8776; AVX2-FP-NEXT:    # xmm1 = mem[2,2,3,3]
8777; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
8778; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7]
8779; AVX2-FP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8780; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8781; AVX2-FP-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
8782; AVX2-FP-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
8783; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm1 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5]
8784; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
8785; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
8786; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
8787; AVX2-FP-NEXT:    vmovdqa 96(%r8), %ymm1
8788; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
8789; AVX2-FP-NEXT:    vpbroadcastd 112(%r9), %ymm2
8790; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
8791; AVX2-FP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8792; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7]
8793; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
8794; AVX2-FP-NEXT:    vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
8795; AVX2-FP-NEXT:    # ymm0 = mem[2,3],ymm0[2,3]
8796; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
8797; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
8798; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
8799; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm1 = mem[0,2,2,3,4,6,6,7]
8800; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
8801; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7]
8802; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8803; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
8804; AVX2-FP-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
8805; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
8806; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8807; AVX2-FP-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
8808; AVX2-FP-NEXT:    vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8809; AVX2-FP-NEXT:    # xmm1 = mem[2,2,3,3]
8810; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
8811; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
8812; AVX2-FP-NEXT:    vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8813; AVX2-FP-NEXT:    # xmm1 = mem[2,2,3,3]
8814; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
8815; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm7 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7]
8816; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8817; AVX2-FP-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
8818; AVX2-FP-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
8819; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm1 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5]
8820; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
8821; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
8822; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
8823; AVX2-FP-NEXT:    vmovdqa 128(%r8), %ymm1
8824; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
8825; AVX2-FP-NEXT:    vpbroadcastd 144(%r9), %ymm2
8826; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
8827; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7]
8828; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
8829; AVX2-FP-NEXT:    vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
8830; AVX2-FP-NEXT:    # ymm0 = mem[2,3],ymm0[2,3]
8831; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
8832; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
8833; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
8834; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm1 = mem[0,2,2,3,4,6,6,7]
8835; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
8836; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm8 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7]
8837; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8838; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
8839; AVX2-FP-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
8840; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
8841; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8842; AVX2-FP-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
8843; AVX2-FP-NEXT:    vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8844; AVX2-FP-NEXT:    # xmm1 = mem[2,2,3,3]
8845; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
8846; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
8847; AVX2-FP-NEXT:    vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8848; AVX2-FP-NEXT:    # xmm1 = mem[2,2,3,3]
8849; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
8850; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm9 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7]
8851; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8852; AVX2-FP-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
8853; AVX2-FP-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
8854; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm1 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[4],ymm10[4],ymm11[5],ymm10[5]
8855; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
8856; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
8857; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
8858; AVX2-FP-NEXT:    vmovdqa 160(%r8), %ymm14
8859; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5],ymm0[6,7]
8860; AVX2-FP-NEXT:    vpbroadcastd 176(%r9), %ymm1
8861; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
8862; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[6],ymm10[6],ymm11[7],ymm10[7]
8863; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
8864; AVX2-FP-NEXT:    vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
8865; AVX2-FP-NEXT:    # ymm0 = mem[2,3],ymm0[2,3]
8866; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm10 = ymm14[2,1,3,3,6,5,7,7]
8867; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3]
8868; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,3,4,5],ymm10[6,7]
8869; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm10 = mem[0,2,2,3,4,6,6,7]
8870; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3]
8871; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm0[0],ymm10[1],ymm0[2,3,4,5,6],ymm10[7]
8872; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8873; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
8874; AVX2-FP-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
8875; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
8876; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
8877; AVX2-FP-NEXT:    vinsertf128 $1, %xmm0, %ymm4, %ymm0
8878; AVX2-FP-NEXT:    vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
8879; AVX2-FP-NEXT:    # xmm11 = mem[2,2,3,3]
8880; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1]
8881; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm0[2,3,4,5],ymm11[6,7]
8882; AVX2-FP-NEXT:    vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
8883; AVX2-FP-NEXT:    # xmm11 = mem[2,2,3,3]
8884; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1]
8885; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm11 = ymm0[0],ymm11[1],ymm0[2,3,4,5,6],ymm11[7]
8886; AVX2-FP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8887; AVX2-FP-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
8888; AVX2-FP-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
8889; AVX2-FP-NEXT:    vpunpckldq {{.*#+}} ymm14 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[4],ymm12[4],ymm13[5],ymm12[5]
8890; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2]
8891; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
8892; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7]
8893; AVX2-FP-NEXT:    vmovdqa 192(%r8), %ymm14
8894; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5],ymm0[6,7]
8895; AVX2-FP-NEXT:    vpbroadcastd 208(%r9), %ymm15
8896; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7]
8897; AVX2-FP-NEXT:    vpunpckhdq {{.*#+}} ymm12 = ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[6],ymm12[6],ymm13[7],ymm12[7]
8898; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm12 = ymm12[2,3,2,3,6,7,6,7]
8899; AVX2-FP-NEXT:    vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
8900; AVX2-FP-NEXT:    # ymm12 = mem[2,3],ymm12[2,3]
8901; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm13 = ymm14[2,1,3,3,6,5,7,7]
8902; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3]
8903; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3,4,5],ymm13[6,7]
8904; AVX2-FP-NEXT:    vpshufd {{.*#+}} ymm13 = mem[0,2,2,3,4,6,6,7]
8905; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3]
8906; AVX2-FP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3,4,5,6],ymm13[7]
8907; AVX2-FP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8908; AVX2-FP-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm13 # 16-byte Folded Reload
8909; AVX2-FP-NEXT:    # xmm13 = xmm4[2],mem[2],xmm4[3],mem[3]
8910; AVX2-FP-NEXT:    vshufps {{.*#+}} xmm13 = xmm13[2,3,2,3]
8911; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
8912; AVX2-FP-NEXT:    vinsertf128 $1, %xmm13, %ymm4, %ymm13
8913; AVX2-FP-NEXT:    vpermilps $250, (%rsp), %xmm14 # 16-byte Folded Reload
8914; AVX2-FP-NEXT:    # xmm14 = mem[2,2,3,3]
8915; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1]
8916; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5],ymm14[6,7]
8917; AVX2-FP-NEXT:    vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
8918; AVX2-FP-NEXT:    # xmm14 = mem[2,2,3,3]
8919; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1]
8920; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4,5,6],ymm14[7]
8921; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8922; AVX2-FP-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
8923; AVX2-FP-NEXT:    # ymm14 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
8924; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
8925; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8926; AVX2-FP-NEXT:    vunpcklps {{.*#+}} ymm15 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
8927; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2]
8928; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3]
8929; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7]
8930; AVX2-FP-NEXT:    vmovaps 224(%r8), %ymm15
8931; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7]
8932; AVX2-FP-NEXT:    vbroadcastss 240(%r9), %ymm4
8933; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm4[5],ymm14[6,7]
8934; AVX2-FP-NEXT:    vunpckhps {{.*#+}} ymm14 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
8935; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm14 = ymm14[2,3,2,3,6,7,6,7]
8936; AVX2-FP-NEXT:    vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
8937; AVX2-FP-NEXT:    # ymm14 = mem[2,3],ymm14[2,3]
8938; AVX2-FP-NEXT:    vshufps {{.*#+}} ymm15 = ymm15[2,1,3,3,6,5,7,7]
8939; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm15 = ymm15[2,1,2,3]
8940; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5],ymm15[6,7]
8941; AVX2-FP-NEXT:    vpermilps {{.*#+}} ymm15 = mem[0,2,2,3,4,6,6,7]
8942; AVX2-FP-NEXT:    vpermpd {{.*#+}} ymm15 = ymm15[2,1,2,3]
8943; AVX2-FP-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6],ymm15[7]
8944; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
8945; AVX2-FP-NEXT:    vmovaps %ymm14, 1504(%rax)
8946; AVX2-FP-NEXT:    vmovaps %ymm4, 1440(%rax)
8947; AVX2-FP-NEXT:    vmovaps %ymm13, 1408(%rax)
8948; AVX2-FP-NEXT:    vmovdqa %ymm12, 1312(%rax)
8949; AVX2-FP-NEXT:    vmovdqa %ymm2, 1248(%rax)
8950; AVX2-FP-NEXT:    vmovaps %ymm11, 1216(%rax)
8951; AVX2-FP-NEXT:    vmovdqa %ymm10, 1120(%rax)
8952; AVX2-FP-NEXT:    vmovdqa %ymm3, 1056(%rax)
8953; AVX2-FP-NEXT:    vmovaps %ymm9, 1024(%rax)
8954; AVX2-FP-NEXT:    vmovdqa %ymm8, 928(%rax)
8955; AVX2-FP-NEXT:    vmovdqa %ymm5, 864(%rax)
8956; AVX2-FP-NEXT:    vmovaps %ymm7, 832(%rax)
8957; AVX2-FP-NEXT:    vmovdqa %ymm6, 736(%rax)
8958; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8959; AVX2-FP-NEXT:    vmovaps %ymm0, 672(%rax)
8960; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8961; AVX2-FP-NEXT:    vmovaps %ymm0, 640(%rax)
8962; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8963; AVX2-FP-NEXT:    vmovaps %ymm0, 544(%rax)
8964; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8965; AVX2-FP-NEXT:    vmovaps %ymm0, 480(%rax)
8966; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8967; AVX2-FP-NEXT:    vmovaps %ymm0, 448(%rax)
8968; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8969; AVX2-FP-NEXT:    vmovaps %ymm0, 352(%rax)
8970; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8971; AVX2-FP-NEXT:    vmovaps %ymm0, 288(%rax)
8972; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8973; AVX2-FP-NEXT:    vmovaps %ymm0, 256(%rax)
8974; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8975; AVX2-FP-NEXT:    vmovaps %ymm0, 160(%rax)
8976; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8977; AVX2-FP-NEXT:    vmovaps %ymm0, 96(%rax)
8978; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8979; AVX2-FP-NEXT:    vmovaps %ymm0, 64(%rax)
8980; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8981; AVX2-FP-NEXT:    vmovaps %ymm0, 1472(%rax)
8982; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8983; AVX2-FP-NEXT:    vmovaps %ymm0, 1344(%rax)
8984; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8985; AVX2-FP-NEXT:    vmovaps %ymm0, 1280(%rax)
8986; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8987; AVX2-FP-NEXT:    vmovaps %ymm0, 1152(%rax)
8988; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8989; AVX2-FP-NEXT:    vmovaps %ymm0, 1088(%rax)
8990; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8991; AVX2-FP-NEXT:    vmovaps %ymm0, 960(%rax)
8992; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8993; AVX2-FP-NEXT:    vmovaps %ymm0, 896(%rax)
8994; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8995; AVX2-FP-NEXT:    vmovaps %ymm0, 768(%rax)
8996; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8997; AVX2-FP-NEXT:    vmovaps %ymm0, 704(%rax)
8998; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
8999; AVX2-FP-NEXT:    vmovaps %ymm0, 576(%rax)
9000; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9001; AVX2-FP-NEXT:    vmovaps %ymm0, 512(%rax)
9002; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9003; AVX2-FP-NEXT:    vmovaps %ymm0, 384(%rax)
9004; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9005; AVX2-FP-NEXT:    vmovaps %ymm0, 320(%rax)
9006; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9007; AVX2-FP-NEXT:    vmovaps %ymm0, 192(%rax)
9008; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9009; AVX2-FP-NEXT:    vmovaps %ymm0, 128(%rax)
9010; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9011; AVX2-FP-NEXT:    vmovaps %ymm0, (%rax)
9012; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9013; AVX2-FP-NEXT:    vmovaps %ymm0, 1376(%rax)
9014; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9015; AVX2-FP-NEXT:    vmovaps %ymm0, 1184(%rax)
9016; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9017; AVX2-FP-NEXT:    vmovaps %ymm0, 992(%rax)
9018; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9019; AVX2-FP-NEXT:    vmovaps %ymm0, 800(%rax)
9020; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9021; AVX2-FP-NEXT:    vmovaps %ymm0, 608(%rax)
9022; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9023; AVX2-FP-NEXT:    vmovaps %ymm0, 416(%rax)
9024; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9025; AVX2-FP-NEXT:    vmovaps %ymm0, 224(%rax)
9026; AVX2-FP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9027; AVX2-FP-NEXT:    vmovaps %ymm0, 32(%rax)
9028; AVX2-FP-NEXT:    addq $2504, %rsp # imm = 0x9C8
9029; AVX2-FP-NEXT:    vzeroupper
9030; AVX2-FP-NEXT:    retq
9031;
9032; AVX2-FCP-LABEL: store_i32_stride6_vf64:
9033; AVX2-FCP:       # %bb.0:
9034; AVX2-FCP-NEXT:    subq $2376, %rsp # imm = 0x948
9035; AVX2-FCP-NEXT:    vmovdqa (%rsi), %xmm9
9036; AVX2-FCP-NEXT:    vmovdqa 32(%rsi), %xmm1
9037; AVX2-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9038; AVX2-FCP-NEXT:    vmovdqa (%rdi), %xmm7
9039; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %xmm10
9040; AVX2-FCP-NEXT:    vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9041; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm7[2],xmm9[2],xmm7[3],xmm9[3]
9042; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9043; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
9044; AVX2-FCP-NEXT:    vmovdqa (%rcx), %xmm2
9045; AVX2-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9046; AVX2-FCP-NEXT:    vmovdqa 32(%rcx), %xmm6
9047; AVX2-FCP-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9048; AVX2-FCP-NEXT:    vmovdqa 64(%rcx), %xmm5
9049; AVX2-FCP-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9050; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[1,2,2,3]
9051; AVX2-FCP-NEXT:    vmovdqa (%rdx), %xmm2
9052; AVX2-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9053; AVX2-FCP-NEXT:    vmovdqa 32(%rdx), %xmm8
9054; AVX2-FCP-NEXT:    vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9055; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[1,2,2,3]
9056; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
9057; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
9058; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7]
9059; AVX2-FCP-NEXT:    vmovdqa (%r8), %xmm2
9060; AVX2-FCP-NEXT:    vmovdqa 32(%r8), %xmm11
9061; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero
9062; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7]
9063; AVX2-FCP-NEXT:    vpbroadcastd 4(%r9), %ymm4
9064; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7]
9065; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9066; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm10[2],xmm1[2],xmm10[3],xmm1[3]
9067; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9068; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm6[1,2,2,3]
9069; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm8[1,2,2,3]
9070; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
9071; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm4
9072; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
9073; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7]
9074; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm11[0],zero,xmm11[1],zero
9075; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7]
9076; AVX2-FCP-NEXT:    vpbroadcastd 36(%r9), %ymm4
9077; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7]
9078; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9079; AVX2-FCP-NEXT:    vmovdqa 64(%rdx), %xmm0
9080; AVX2-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9081; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm5[1,2,2,3]
9082; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,2,2,3]
9083; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
9084; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
9085; AVX2-FCP-NEXT:    vmovdqa 64(%rsi), %xmm15
9086; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %xmm13
9087; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm13[2],xmm15[2],xmm13[3],xmm15[3]
9088; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9089; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm4
9090; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7]
9091; AVX2-FCP-NEXT:    vmovdqa 64(%r8), %xmm4
9092; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
9093; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7]
9094; AVX2-FCP-NEXT:    vpbroadcastd 68(%r9), %ymm5
9095; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5,6,7]
9096; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9097; AVX2-FCP-NEXT:    vmovdqa 96(%rcx), %xmm0
9098; AVX2-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9099; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,2,2,3]
9100; AVX2-FCP-NEXT:    vmovdqa 96(%rdx), %xmm0
9101; AVX2-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9102; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[1,2,2,3]
9103; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
9104; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
9105; AVX2-FCP-NEXT:    vmovdqa 96(%rsi), %xmm10
9106; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %xmm12
9107; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm12[2],xmm10[2],xmm12[3],xmm10[3]
9108; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9109; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm5
9110; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7]
9111; AVX2-FCP-NEXT:    vmovdqa 96(%r8), %xmm8
9112; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm8[0],zero,xmm8[1],zero
9113; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7]
9114; AVX2-FCP-NEXT:    vpbroadcastd 100(%r9), %ymm5
9115; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5,6,7]
9116; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9117; AVX2-FCP-NEXT:    vmovdqa 128(%rcx), %xmm0
9118; AVX2-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9119; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,2,2,3]
9120; AVX2-FCP-NEXT:    vmovdqa 128(%rdx), %xmm0
9121; AVX2-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9122; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[1,2,2,3]
9123; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
9124; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
9125; AVX2-FCP-NEXT:    vmovdqa 128(%rsi), %xmm0
9126; AVX2-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9127; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %xmm1
9128; AVX2-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9129; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
9130; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9131; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm5
9132; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7]
9133; AVX2-FCP-NEXT:    vmovdqa 128(%r8), %xmm5
9134; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero
9135; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3],ymm3[4,5,6,7]
9136; AVX2-FCP-NEXT:    vpbroadcastd 132(%r9), %ymm6
9137; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3],ymm3[4,5,6,7]
9138; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9139; AVX2-FCP-NEXT:    vmovdqa 160(%rcx), %xmm0
9140; AVX2-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9141; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,2,2,3]
9142; AVX2-FCP-NEXT:    vmovdqa 160(%rdx), %xmm0
9143; AVX2-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9144; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[1,2,2,3]
9145; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
9146; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
9147; AVX2-FCP-NEXT:    vmovdqa 160(%rsi), %xmm0
9148; AVX2-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9149; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %xmm1
9150; AVX2-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9151; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
9152; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9153; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm6
9154; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7]
9155; AVX2-FCP-NEXT:    vmovdqa 160(%r8), %xmm6
9156; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm14 = xmm6[0],zero,xmm6[1],zero
9157; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm14[2,3],ymm3[4,5,6,7]
9158; AVX2-FCP-NEXT:    vpbroadcastd 164(%r9), %ymm14
9159; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5,6,7]
9160; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9161; AVX2-FCP-NEXT:    vmovdqa 192(%rcx), %xmm0
9162; AVX2-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9163; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,2,2,3]
9164; AVX2-FCP-NEXT:    vmovdqa 192(%rdx), %xmm0
9165; AVX2-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9166; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm14 = xmm0[1,2,2,3]
9167; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm14[0],xmm3[0],xmm14[1],xmm3[1]
9168; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
9169; AVX2-FCP-NEXT:    vmovdqa 192(%rsi), %xmm0
9170; AVX2-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9171; AVX2-FCP-NEXT:    vmovdqa 192(%rdi), %xmm1
9172; AVX2-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9173; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
9174; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9175; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm14
9176; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5],ymm3[6,7]
9177; AVX2-FCP-NEXT:    vmovdqa 192(%r8), %xmm0
9178; AVX2-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9179; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm14 = xmm0[0],zero,xmm0[1],zero
9180; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm14[2,3],ymm3[4,5,6,7]
9181; AVX2-FCP-NEXT:    vpbroadcastd 196(%r9), %ymm14
9182; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5,6,7]
9183; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9184; AVX2-FCP-NEXT:    vmovdqa 224(%rcx), %xmm3
9185; AVX2-FCP-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9186; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3]
9187; AVX2-FCP-NEXT:    vmovdqa 224(%rdx), %xmm0
9188; AVX2-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9189; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm14 = xmm0[1,2,2,3]
9190; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm14[0],xmm3[0],xmm14[1],xmm3[1]
9191; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
9192; AVX2-FCP-NEXT:    vmovdqa 224(%rsi), %xmm0
9193; AVX2-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9194; AVX2-FCP-NEXT:    vmovdqa 224(%rdi), %xmm1
9195; AVX2-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9196; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
9197; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9198; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm14
9199; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5],ymm3[6,7]
9200; AVX2-FCP-NEXT:    vmovdqa 224(%r8), %xmm0
9201; AVX2-FCP-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
9202; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm14 = xmm0[0],zero,xmm0[1],zero
9203; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm14[2,3],ymm3[4,5,6,7]
9204; AVX2-FCP-NEXT:    vpbroadcastd 228(%r9), %ymm14
9205; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5,6,7]
9206; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9207; AVX2-FCP-NEXT:    vpbroadcastd (%rcx), %xmm3
9208; AVX2-FCP-NEXT:    vpbroadcastd (%rdx), %xmm14
9209; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm14[0],xmm3[0],xmm14[1],xmm3[1]
9210; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
9211; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1]
9212; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7]
9213; AVX2-FCP-NEXT:    vpbroadcastq %xmm2, %ymm0
9214; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
9215; AVX2-FCP-NEXT:    vpbroadcastd (%r9), %ymm1
9216; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
9217; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9218; AVX2-FCP-NEXT:    vmovdqa (%rdx), %ymm14
9219; AVX2-FCP-NEXT:    vmovdqa (%rcx), %ymm0
9220; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9221; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,2,4,5,6,6]
9222; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm14[1,1,2,3,5,5,6,7]
9223; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
9224; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
9225; AVX2-FCP-NEXT:    vmovdqa (%rdi), %ymm1
9226; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9227; AVX2-FCP-NEXT:    vmovdqa (%rsi), %ymm2
9228; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9229; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7]
9230; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9231; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
9232; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
9233; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
9234; AVX2-FCP-NEXT:    vpbroadcastd 20(%r9), %ymm1
9235; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
9236; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9237; AVX2-FCP-NEXT:    vpbroadcastd 32(%rcx), %xmm0
9238; AVX2-FCP-NEXT:    vpbroadcastd 32(%rdx), %xmm1
9239; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9240; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9241; AVX2-FCP-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
9242; AVX2-FCP-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
9243; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1]
9244; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
9245; AVX2-FCP-NEXT:    vpbroadcastq %xmm11, %ymm1
9246; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
9247; AVX2-FCP-NEXT:    vpbroadcastd 32(%r9), %ymm1
9248; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
9249; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9250; AVX2-FCP-NEXT:    vmovdqa 32(%rdx), %ymm1
9251; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9252; AVX2-FCP-NEXT:    vmovdqa 32(%rcx), %ymm0
9253; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9254; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,2,4,5,6,6]
9255; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,1,2,3,5,5,6,7]
9256; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
9257; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
9258; AVX2-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
9259; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9260; AVX2-FCP-NEXT:    vmovdqa 32(%rsi), %ymm2
9261; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9262; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7]
9263; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9264; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
9265; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
9266; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
9267; AVX2-FCP-NEXT:    vpbroadcastd 52(%r9), %ymm1
9268; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
9269; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9270; AVX2-FCP-NEXT:    vpbroadcastd 64(%rcx), %xmm0
9271; AVX2-FCP-NEXT:    vpbroadcastd 64(%rdx), %xmm1
9272; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9273; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm13[0],xmm15[0],xmm13[1],xmm15[1]
9274; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1]
9275; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
9276; AVX2-FCP-NEXT:    vpbroadcastq %xmm4, %ymm1
9277; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
9278; AVX2-FCP-NEXT:    vpbroadcastd 64(%r9), %ymm1
9279; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
9280; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9281; AVX2-FCP-NEXT:    vmovdqa 64(%rdx), %ymm1
9282; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9283; AVX2-FCP-NEXT:    vmovdqa 64(%rcx), %ymm0
9284; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9285; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,2,4,5,6,6]
9286; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,1,2,3,5,5,6,7]
9287; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
9288; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
9289; AVX2-FCP-NEXT:    vmovdqa 64(%rdi), %ymm1
9290; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9291; AVX2-FCP-NEXT:    vmovdqa 64(%rsi), %ymm2
9292; AVX2-FCP-NEXT:    vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9293; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7]
9294; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9295; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
9296; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
9297; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
9298; AVX2-FCP-NEXT:    vpbroadcastd 84(%r9), %ymm1
9299; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
9300; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9301; AVX2-FCP-NEXT:    vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9302; AVX2-FCP-NEXT:    vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9303; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9304; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
9305; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1]
9306; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
9307; AVX2-FCP-NEXT:    vpbroadcastq %xmm8, %ymm1
9308; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
9309; AVX2-FCP-NEXT:    vpbroadcastd 96(%r9), %ymm1
9310; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
9311; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9312; AVX2-FCP-NEXT:    vmovdqa 96(%rdx), %ymm1
9313; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9314; AVX2-FCP-NEXT:    vmovdqa 96(%rcx), %ymm2
9315; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm2[0,1,2,2,4,5,6,6]
9316; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm1[1,1,2,3,5,5,6,7]
9317; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7]
9318; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
9319; AVX2-FCP-NEXT:    vmovdqa 96(%rdi), %ymm1
9320; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9321; AVX2-FCP-NEXT:    vmovdqa 96(%rsi), %ymm3
9322; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9323; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7]
9324; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9325; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
9326; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero
9327; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7]
9328; AVX2-FCP-NEXT:    vpbroadcastd 116(%r9), %ymm4
9329; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7]
9330; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9331; AVX2-FCP-NEXT:    vpbroadcastd 128(%rcx), %xmm0
9332; AVX2-FCP-NEXT:    vpbroadcastd 128(%rdx), %xmm4
9333; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
9334; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9335; AVX2-FCP-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
9336; AVX2-FCP-NEXT:    # xmm4 = xmm1[0],mem[0],xmm1[1],mem[1]
9337; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1]
9338; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7]
9339; AVX2-FCP-NEXT:    vpbroadcastq %xmm5, %ymm4
9340; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7]
9341; AVX2-FCP-NEXT:    vpbroadcastd 128(%r9), %ymm4
9342; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7]
9343; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9344; AVX2-FCP-NEXT:    vmovdqa 128(%rdx), %ymm12
9345; AVX2-FCP-NEXT:    vmovdqa 128(%rcx), %ymm9
9346; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm9[0,1,2,2,4,5,6,6]
9347; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm12[1,1,2,3,5,5,6,7]
9348; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7]
9349; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
9350; AVX2-FCP-NEXT:    vmovdqa 128(%rdi), %ymm1
9351; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9352; AVX2-FCP-NEXT:    vmovdqa 128(%rsi), %ymm3
9353; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9354; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7]
9355; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9356; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
9357; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero
9358; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7]
9359; AVX2-FCP-NEXT:    vpbroadcastd 148(%r9), %ymm4
9360; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7]
9361; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9362; AVX2-FCP-NEXT:    vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9363; AVX2-FCP-NEXT:    vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
9364; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
9365; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9366; AVX2-FCP-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
9367; AVX2-FCP-NEXT:    # xmm4 = xmm1[0],mem[0],xmm1[1],mem[1]
9368; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1]
9369; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7]
9370; AVX2-FCP-NEXT:    vpbroadcastq %xmm6, %ymm4
9371; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7]
9372; AVX2-FCP-NEXT:    vpbroadcastd 160(%r9), %ymm4
9373; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7]
9374; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9375; AVX2-FCP-NEXT:    vmovdqa 160(%rdx), %ymm10
9376; AVX2-FCP-NEXT:    vmovdqa 160(%rcx), %ymm7
9377; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm7[0,1,2,2,4,5,6,6]
9378; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm10[1,1,2,3,5,5,6,7]
9379; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7]
9380; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
9381; AVX2-FCP-NEXT:    vmovdqa 160(%rdi), %ymm1
9382; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9383; AVX2-FCP-NEXT:    vmovdqa 160(%rsi), %ymm3
9384; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9385; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7]
9386; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9387; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
9388; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero
9389; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7]
9390; AVX2-FCP-NEXT:    vpbroadcastd 180(%r9), %ymm4
9391; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7]
9392; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9393; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
9394; AVX2-FCP-NEXT:    vpbroadcastd %xmm11, %xmm0
9395; AVX2-FCP-NEXT:    vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
9396; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
9397; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9398; AVX2-FCP-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
9399; AVX2-FCP-NEXT:    # xmm4 = xmm1[0],mem[0],xmm1[1],mem[1]
9400; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1]
9401; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7]
9402; AVX2-FCP-NEXT:    vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 16-byte Folded Reload
9403; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7]
9404; AVX2-FCP-NEXT:    vpbroadcastd 192(%r9), %ymm4
9405; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7]
9406; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9407; AVX2-FCP-NEXT:    vmovdqa 192(%rdx), %ymm1
9408; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9409; AVX2-FCP-NEXT:    vmovdqa 192(%rcx), %ymm0
9410; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9411; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,2,4,5,6,6]
9412; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm1[1,1,2,3,5,5,6,7]
9413; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7]
9414; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
9415; AVX2-FCP-NEXT:    vmovdqa 192(%rdi), %ymm1
9416; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9417; AVX2-FCP-NEXT:    vmovdqa 192(%rsi), %ymm3
9418; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9419; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7]
9420; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9421; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
9422; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero
9423; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7]
9424; AVX2-FCP-NEXT:    vpbroadcastd 212(%r9), %ymm4
9425; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7]
9426; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9427; AVX2-FCP-NEXT:    vbroadcastss 224(%rcx), %xmm0
9428; AVX2-FCP-NEXT:    vbroadcastss 224(%rdx), %xmm4
9429; AVX2-FCP-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
9430; AVX2-FCP-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9431; AVX2-FCP-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
9432; AVX2-FCP-NEXT:    # xmm4 = xmm1[0],mem[0],xmm1[1],mem[1]
9433; AVX2-FCP-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1]
9434; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7]
9435; AVX2-FCP-NEXT:    vbroadcastsd (%rsp), %ymm4 # 16-byte Folded Reload
9436; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7]
9437; AVX2-FCP-NEXT:    vbroadcastss 224(%r9), %ymm4
9438; AVX2-FCP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7]
9439; AVX2-FCP-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9440; AVX2-FCP-NEXT:    vmovdqa 224(%rdx), %ymm1
9441; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9442; AVX2-FCP-NEXT:    vmovdqa 224(%rcx), %ymm8
9443; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm8[0,1,2,2,4,5,6,6]
9444; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm4 = ymm1[1,1,2,3,5,5,6,7]
9445; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7]
9446; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
9447; AVX2-FCP-NEXT:    vmovdqa 224(%rdi), %ymm1
; AVX2-FCP-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovdqa 224(%rsi), %ymm3
; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7]
; AVX2-FCP-NEXT:    vmovdqu %ymm1, (%rsp) # 32-byte Spill
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-FCP-NEXT:    vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT:    vpbroadcastd 244(%r9), %ymm4
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT:    vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm4
; AVX2-FCP-NEXT:    vmovdqa (%r8), %ymm0
; AVX2-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [2,2,3,3,2,2,3,3]
; AVX2-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm1, %ymm15
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm15[0,1],ymm4[2,3,4,5],ymm15[6,7]
; AVX2-FCP-NEXT:    vmovdqa (%r9), %ymm15
; AVX2-FCP-NEXT:    vpermd %ymm15, %ymm1, %ymm3
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4,5,6],ymm3[7]
; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FCP-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm3 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[4],mem[4],ymm3[5],mem[5]
; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} ymm4 = ymm14[0],ymm5[0],ymm14[1],ymm5[1],ymm14[4],ymm5[4],ymm14[5],ymm5[5]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7]
; AVX2-FCP-NEXT:    vpbroadcastd 16(%r9), %ymm4
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
; AVX2-FCP-NEXT:    vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm3 = ymm14[2],ymm5[2],ymm14[3],ymm5[3],ymm14[6],ymm5[6],ymm14[7],ymm5[7]
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7]
; AVX2-FCP-NEXT:    vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm3 = mem[2,3],ymm3[2,3]
; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm6 = [6,5,3,3,6,5,7,7]
; AVX2-FCP-NEXT:    vpermd %ymm0, %ymm6, %ymm0
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5],ymm0[6,7]
; AVX2-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [4,6,2,3,4,6,6,7]
; AVX2-FCP-NEXT:    vpermd %ymm15, %ymm5, %ymm15
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4,5,6],ymm15[7]
; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT:    vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm3, %ymm0
; AVX2-FCP-NEXT:    vmovdqa 32(%r8), %ymm15
; AVX2-FCP-NEXT:    vpermd %ymm15, %ymm1, %ymm13
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5],ymm13[6,7]
; AVX2-FCP-NEXT:    vmovdqa 32(%r9), %ymm13
; AVX2-FCP-NEXT:    vpermd %ymm13, %ymm1, %ymm14
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4,5,6],ymm14[7]
; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} ymm14 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7]
; AVX2-FCP-NEXT:    vpbroadcastd 48(%r9), %ymm14
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7]
; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7]
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-FCP-NEXT:    vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm0 = mem[2,3],ymm0[2,3]
; AVX2-FCP-NEXT:    vpermd %ymm15, %ymm6, %ymm14
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3,4,5],ymm14[6,7]
; AVX2-FCP-NEXT:    vpermd %ymm13, %ymm5, %ymm13
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4,5,6],ymm13[7]
; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT:    vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm3, %ymm0
; AVX2-FCP-NEXT:    vmovdqa 64(%r8), %ymm13
; AVX2-FCP-NEXT:    vpermd %ymm13, %ymm1, %ymm14
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3,4,5],ymm14[6,7]
; AVX2-FCP-NEXT:    vmovdqa 64(%r9), %ymm14
; AVX2-FCP-NEXT:    vpermd %ymm14, %ymm1, %ymm15
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4,5,6],ymm15[7]
; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} ymm15 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5],ymm0[6,7]
; AVX2-FCP-NEXT:    vpbroadcastd 80(%r9), %ymm15
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7]
; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7]
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-FCP-NEXT:    vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm0 = mem[2,3],ymm0[2,3]
; AVX2-FCP-NEXT:    vpermd %ymm13, %ymm6, %ymm13
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5],ymm13[6,7]
; AVX2-FCP-NEXT:    vpermd %ymm14, %ymm5, %ymm13
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4,5,6],ymm13[7]
; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT:    vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm3, %ymm0
; AVX2-FCP-NEXT:    vmovdqa 96(%r8), %ymm13
; AVX2-FCP-NEXT:    vpermd %ymm13, %ymm1, %ymm14
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3,4,5],ymm14[6,7]
; AVX2-FCP-NEXT:    vmovdqa 96(%r9), %ymm14
; AVX2-FCP-NEXT:    vpermd %ymm14, %ymm1, %ymm15
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4,5,6],ymm15[7]
; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} ymm15 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5],ymm0[6,7]
; AVX2-FCP-NEXT:    vpbroadcastd 112(%r9), %ymm15
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7]
; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7]
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-FCP-NEXT:    vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm0 = mem[2,3],ymm0[2,3]
; AVX2-FCP-NEXT:    vpermd %ymm13, %ymm6, %ymm2
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7]
; AVX2-FCP-NEXT:    vpermd %ymm14, %ymm5, %ymm2
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6],ymm2[7]
; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT:    vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
; AVX2-FCP-NEXT:    vmovdqa 128(%r8), %ymm2
; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm1, %ymm13
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5],ymm13[6,7]
; AVX2-FCP-NEXT:    vmovdqa 128(%r9), %ymm3
; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm1, %ymm13
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4,5,6],ymm13[7]
; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} ymm14 = ymm12[0],ymm9[0],ymm12[1],ymm9[1],ymm12[4],ymm9[4],ymm12[5],ymm9[5]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7]
; AVX2-FCP-NEXT:    vpbroadcastd 144(%r9), %ymm14
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7]
; AVX2-FCP-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm12[2],ymm9[2],ymm12[3],ymm9[3],ymm12[6],ymm9[6],ymm12[7],ymm9[7]
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-FCP-NEXT:    vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm0 = mem[2,3],ymm0[2,3]
; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm6, %ymm2
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7]
; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm5, %ymm2
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6],ymm2[7]
; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT:    vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT:    # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
; AVX2-FCP-NEXT:    vmovdqa 160(%r8), %ymm3
; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm1, %ymm2
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7]
; AVX2-FCP-NEXT:    vmovdqa 160(%r9), %ymm2
; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm1, %ymm4
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm0[0],ymm4[1],ymm0[2,3,4,5,6],ymm4[7]
; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} ymm4 = ymm10[0],ymm7[0],ymm10[1],ymm7[1],ymm10[4],ymm7[4],ymm10[5],ymm7[5]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7]
; AVX2-FCP-NEXT:    vpbroadcastd 176(%r9), %ymm4
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7]
; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm10[2],ymm7[2],ymm10[3],ymm7[3],ymm10[6],ymm7[6],ymm10[7],ymm7[7]
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-FCP-NEXT:    vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm0 = mem[2,3],ymm0[2,3]
; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm6, %ymm3
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5],ymm3[6,7]
; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm5, %ymm2
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm7 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6],ymm2[7]
; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3]
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
; AVX2-FCP-NEXT:    vmovdqa 192(%r8), %ymm2
; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm1, %ymm3
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5],ymm3[6,7]
; AVX2-FCP-NEXT:    vmovdqa 192(%r9), %ymm3
; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm1, %ymm10
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm10 = ymm0[0],ymm10[1],ymm0[2,3,4,5,6],ymm10[7]
; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} ymm15 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[4],ymm11[4],ymm13[5],ymm11[5]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7]
; AVX2-FCP-NEXT:    vpbroadcastd 208(%r9), %ymm15
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7]
; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm15 = ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[6],ymm11[6],ymm13[7],ymm11[7]
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm15 = ymm15[2,3,2,3,6,7,6,7]
; AVX2-FCP-NEXT:    vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm15 = mem[2,3],ymm15[2,3]
; AVX2-FCP-NEXT:    vpermd %ymm2, %ymm6, %ymm2
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm15[2,3,4,5],ymm2[6,7]
; AVX2-FCP-NEXT:    vpermd %ymm3, %ymm5, %ymm3
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6],ymm3[7]
; AVX2-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-FCP-NEXT:    vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
; AVX2-FCP-NEXT:    # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX2-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm11, %ymm3
; AVX2-FCP-NEXT:    vmovdqa 224(%r8), %ymm15
; AVX2-FCP-NEXT:    vpermd %ymm15, %ymm1, %ymm13
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm13[0,1],ymm3[2,3,4,5],ymm13[6,7]
; AVX2-FCP-NEXT:    vmovdqa 224(%r9), %ymm13
; AVX2-FCP-NEXT:    vpermd %ymm13, %ymm1, %ymm1
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3,4,5,6],ymm1[7]
; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm3 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; AVX2-FCP-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vpunpckldq {{.*#+}} ymm11 = ymm0[0],ymm8[0],ymm0[1],ymm8[1],ymm0[4],ymm8[4],ymm0[5],ymm8[5]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2]
; AVX2-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm11[2,3],ymm3[4,5,6,7]
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm15[4,5],ymm3[6,7]
; AVX2-FCP-NEXT:    vpbroadcastd 240(%r9), %ymm11
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm11[5],ymm3[6,7]
; AVX2-FCP-NEXT:    vpunpckhdq {{.*#+}} ymm8 = ymm0[2],ymm8[2],ymm0[3],ymm8[3],ymm0[6],ymm8[6],ymm0[7],ymm8[7]
; AVX2-FCP-NEXT:    vpshufd {{.*#+}} ymm8 = ymm8[2,3,2,3,6,7,6,7]
; AVX2-FCP-NEXT:    vperm2i128 $19, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload
; AVX2-FCP-NEXT:    # ymm8 = mem[2,3],ymm8[2,3]
; AVX2-FCP-NEXT:    vpermd %ymm15, %ymm6, %ymm6
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3,4,5],ymm6[6,7]
; AVX2-FCP-NEXT:    vpermd %ymm13, %ymm5, %ymm5
; AVX2-FCP-NEXT:    vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4,5,6],ymm5[7]
; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT:    vmovdqa %ymm5, 1504(%rax)
; AVX2-FCP-NEXT:    vmovdqa %ymm3, 1440(%rax)
; AVX2-FCP-NEXT:    vmovdqa %ymm1, 1408(%rax)
; AVX2-FCP-NEXT:    vmovdqa %ymm2, 1312(%rax)
; AVX2-FCP-NEXT:    vmovdqa %ymm4, 1248(%rax)
; AVX2-FCP-NEXT:    vmovdqa %ymm10, 1216(%rax)
; AVX2-FCP-NEXT:    vmovdqa %ymm7, 1120(%rax)
; AVX2-FCP-NEXT:    vmovdqa %ymm9, 1056(%rax)
; AVX2-FCP-NEXT:    vmovdqa %ymm12, 1024(%rax)
; AVX2-FCP-NEXT:    vmovdqa %ymm14, 928(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 864(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 832(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 736(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 672(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 640(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 544(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 480(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 448(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 352(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 288(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 256(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 160(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 96(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 64(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 1472(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 1344(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 1280(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 1152(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 1088(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 960(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 896(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 768(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 704(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 576(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 512(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 384(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 320(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 192(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 128(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, (%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 1376(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 1184(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 992(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 800(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 608(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 416(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 224(%rax)
; AVX2-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT:    vmovaps %ymm0, 32(%rax)
; AVX2-FCP-NEXT:    addq $2376, %rsp # imm = 0x948
; AVX2-FCP-NEXT:    vzeroupper
; AVX2-FCP-NEXT:    retq
;
; AVX512-LABEL: store_i32_stride6_vf64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    subq $456, %rsp # imm = 0x1C8
; AVX512-NEXT:    vmovdqa64 128(%rdi), %zmm9
; AVX512-NEXT:    vmovdqa64 64(%rdi), %zmm30
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm11
; AVX512-NEXT:    vmovdqa64 (%rsi), %zmm10
; AVX512-NEXT:    vmovdqa64 64(%rsi), %zmm8
; AVX512-NEXT:    vmovdqa64 128(%rsi), %zmm4
; AVX512-NEXT:    vmovdqa64 192(%rdx), %zmm15
; AVX512-NEXT:    vmovdqa64 128(%rdx), %zmm28
; AVX512-NEXT:    vmovdqa64 64(%rdx), %zmm17
; AVX512-NEXT:    vmovdqa64 (%rdx), %zmm24
; AVX512-NEXT:    vmovdqa64 (%rcx), %zmm22
; AVX512-NEXT:    vmovdqa64 64(%rcx), %zmm18
; AVX512-NEXT:    vmovdqa64 128(%rcx), %zmm16
; AVX512-NEXT:    vmovdqa64 192(%rcx), %zmm13
; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm29 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
; AVX512-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT:    vmovdqa64 %zmm11, %zmm0
; AVX512-NEXT:    vpermt2d %zmm10, %zmm29, %zmm0
; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm3
; AVX512-NEXT:    vmovdqa (%rdx), %ymm14
; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
; AVX512-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT:    vmovdqa64 %zmm24, %zmm20
; AVX512-NEXT:    vpermt2d %zmm22, %zmm0, %zmm20
; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26]
; AVX512-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT:    vmovdqa64 %zmm24, %zmm23
; AVX512-NEXT:    vpermt2d %zmm22, %zmm6, %zmm23
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28]
; AVX512-NEXT:    vmovdqa64 %zmm24, %zmm5
; AVX512-NEXT:    vpermt2d %zmm22, %zmm2, %zmm5
; AVX512-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 %zmm17, %zmm25
; AVX512-NEXT:    vpermt2d %zmm18, %zmm0, %zmm25
; AVX512-NEXT:    vmovdqa64 %zmm17, %zmm26
; AVX512-NEXT:    vpermt2d %zmm18, %zmm6, %zmm26
; AVX512-NEXT:    vmovdqa64 %zmm17, %zmm5
; AVX512-NEXT:    vpermt2d %zmm18, %zmm2, %zmm5
; AVX512-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 %zmm28, %zmm5
; AVX512-NEXT:    vpermt2d %zmm16, %zmm0, %zmm5
; AVX512-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm5
; AVX512-NEXT:    vmovdqa64 %zmm28, %zmm27
; AVX512-NEXT:    vpermt2d %zmm16, %zmm6, %zmm27
; AVX512-NEXT:    vmovdqa64 %zmm28, %zmm0
; AVX512-NEXT:    vpermt2d %zmm16, %zmm2, %zmm0
; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22]
; AVX512-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT:    vmovdqa64 %zmm24, %zmm19
; AVX512-NEXT:    vpermt2d %zmm22, %zmm21, %zmm19
; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30]
; AVX512-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT:    vpermt2d %zmm22, %zmm0, %zmm24
; AVX512-NEXT:    vmovdqa64 %zmm17, %zmm22
; AVX512-NEXT:    vpermt2d %zmm18, %zmm21, %zmm22
; AVX512-NEXT:    vpermt2d %zmm18, %zmm0, %zmm17
; AVX512-NEXT:    vmovdqa64 %zmm28, %zmm18
; AVX512-NEXT:    vpermt2d %zmm16, %zmm21, %zmm18
; AVX512-NEXT:    vpermt2d %zmm16, %zmm0, %zmm28
; AVX512-NEXT:    vmovdqa 64(%rdx), %ymm1
; AVX512-NEXT:    vpermi2d %zmm13, %zmm15, %zmm5
; AVX512-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vpermi2d %zmm13, %zmm15, %zmm6
; AVX512-NEXT:    vmovdqu64 %zmm6, (%rsp) # 64-byte Spill
; AVX512-NEXT:    vpermi2d %zmm13, %zmm15, %zmm2
; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vpermi2d %zmm13, %zmm15, %zmm21
; AVX512-NEXT:    vpermt2d %zmm13, %zmm0, %zmm15
; AVX512-NEXT:    vmovdqa 128(%rdx), %ymm0
; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [3,11,0,8,7,15,4,12]
; AVX512-NEXT:    vpermt2d (%rcx), %ymm2, %ymm14
; AVX512-NEXT:    movb $36, %al
; AVX512-NEXT:    kmovw %eax, %k1
; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm3 {%k1} = zmm14[0,1,0,1,2,3,6,7]
; AVX512-NEXT:    vmovdqa64 %zmm30, %zmm13
; AVX512-NEXT:    vpermt2d %zmm8, %zmm29, %zmm13
; AVX512-NEXT:    vpermt2d 64(%rcx), %ymm2, %ymm1
; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm13 {%k1} = zmm1[0,1,0,1,2,3,6,7]
; AVX512-NEXT:    vmovdqa64 %zmm9, %zmm14
; AVX512-NEXT:    vmovdqa64 %zmm4, %zmm1
; AVX512-NEXT:    vpermt2d %zmm4, %zmm29, %zmm14
; AVX512-NEXT:    vpermt2d 128(%rcx), %ymm2, %ymm0
; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm0[0,1,0,1,2,3,6,7]
; AVX512-NEXT:    vmovdqa 192(%rdx), %ymm0
; AVX512-NEXT:    vpermt2d 192(%rcx), %ymm2, %ymm0
; AVX512-NEXT:    vmovdqa64 192(%rdi), %zmm12
; AVX512-NEXT:    vmovdqa64 192(%rsi), %zmm7
; AVX512-NEXT:    vpermi2d %zmm7, %zmm12, %zmm29
; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm0[0,1,0,1,2,3,6,7]
; AVX512-NEXT:    vmovdqa64 (%r8), %zmm4
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15]
; AVX512-NEXT:    vpermt2d %zmm4, %zmm0, %zmm3
; AVX512-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vmovdqa64 64(%r8), %zmm3
; AVX512-NEXT:    vpermt2d %zmm3, %zmm0, %zmm13
; AVX512-NEXT:    vmovdqa64 128(%r8), %zmm2
; AVX512-NEXT:    vpermt2d %zmm2, %zmm0, %zmm14
; AVX512-NEXT:    vmovdqa64 192(%r8), %zmm6
; AVX512-NEXT:    vpermt2d %zmm6, %zmm0, %zmm29
; AVX512-NEXT:    vmovdqa64 %zmm11, %zmm16
; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm31 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
; AVX512-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT:    vpermt2d %zmm10, %zmm31, %zmm16
; AVX512-NEXT:    movb $-110, %al
; AVX512-NEXT:    kmovw %eax, %k2
; AVX512-NEXT:    vmovdqa64 %zmm20, %zmm16 {%k2}
; AVX512-NEXT:    vmovdqa64 %zmm11, %zmm20
; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
; AVX512-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT:    vpermt2d %zmm10, %zmm5, %zmm20
; AVX512-NEXT:    vmovdqa64 %zmm23, %zmm20 {%k2}
; AVX512-NEXT:    vmovdqa64 %zmm11, %zmm23
; AVX512-NEXT:    vpunpckhdq {{.*#+}} zmm0 = zmm11[2],zmm10[2],zmm11[3],zmm10[3],zmm11[6],zmm10[6],zmm11[7],zmm10[7],zmm11[10],zmm10[10],zmm11[11],zmm10[11],zmm11[14],zmm10[14],zmm11[15],zmm10[15]
; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
; AVX512-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT:    vpermt2d %zmm10, %zmm11, %zmm23
; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm23 {%k1}
; AVX512-NEXT:    vmovdqa64 %zmm30, %zmm10
; AVX512-NEXT:    vpermt2d %zmm8, %zmm31, %zmm10
; AVX512-NEXT:    vmovdqa64 %zmm25, %zmm10 {%k2}
; AVX512-NEXT:    vmovdqa64 %zmm30, %zmm25
; AVX512-NEXT:    vpermt2d %zmm8, %zmm5, %zmm25
; AVX512-NEXT:    vmovdqa64 %zmm26, %zmm25 {%k2}
; AVX512-NEXT:    vmovdqa64 %zmm30, %zmm26
; AVX512-NEXT:    vpunpckhdq {{.*#+}} zmm30 = zmm30[2],zmm8[2],zmm30[3],zmm8[3],zmm30[6],zmm8[6],zmm30[7],zmm8[7],zmm30[10],zmm8[10],zmm30[11],zmm8[11],zmm30[14],zmm8[14],zmm30[15],zmm8[15]
; AVX512-NEXT:    vpermt2d %zmm8, %zmm11, %zmm26
; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k1}
; AVX512-NEXT:    vmovdqa64 %zmm9, %zmm0
; AVX512-NEXT:    vmovdqa64 %zmm9, %zmm8
; AVX512-NEXT:    vpermt2d %zmm1, %zmm31, %zmm8
; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm9, %zmm8 {%k2}
; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm9
; AVX512-NEXT:    vpermt2d %zmm1, %zmm5, %zmm9
; AVX512-NEXT:    vmovdqa64 %zmm27, %zmm9 {%k2}
; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm27
; AVX512-NEXT:    vpunpckhdq {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; AVX512-NEXT:    vpermt2d %zmm1, %zmm11, %zmm27
; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm27 {%k1}
; AVX512-NEXT:    vpermi2d %zmm7, %zmm12, %zmm31
; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm31 {%k2}
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15]
; AVX512-NEXT:    vpermt2d %zmm4, %zmm1, %zmm16
; AVX512-NEXT:    vpermt2d %zmm3, %zmm1, %zmm10
; AVX512-NEXT:    vpermt2d %zmm2, %zmm1, %zmm8
; AVX512-NEXT:    vpermt2d %zmm6, %zmm1, %zmm31
; AVX512-NEXT:    vpermi2d %zmm7, %zmm12, %zmm5
; AVX512-NEXT:    vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm5 {%k2}
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15]
; AVX512-NEXT:    vpermt2d %zmm4, %zmm1, %zmm20
; AVX512-NEXT:    vpermt2d %zmm3, %zmm1, %zmm25
; AVX512-NEXT:    vpermt2d %zmm2, %zmm1, %zmm9
; AVX512-NEXT:    vpermt2d %zmm6, %zmm1, %zmm5
; AVX512-NEXT:    vpermi2d %zmm7, %zmm12, %zmm11
; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm11 {%k1}
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15]
; AVX512-NEXT:    vpermt2d %zmm4, %zmm1, %zmm23
; AVX512-NEXT:    vpermt2d %zmm3, %zmm1, %zmm26
; AVX512-NEXT:    vpermt2d %zmm2, %zmm1, %zmm27
; AVX512-NEXT:    vpermt2d %zmm6, %zmm1, %zmm11
; AVX512-NEXT:    vmovdqa (%rdi), %ymm1
; AVX512-NEXT:    vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm1[2,3,2,3,2,3,2,3]
; AVX512-NEXT:    vmovdqa 64(%rdi), %ymm1
; AVX512-NEXT:    vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm1[2,3,2,3,2,3,2,3]
; AVX512-NEXT:    vmovdqa 128(%rdi), %ymm1
; AVX512-NEXT:    vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm1[2,3,2,3,2,3,2,3]
; AVX512-NEXT:    vmovdqa 192(%rdi), %ymm1
; AVX512-NEXT:    vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm1[2,3,2,3,2,3,2,3]
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0]
; AVX512-NEXT:    vpermt2d %zmm4, %zmm1, %zmm19
; AVX512-NEXT:    vpermt2d %zmm3, %zmm1, %zmm22
; AVX512-NEXT:    vpermt2d %zmm2, %zmm1, %zmm18
; AVX512-NEXT:    vpermt2d %zmm6, %zmm1, %zmm21
; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm1[6,7,6,7,6,7,6,7]
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0]
; AVX512-NEXT:    vpermt2d %zmm4, %zmm1, %zmm24
; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm30[6,7,6,7,6,7,6,7]
; AVX512-NEXT:    vpermt2d %zmm3, %zmm1, %zmm17
; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm28 {%k1} = zmm0[6,7,6,7,6,7,6,7]
; AVX512-NEXT:    vpermt2d %zmm2, %zmm1, %zmm28
; AVX512-NEXT:    vmovdqa64 (%r9), %zmm2
; AVX512-NEXT:    vpunpckhdq {{.*#+}} zmm3 = zmm12[2],zmm7[2],zmm12[3],zmm7[3],zmm12[6],zmm7[6],zmm12[7],zmm7[7],zmm12[10],zmm7[10],zmm12[11],zmm7[11],zmm12[14],zmm7[14],zmm12[15],zmm7[15]
; AVX512-NEXT:    vmovdqa64 64(%r9), %zmm4
; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm3[6,7,6,7,6,7,6,7]
; AVX512-NEXT:    vmovdqa64 128(%r9), %zmm3
; AVX512-NEXT:    vpermt2d %zmm6, %zmm1, %zmm15
; AVX512-NEXT:    vmovdqa64 192(%r9), %zmm1
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15]
; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512-NEXT:    vpermt2d %zmm2, %zmm0, %zmm6
; AVX512-NEXT:    vpermt2d %zmm4, %zmm0, %zmm13
; AVX512-NEXT:    vpermt2d %zmm3, %zmm0, %zmm14
; AVX512-NEXT:    vpermt2d %zmm1, %zmm0, %zmm29
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15]
; AVX512-NEXT:    vpermt2d %zmm2, %zmm0, %zmm16
; AVX512-NEXT:    vpermt2d %zmm4, %zmm0, %zmm10
; AVX512-NEXT:    vpermt2d %zmm3, %zmm0, %zmm8
; AVX512-NEXT:    vpermt2d %zmm1, %zmm0, %zmm31
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15]
; AVX512-NEXT:    vpermt2d %zmm2, %zmm0, %zmm20
; AVX512-NEXT:    vpermt2d %zmm4, %zmm0, %zmm25
; AVX512-NEXT:    vpermt2d %zmm3, %zmm0, %zmm9
; AVX512-NEXT:    vpermt2d %zmm1, %zmm0, %zmm5
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15]
; AVX512-NEXT:    vpermt2d %zmm2, %zmm0, %zmm23
; AVX512-NEXT:    vpermt2d %zmm4, %zmm0, %zmm26
; AVX512-NEXT:    vpermt2d %zmm3, %zmm0, %zmm27
; AVX512-NEXT:    vpermt2d %zmm1, %zmm0, %zmm11
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23]
; AVX512-NEXT:    vpermt2d %zmm2, %zmm0, %zmm19
; AVX512-NEXT:    vpermt2d %zmm4, %zmm0, %zmm22
; AVX512-NEXT:    vpermt2d %zmm3, %zmm0, %zmm18
; AVX512-NEXT:    vpermt2d %zmm1, %zmm0, %zmm21
; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31]
; AVX512-NEXT:    vpermt2d %zmm2, %zmm0, %zmm24
; AVX512-NEXT:    vpermt2d %zmm4, %zmm0, %zmm17
; AVX512-NEXT:    vpermt2d %zmm3, %zmm0, %zmm28
; AVX512-NEXT:    vpermt2d %zmm1, %zmm0, %zmm15
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT:    vmovdqa64 %zmm15, 1472(%rax)
; AVX512-NEXT:    vmovdqa64 %zmm11, 1408(%rax)
; AVX512-NEXT:    vmovdqa64 %zmm5, 1344(%rax)
; AVX512-NEXT:    vmovdqa64 %zmm31, 1152(%rax)
; AVX512-NEXT:    vmovdqa64 %zmm28, 1088(%rax)
; AVX512-NEXT:    vmovdqa64 %zmm27, 1024(%rax)
; AVX512-NEXT:    vmovdqa64 %zmm9, 960(%rax)
; AVX512-NEXT:    vmovdqa64 %zmm8, 768(%rax)
; AVX512-NEXT:    vmovdqa64 %zmm17, 704(%rax)
; AVX512-NEXT:    vmovdqa64 %zmm26, 640(%rax)
; AVX512-NEXT:    vmovdqa64 %zmm25, 576(%rax)
; AVX512-NEXT:    vmovdqa64 %zmm10, 384(%rax)
; AVX512-NEXT:    vmovdqa64 %zmm24, 320(%rax)
; AVX512-NEXT:    vmovdqa64 %zmm23, 256(%rax)
; AVX512-NEXT:    vmovdqa64 %zmm20, 192(%rax)
; AVX512-NEXT:    vmovdqa64 %zmm16, (%rax)
; AVX512-NEXT:    vmovdqa64 %zmm21, 1280(%rax)
; AVX512-NEXT:    vmovdqa64 %zmm29, 1216(%rax)
; AVX512-NEXT:    vmovdqa64 %zmm18, 896(%rax)
; AVX512-NEXT:    vmovdqa64 %zmm14, 832(%rax)
; AVX512-NEXT:    vmovdqa64 %zmm22, 512(%rax)
; AVX512-NEXT:    vmovdqa64 %zmm13, 448(%rax)
; AVX512-NEXT:    vmovdqa64 %zmm19, 128(%rax)
; AVX512-NEXT:    vmovdqa64 %zmm6, 64(%rax)
; AVX512-NEXT:    addq $456, %rsp # imm = 0x1C8
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512-FCP-LABEL: store_i32_stride6_vf64:
; AVX512-FCP:       # %bb.0:
; AVX512-FCP-NEXT:    subq $1160, %rsp # imm = 0x488
; AVX512-FCP-NEXT:    vmovdqa64 (%rdi), %zmm8
; AVX512-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm5
; AVX512-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm3
; AVX512-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm2
; AVX512-FCP-NEXT:    vmovdqa64 (%rsi), %zmm0
; AVX512-FCP-NEXT:    vmovdqa64 64(%rsi), %zmm24
; AVX512-FCP-NEXT:    vmovdqa64 128(%rsi), %zmm29
; AVX512-FCP-NEXT:    vmovdqa64 192(%rsi), %zmm23
; AVX512-FCP-NEXT:    vmovdqa64 (%rdx), %zmm4
; AVX512-FCP-NEXT:    vmovdqa64 (%rcx), %zmm21
; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
; AVX512-FCP-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, %zmm6
; AVX512-FCP-NEXT:    vpermt2d %zmm21, %zmm20, %zmm6
; AVX512-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
; AVX512-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm7, %zmm1
; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm16
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0]
; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
; AVX512-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm10, %zmm1
; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm17
; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23]
; AVX512-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm31
; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm25
; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm1, %zmm31
; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm8
; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
; AVX512-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT:    vmovdqa64 %zmm25, %zmm1
; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm11, %zmm1
; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm27
; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
; AVX512-FCP-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT:    vmovdqa64 %zmm25, %zmm1
; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm12, %zmm1
; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm30 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31]
; AVX512-FCP-NEXT:    # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm30, %zmm25
; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm0
; AVX512-FCP-NEXT:    vpermt2d %zmm24, %zmm7, %zmm0
; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm0
; AVX512-FCP-NEXT:    vpermt2d %zmm24, %zmm10, %zmm0
; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm0
; AVX512-FCP-NEXT:    vpermt2d %zmm24, %zmm8, %zmm0
; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm0
; AVX512-FCP-NEXT:    vpermt2d %zmm24, %zmm11, %zmm0
; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm0
; AVX512-FCP-NEXT:    vpermt2d %zmm24, %zmm12, %zmm0
; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT:    vpermt2d %zmm24, %zmm30, %zmm5
; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0
; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1
; AVX512-FCP-NEXT:    vpermt2d %zmm29, %zmm7, %zmm1
; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1
; AVX512-FCP-NEXT:    vpermt2d %zmm29, %zmm10, %zmm1
; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1
; AVX512-FCP-NEXT:    vpermt2d %zmm29, %zmm8, %zmm1
; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm26
; AVX512-FCP-NEXT:    vpermt2d %zmm29, %zmm11, %zmm26
; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm28
; AVX512-FCP-NEXT:    vpermt2d %zmm29, %zmm12, %zmm28
; AVX512-FCP-NEXT:    vpermt2d %zmm29, %zmm30, %zmm0
; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, %zmm19
; AVX512-FCP-NEXT:    vpermt2d %zmm21, %zmm9, %zmm19
; AVX512-FCP-NEXT:    vpermi2d %zmm23, %zmm2, %zmm7
; AVX512-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT:    vpermi2d %zmm23, %zmm2, %zmm10
; AVX512-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT:    vpermi2d %zmm23, %zmm2, %zmm8
; AVX512-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT:    vpermi2d %zmm23, %zmm2, %zmm11
; AVX512-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT:    vpermi2d %zmm23, %zmm2, %zmm12
; AVX512-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT:    vpermt2d %zmm23, %zmm30, %zmm2
; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm29 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22]
; AVX512-FCP-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, %zmm5
; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, %zmm23
; AVX512-FCP-NEXT:    vpermt2d %zmm21, %zmm29, %zmm23
; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26]
; AVX512-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm18
; AVX512-FCP-NEXT:    vpermt2d %zmm21, %zmm4, %zmm18
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm8 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28]
; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm7
; AVX512-FCP-NEXT:    vpermt2d %zmm21, %zmm8, %zmm7
; AVX512-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30]
; AVX512-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT:    vpermt2d %zmm21, %zmm2, %zmm5
; AVX512-FCP-NEXT:    vmovdqa64 64(%rdx), %zmm21
; AVX512-FCP-NEXT:    vmovdqa64 64(%rcx), %zmm0
; AVX512-FCP-NEXT:    vmovdqa64 %zmm21, %zmm22
; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm20, %zmm22
; AVX512-FCP-NEXT:    vmovdqa64 %zmm21, %zmm14
; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm24
; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm9, %zmm14
; AVX512-FCP-NEXT:    vmovdqa64 %zmm21, %zmm30
; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm29, %zmm30
; AVX512-FCP-NEXT:    vmovdqa64 %zmm21, %zmm10
; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm4, %zmm10
; AVX512-FCP-NEXT:    vmovdqa64 %zmm21, %zmm6
; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm8, %zmm6
; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm2, %zmm21
; AVX512-FCP-NEXT:    vmovdqa64 128(%rdx), %zmm13
; AVX512-FCP-NEXT:    vmovdqa64 128(%rcx), %zmm0
; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, %zmm15
; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm20, %zmm15
; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, %zmm12
; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm9, %zmm12
; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, %zmm3
; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm29, %zmm3
; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, %zmm11
; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm4, %zmm11
; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, %zmm9
; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm8, %zmm9
; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm2, %zmm13
; AVX512-FCP-NEXT:    vmovdqa64 192(%rdx), %zmm1
; AVX512-FCP-NEXT:    vmovdqa64 192(%rcx), %zmm0
; AVX512-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm20
; AVX512-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm24
; AVX512-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm29
; AVX512-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm4
; AVX512-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm8
; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm2, %zmm1
; AVX512-FCP-NEXT:    movb $-110, %al
; AVX512-FCP-NEXT:    kmovw %eax, %k2
; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm16 {%k2}
; AVX512-FCP-NEXT:    movb $36, %al
; AVX512-FCP-NEXT:    kmovw %eax, %k1
; AVX512-FCP-NEXT:    vmovdqa64 %zmm19, %zmm17 {%k1}
; AVX512-FCP-NEXT:    vmovdqa64 %zmm31, %zmm23 {%k1}
; AVX512-FCP-NEXT:    vmovdqa64 %zmm18, %zmm27 {%k2}
; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm19 {%k1}
; AVX512-FCP-NEXT:    vmovdqa64 %zmm25, %zmm5 {%k1}
; AVX512-FCP-NEXT:    vmovdqa64 (%r8), %zmm0
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15]
; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm2, %zmm16
; AVX512-FCP-NEXT:    vmovdqu64 %zmm16, (%rsp) # 64-byte Spill
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15]
; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm7, %zmm17
; AVX512-FCP-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0]
; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm18, %zmm23
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm25 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15]
; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm25, %zmm27
; AVX512-FCP-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm17 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15]
; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm17, %zmm19
; AVX512-FCP-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm31 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0]
; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm31, %zmm5
; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, %zmm27 {%k2}
; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, %zmm22 {%k1}
; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm30 {%k1}
; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, %zmm16 {%k2}
; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm10 {%k1}
; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
; AVX512-FCP-NEXT:    vmovdqa64 64(%r8), %zmm0
; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm2, %zmm27
; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm7, %zmm22
; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm18, %zmm30
; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm25, %zmm16
; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, %zmm19
; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm17, %zmm10
; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, %zmm14
; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, %zmm17
; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm31, %zmm21
; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, %zmm16 {%k2}
; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, %zmm15 {%k1}
; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k1}
; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, %zmm26 {%k2}
; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm28 {%k1}
; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k1}
; AVX512-FCP-NEXT:    vmovdqa64 128(%r8), %zmm0
; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm2, %zmm16
; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm7, %zmm15
; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm18, %zmm3
; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm25, %zmm26
; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm14, %zmm28
; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm31, %zmm13
; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512-FCP-NEXT:    vmovdqa64 %zmm20, %zmm6 {%k2}
; AVX512-FCP-NEXT:    vmovdqa64 192(%r8), %zmm0
; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm2, %zmm6
; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm9
; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512-FCP-NEXT:    vmovdqa64 %zmm24, %zmm2 {%k1}
; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm7, %zmm2
; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm10
; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm29 {%k1}
; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm18, %zmm29
; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k2}
; AVX512-FCP-NEXT:    vmovdqa64 (%r9), %zmm2
; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm25, %zmm6
; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm12
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15]
; AVX512-FCP-NEXT:    vmovdqu64 (%rsp), %zmm18 # 64-byte Reload
; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm4, %zmm18
; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, %zmm6 {%k1}
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm11 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15]
; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm11, %zmm20
; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm14, %zmm6
; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm14
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm6 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23]
; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm6, %zmm23
; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm1 {%k1}
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15]
; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm7, %zmm24
; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm31, %zmm1
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15]
; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm25
; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm8 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31]
; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm8, %zmm5
; AVX512-FCP-NEXT:    vmovdqa64 64(%r9), %zmm2
; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm4, %zmm27
; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm11, %zmm22
; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm6, %zmm30
; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm7, %zmm19
; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm17
; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm8, %zmm21
; AVX512-FCP-NEXT:    vmovdqa64 128(%r9), %zmm2
; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm4, %zmm16
; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm11, %zmm15
; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm6, %zmm3
; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm7, %zmm26
; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm28
; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm8, %zmm13
; AVX512-FCP-NEXT:    vmovdqa64 192(%r9), %zmm2
; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm4, %zmm9
; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm11, %zmm10
; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm6, %zmm29
; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm7, %zmm12
; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm14
; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm8, %zmm1
; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, 1472(%rax)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, 1408(%rax)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, 1344(%rax)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm29, 1280(%rax)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, 1216(%rax)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, 1152(%rax)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, 1088(%rax)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm28, 1024(%rax)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm26, 960(%rax)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, 896(%rax)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, 832(%rax)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, 768(%rax)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm21, 704(%rax)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, 640(%rax)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm19, 576(%rax)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm30, 512(%rax)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, 448(%rax)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm27, 384(%rax)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, 320(%rax)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm25, 256(%rax)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm24, 192(%rax)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm23, 128(%rax)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm20, 64(%rax)
; AVX512-FCP-NEXT:    vmovdqa64 %zmm18, (%rax)
; AVX512-FCP-NEXT:    addq $1160, %rsp # imm = 0x488
; AVX512-FCP-NEXT:    vzeroupper
; AVX512-FCP-NEXT:    retq
;
; AVX512DQ-LABEL: store_i32_stride6_vf64:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    subq $456, %rsp # imm = 0x1C8
; AVX512DQ-NEXT:    vmovdqa64 128(%rdi), %zmm9
; AVX512DQ-NEXT:    vmovdqa64 64(%rdi), %zmm30
; AVX512DQ-NEXT:    vmovdqa64 (%rdi), %zmm11
; AVX512DQ-NEXT:    vmovdqa64 (%rsi), %zmm10
; AVX512DQ-NEXT:    vmovdqa64 64(%rsi), %zmm8
; AVX512DQ-NEXT:    vmovdqa64 128(%rsi), %zmm4
; AVX512DQ-NEXT:    vmovdqa64 192(%rdx), %zmm15
; AVX512DQ-NEXT:    vmovdqa64 128(%rdx), %zmm28
; AVX512DQ-NEXT:    vmovdqa64 64(%rdx), %zmm17
; AVX512DQ-NEXT:    vmovdqa64 (%rdx), %zmm24
; AVX512DQ-NEXT:    vmovdqa64 (%rcx), %zmm22
; AVX512DQ-NEXT:    vmovdqa64 64(%rcx), %zmm18
; AVX512DQ-NEXT:    vmovdqa64 128(%rcx), %zmm16
; AVX512DQ-NEXT:    vmovdqa64 192(%rcx), %zmm13
; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm29 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
; AVX512DQ-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT:    vmovdqa64 %zmm11, %zmm0
; AVX512DQ-NEXT:    vpermt2d %zmm10, %zmm29, %zmm0
; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm3
; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm14
; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
; AVX512DQ-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT:    vmovdqa64 %zmm24, %zmm20
; AVX512DQ-NEXT:    vpermt2d %zmm22, %zmm0, %zmm20
; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26]
; AVX512DQ-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT:    vmovdqa64 %zmm24, %zmm23
; AVX512DQ-NEXT:    vpermt2d %zmm22, %zmm6, %zmm23
; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28]
; AVX512DQ-NEXT:    vmovdqa64 %zmm24, %zmm5
; AVX512DQ-NEXT:    vpermt2d %zmm22, %zmm2, %zmm5
; AVX512DQ-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT:    vmovdqa64 %zmm17, %zmm25
; AVX512DQ-NEXT:    vpermt2d %zmm18, %zmm0, %zmm25
; AVX512DQ-NEXT:    vmovdqa64 %zmm17, %zmm26
; AVX512DQ-NEXT:    vpermt2d %zmm18, %zmm6, %zmm26
; AVX512DQ-NEXT:    vmovdqa64 %zmm17, %zmm5
; AVX512DQ-NEXT:    vpermt2d %zmm18, %zmm2, %zmm5
; AVX512DQ-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT:    vmovdqa64 %zmm28, %zmm5
; AVX512DQ-NEXT:    vpermt2d %zmm16, %zmm0, %zmm5
; AVX512DQ-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm5
; AVX512DQ-NEXT:    vmovdqa64 %zmm28, %zmm27
; AVX512DQ-NEXT:    vpermt2d %zmm16, %zmm6, %zmm27
; AVX512DQ-NEXT:    vmovdqa64 %zmm28, %zmm0
10432; AVX512DQ-NEXT:    vpermt2d %zmm16, %zmm2, %zmm0
10433; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10434; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22]
10435; AVX512DQ-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
10436; AVX512DQ-NEXT:    vmovdqa64 %zmm24, %zmm19
10437; AVX512DQ-NEXT:    vpermt2d %zmm22, %zmm21, %zmm19
10438; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30]
10439; AVX512DQ-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
10440; AVX512DQ-NEXT:    vpermt2d %zmm22, %zmm0, %zmm24
10441; AVX512DQ-NEXT:    vmovdqa64 %zmm17, %zmm22
10442; AVX512DQ-NEXT:    vpermt2d %zmm18, %zmm21, %zmm22
10443; AVX512DQ-NEXT:    vpermt2d %zmm18, %zmm0, %zmm17
10444; AVX512DQ-NEXT:    vmovdqa64 %zmm28, %zmm18
10445; AVX512DQ-NEXT:    vpermt2d %zmm16, %zmm21, %zmm18
10446; AVX512DQ-NEXT:    vpermt2d %zmm16, %zmm0, %zmm28
10447; AVX512DQ-NEXT:    vmovdqa 64(%rdx), %ymm1
10448; AVX512DQ-NEXT:    vpermi2d %zmm13, %zmm15, %zmm5
10449; AVX512DQ-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10450; AVX512DQ-NEXT:    vpermi2d %zmm13, %zmm15, %zmm6
10451; AVX512DQ-NEXT:    vmovdqu64 %zmm6, (%rsp) # 64-byte Spill
10452; AVX512DQ-NEXT:    vpermi2d %zmm13, %zmm15, %zmm2
10453; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10454; AVX512DQ-NEXT:    vpermi2d %zmm13, %zmm15, %zmm21
10455; AVX512DQ-NEXT:    vpermt2d %zmm13, %zmm0, %zmm15
10456; AVX512DQ-NEXT:    vmovdqa 128(%rdx), %ymm0
10457; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [3,11,0,8,7,15,4,12]
10458; AVX512DQ-NEXT:    vpermt2d (%rcx), %ymm2, %ymm14
10459; AVX512DQ-NEXT:    movb $36, %al
10460; AVX512DQ-NEXT:    kmovw %eax, %k1
10461; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm3 {%k1} = zmm14[0,1,0,1,2,3,6,7]
10462; AVX512DQ-NEXT:    vmovdqa64 %zmm30, %zmm13
10463; AVX512DQ-NEXT:    vpermt2d %zmm8, %zmm29, %zmm13
10464; AVX512DQ-NEXT:    vpermt2d 64(%rcx), %ymm2, %ymm1
10465; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm13 {%k1} = zmm1[0,1,0,1,2,3,6,7]
10466; AVX512DQ-NEXT:    vmovdqa64 %zmm9, %zmm14
10467; AVX512DQ-NEXT:    vmovdqa64 %zmm4, %zmm1
10468; AVX512DQ-NEXT:    vpermt2d %zmm4, %zmm29, %zmm14
10469; AVX512DQ-NEXT:    vpermt2d 128(%rcx), %ymm2, %ymm0
10470; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm0[0,1,0,1,2,3,6,7]
10471; AVX512DQ-NEXT:    vmovdqa 192(%rdx), %ymm0
10472; AVX512DQ-NEXT:    vpermt2d 192(%rcx), %ymm2, %ymm0
10473; AVX512DQ-NEXT:    vmovdqa64 192(%rdi), %zmm12
10474; AVX512DQ-NEXT:    vmovdqa64 192(%rsi), %zmm7
10475; AVX512DQ-NEXT:    vpermi2d %zmm7, %zmm12, %zmm29
10476; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm0[0,1,0,1,2,3,6,7]
10477; AVX512DQ-NEXT:    vmovdqa64 (%r8), %zmm4
10478; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15]
10479; AVX512DQ-NEXT:    vpermt2d %zmm4, %zmm0, %zmm3
10480; AVX512DQ-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10481; AVX512DQ-NEXT:    vmovdqa64 64(%r8), %zmm3
10482; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm0, %zmm13
10483; AVX512DQ-NEXT:    vmovdqa64 128(%r8), %zmm2
10484; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm0, %zmm14
10485; AVX512DQ-NEXT:    vmovdqa64 192(%r8), %zmm6
10486; AVX512DQ-NEXT:    vpermt2d %zmm6, %zmm0, %zmm29
10487; AVX512DQ-NEXT:    vmovdqa64 %zmm11, %zmm16
10488; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm31 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
10489; AVX512DQ-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3]
10490; AVX512DQ-NEXT:    vpermt2d %zmm10, %zmm31, %zmm16
10491; AVX512DQ-NEXT:    movb $-110, %al
10492; AVX512DQ-NEXT:    kmovw %eax, %k2
10493; AVX512DQ-NEXT:    vmovdqa64 %zmm20, %zmm16 {%k2}
10494; AVX512DQ-NEXT:    vmovdqa64 %zmm11, %zmm20
10495; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
10496; AVX512DQ-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
10497; AVX512DQ-NEXT:    vpermt2d %zmm10, %zmm5, %zmm20
10498; AVX512DQ-NEXT:    vmovdqa64 %zmm23, %zmm20 {%k2}
10499; AVX512DQ-NEXT:    vmovdqa64 %zmm11, %zmm23
10500; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} zmm0 = zmm11[2],zmm10[2],zmm11[3],zmm10[3],zmm11[6],zmm10[6],zmm11[7],zmm10[7],zmm11[10],zmm10[10],zmm11[11],zmm10[11],zmm11[14],zmm10[14],zmm11[15],zmm10[15]
10501; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10502; AVX512DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
10503; AVX512DQ-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
10504; AVX512DQ-NEXT:    vpermt2d %zmm10, %zmm11, %zmm23
10505; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10506; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm23 {%k1}
10507; AVX512DQ-NEXT:    vmovdqa64 %zmm30, %zmm10
10508; AVX512DQ-NEXT:    vpermt2d %zmm8, %zmm31, %zmm10
10509; AVX512DQ-NEXT:    vmovdqa64 %zmm25, %zmm10 {%k2}
10510; AVX512DQ-NEXT:    vmovdqa64 %zmm30, %zmm25
10511; AVX512DQ-NEXT:    vpermt2d %zmm8, %zmm5, %zmm25
10512; AVX512DQ-NEXT:    vmovdqa64 %zmm26, %zmm25 {%k2}
10513; AVX512DQ-NEXT:    vmovdqa64 %zmm30, %zmm26
10514; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} zmm30 = zmm30[2],zmm8[2],zmm30[3],zmm8[3],zmm30[6],zmm8[6],zmm30[7],zmm8[7],zmm30[10],zmm8[10],zmm30[11],zmm8[11],zmm30[14],zmm8[14],zmm30[15],zmm8[15]
10515; AVX512DQ-NEXT:    vpermt2d %zmm8, %zmm11, %zmm26
10516; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10517; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k1}
10518; AVX512DQ-NEXT:    vmovdqa64 %zmm9, %zmm0
10519; AVX512DQ-NEXT:    vmovdqa64 %zmm9, %zmm8
10520; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm31, %zmm8
10521; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
10522; AVX512DQ-NEXT:    vmovdqa64 %zmm9, %zmm8 {%k2}
10523; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm9
10524; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm5, %zmm9
10525; AVX512DQ-NEXT:    vmovdqa64 %zmm27, %zmm9 {%k2}
10526; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm27
10527; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
10528; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm11, %zmm27
10529; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
10530; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm27 {%k1}
10531; AVX512DQ-NEXT:    vpermi2d %zmm7, %zmm12, %zmm31
10532; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
10533; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm31 {%k2}
10534; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15]
10535; AVX512DQ-NEXT:    vpermt2d %zmm4, %zmm1, %zmm16
10536; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm1, %zmm10
10537; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm1, %zmm8
10538; AVX512DQ-NEXT:    vpermt2d %zmm6, %zmm1, %zmm31
10539; AVX512DQ-NEXT:    vpermi2d %zmm7, %zmm12, %zmm5
10540; AVX512DQ-NEXT:    vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
10541; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm5 {%k2}
10542; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15]
10543; AVX512DQ-NEXT:    vpermt2d %zmm4, %zmm1, %zmm20
10544; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm1, %zmm25
10545; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm1, %zmm9
10546; AVX512DQ-NEXT:    vpermt2d %zmm6, %zmm1, %zmm5
10547; AVX512DQ-NEXT:    vpermi2d %zmm7, %zmm12, %zmm11
10548; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
10549; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm11 {%k1}
10550; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15]
10551; AVX512DQ-NEXT:    vpermt2d %zmm4, %zmm1, %zmm23
10552; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm1, %zmm26
10553; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm1, %zmm27
10554; AVX512DQ-NEXT:    vpermt2d %zmm6, %zmm1, %zmm11
10555; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm1
10556; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
10557; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm1[2,3,2,3,2,3,2,3]
10558; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %ymm1
10559; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
10560; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm1[2,3,2,3,2,3,2,3]
10561; AVX512DQ-NEXT:    vmovdqa 128(%rdi), %ymm1
10562; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
10563; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm1[2,3,2,3,2,3,2,3]
10564; AVX512DQ-NEXT:    vmovdqa 192(%rdi), %ymm1
10565; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
10566; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm1[2,3,2,3,2,3,2,3]
10567; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0]
10568; AVX512DQ-NEXT:    vpermt2d %zmm4, %zmm1, %zmm19
10569; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm1, %zmm22
10570; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm1, %zmm18
10571; AVX512DQ-NEXT:    vpermt2d %zmm6, %zmm1, %zmm21
10572; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
10573; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm1[6,7,6,7,6,7,6,7]
10574; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0]
10575; AVX512DQ-NEXT:    vpermt2d %zmm4, %zmm1, %zmm24
10576; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm30[6,7,6,7,6,7,6,7]
10577; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm1, %zmm17
10578; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm28 {%k1} = zmm0[6,7,6,7,6,7,6,7]
10579; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm1, %zmm28
10580; AVX512DQ-NEXT:    vmovdqa64 (%r9), %zmm2
10581; AVX512DQ-NEXT:    vpunpckhdq {{.*#+}} zmm3 = zmm12[2],zmm7[2],zmm12[3],zmm7[3],zmm12[6],zmm7[6],zmm12[7],zmm7[7],zmm12[10],zmm7[10],zmm12[11],zmm7[11],zmm12[14],zmm7[14],zmm12[15],zmm7[15]
10582; AVX512DQ-NEXT:    vmovdqa64 64(%r9), %zmm4
10583; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm3[6,7,6,7,6,7,6,7]
10584; AVX512DQ-NEXT:    vmovdqa64 128(%r9), %zmm3
10585; AVX512DQ-NEXT:    vpermt2d %zmm6, %zmm1, %zmm15
10586; AVX512DQ-NEXT:    vmovdqa64 192(%r9), %zmm1
10587; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15]
10588; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
10589; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm0, %zmm6
10590; AVX512DQ-NEXT:    vpermt2d %zmm4, %zmm0, %zmm13
10591; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm0, %zmm14
10592; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm0, %zmm29
10593; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15]
10594; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm0, %zmm16
10595; AVX512DQ-NEXT:    vpermt2d %zmm4, %zmm0, %zmm10
10596; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm0, %zmm8
10597; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm0, %zmm31
10598; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15]
10599; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm0, %zmm20
10600; AVX512DQ-NEXT:    vpermt2d %zmm4, %zmm0, %zmm25
10601; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm0, %zmm9
10602; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm0, %zmm5
10603; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15]
10604; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm0, %zmm23
10605; AVX512DQ-NEXT:    vpermt2d %zmm4, %zmm0, %zmm26
10606; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm0, %zmm27
10607; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm0, %zmm11
10608; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23]
10609; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm0, %zmm19
10610; AVX512DQ-NEXT:    vpermt2d %zmm4, %zmm0, %zmm22
10611; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm0, %zmm18
10612; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm0, %zmm21
10613; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31]
10614; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm0, %zmm24
10615; AVX512DQ-NEXT:    vpermt2d %zmm4, %zmm0, %zmm17
10616; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm0, %zmm28
10617; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm0, %zmm15
10618; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
10619; AVX512DQ-NEXT:    vmovdqa64 %zmm15, 1472(%rax)
10620; AVX512DQ-NEXT:    vmovdqa64 %zmm11, 1408(%rax)
10621; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 1344(%rax)
10622; AVX512DQ-NEXT:    vmovdqa64 %zmm31, 1152(%rax)
10623; AVX512DQ-NEXT:    vmovdqa64 %zmm28, 1088(%rax)
10624; AVX512DQ-NEXT:    vmovdqa64 %zmm27, 1024(%rax)
10625; AVX512DQ-NEXT:    vmovdqa64 %zmm9, 960(%rax)
10626; AVX512DQ-NEXT:    vmovdqa64 %zmm8, 768(%rax)
10627; AVX512DQ-NEXT:    vmovdqa64 %zmm17, 704(%rax)
10628; AVX512DQ-NEXT:    vmovdqa64 %zmm26, 640(%rax)
10629; AVX512DQ-NEXT:    vmovdqa64 %zmm25, 576(%rax)
10630; AVX512DQ-NEXT:    vmovdqa64 %zmm10, 384(%rax)
10631; AVX512DQ-NEXT:    vmovdqa64 %zmm24, 320(%rax)
10632; AVX512DQ-NEXT:    vmovdqa64 %zmm23, 256(%rax)
10633; AVX512DQ-NEXT:    vmovdqa64 %zmm20, 192(%rax)
10634; AVX512DQ-NEXT:    vmovdqa64 %zmm16, (%rax)
10635; AVX512DQ-NEXT:    vmovdqa64 %zmm21, 1280(%rax)
10636; AVX512DQ-NEXT:    vmovdqa64 %zmm29, 1216(%rax)
10637; AVX512DQ-NEXT:    vmovdqa64 %zmm18, 896(%rax)
10638; AVX512DQ-NEXT:    vmovdqa64 %zmm14, 832(%rax)
10639; AVX512DQ-NEXT:    vmovdqa64 %zmm22, 512(%rax)
10640; AVX512DQ-NEXT:    vmovdqa64 %zmm13, 448(%rax)
10641; AVX512DQ-NEXT:    vmovdqa64 %zmm19, 128(%rax)
10642; AVX512DQ-NEXT:    vmovdqa64 %zmm6, 64(%rax)
10643; AVX512DQ-NEXT:    addq $456, %rsp # imm = 0x1C8
10644; AVX512DQ-NEXT:    vzeroupper
10645; AVX512DQ-NEXT:    retq
10646;
10647; AVX512DQ-FCP-LABEL: store_i32_stride6_vf64:
10648; AVX512DQ-FCP:       # %bb.0:
10649; AVX512DQ-FCP-NEXT:    subq $1160, %rsp # imm = 0x488
10650; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdi), %zmm8
10651; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm5
10652; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm3
10653; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm2
10654; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rsi), %zmm0
10655; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rsi), %zmm24
10656; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rsi), %zmm29
10657; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rsi), %zmm23
10658; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rdx), %zmm4
10659; AVX512DQ-FCP-NEXT:    vmovdqa64 (%rcx), %zmm21
10660; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
10661; AVX512DQ-FCP-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
10662; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, %zmm6
10663; AVX512DQ-FCP-NEXT:    vpermt2d %zmm21, %zmm20, %zmm6
10664; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10665; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
10666; AVX512DQ-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
10667; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
10668; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm7, %zmm1
10669; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm16
10670; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0]
10671; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
10672; AVX512DQ-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
10673; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
10674; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm10, %zmm1
10675; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm17
10676; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23]
10677; AVX512DQ-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
10678; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm31
10679; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm25
10680; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm1, %zmm31
10681; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm8
10682; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
10683; AVX512DQ-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
10684; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm25, %zmm1
10685; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm11, %zmm1
10686; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm27
10687; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
10688; AVX512DQ-FCP-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
10689; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm25, %zmm1
10690; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm12, %zmm1
10691; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10692; AVX512DQ-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm30 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31]
10693; AVX512DQ-FCP-NEXT:    # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
10694; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm30, %zmm25
10695; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm0
10696; AVX512DQ-FCP-NEXT:    vpermt2d %zmm24, %zmm7, %zmm0
10697; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10698; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm0
10699; AVX512DQ-FCP-NEXT:    vpermt2d %zmm24, %zmm10, %zmm0
10700; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10701; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm0
10702; AVX512DQ-FCP-NEXT:    vpermt2d %zmm24, %zmm8, %zmm0
10703; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10704; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm0
10705; AVX512DQ-FCP-NEXT:    vpermt2d %zmm24, %zmm11, %zmm0
10706; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10707; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm0
10708; AVX512DQ-FCP-NEXT:    vpermt2d %zmm24, %zmm12, %zmm0
10709; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10710; AVX512DQ-FCP-NEXT:    vpermt2d %zmm24, %zmm30, %zmm5
10711; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10712; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0
10713; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1
10714; AVX512DQ-FCP-NEXT:    vpermt2d %zmm29, %zmm7, %zmm1
10715; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10716; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1
10717; AVX512DQ-FCP-NEXT:    vpermt2d %zmm29, %zmm10, %zmm1
10718; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10719; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1
10720; AVX512DQ-FCP-NEXT:    vpermt2d %zmm29, %zmm8, %zmm1
10721; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10722; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm26
10723; AVX512DQ-FCP-NEXT:    vpermt2d %zmm29, %zmm11, %zmm26
10724; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm28
10725; AVX512DQ-FCP-NEXT:    vpermt2d %zmm29, %zmm12, %zmm28
10726; AVX512DQ-FCP-NEXT:    vpermt2d %zmm29, %zmm30, %zmm0
10727; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10728; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, %zmm19
10729; AVX512DQ-FCP-NEXT:    vpermt2d %zmm21, %zmm9, %zmm19
10730; AVX512DQ-FCP-NEXT:    vpermi2d %zmm23, %zmm2, %zmm7
10731; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10732; AVX512DQ-FCP-NEXT:    vpermi2d %zmm23, %zmm2, %zmm10
10733; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10734; AVX512DQ-FCP-NEXT:    vpermi2d %zmm23, %zmm2, %zmm8
10735; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10736; AVX512DQ-FCP-NEXT:    vpermi2d %zmm23, %zmm2, %zmm11
10737; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10738; AVX512DQ-FCP-NEXT:    vpermi2d %zmm23, %zmm2, %zmm12
10739; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10740; AVX512DQ-FCP-NEXT:    vpermt2d %zmm23, %zmm30, %zmm2
10741; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10742; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm29 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22]
10743; AVX512DQ-FCP-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3]
10744; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, %zmm5
10745; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, %zmm23
10746; AVX512DQ-FCP-NEXT:    vpermt2d %zmm21, %zmm29, %zmm23
10747; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26]
10748; AVX512DQ-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
10749; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm18
10750; AVX512DQ-FCP-NEXT:    vpermt2d %zmm21, %zmm4, %zmm18
10751; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm8 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28]
10752; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm7
10753; AVX512DQ-FCP-NEXT:    vpermt2d %zmm21, %zmm8, %zmm7
10754; AVX512DQ-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30]
10755; AVX512DQ-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
10756; AVX512DQ-FCP-NEXT:    vpermt2d %zmm21, %zmm2, %zmm5
10757; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rdx), %zmm21
10758; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%rcx), %zmm0
10759; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, %zmm22
10760; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm20, %zmm22
10761; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, %zmm14
10762; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm24
10763; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm9, %zmm14
10764; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, %zmm30
10765; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm29, %zmm30
10766; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, %zmm10
10767; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm4, %zmm10
10768; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, %zmm6
10769; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm8, %zmm6
10770; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm2, %zmm21
10771; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rdx), %zmm13
10772; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%rcx), %zmm0
10773; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, %zmm15
10774; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm20, %zmm15
10775; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, %zmm12
10776; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm9, %zmm12
10777; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, %zmm3
10778; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm29, %zmm3
10779; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, %zmm11
10780; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm4, %zmm11
10781; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, %zmm9
10782; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm8, %zmm9
10783; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm2, %zmm13
10784; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rdx), %zmm1
10785; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%rcx), %zmm0
10786; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm20
10787; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm24
10788; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm29
10789; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm4
10790; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm8
10791; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm2, %zmm1
10792; AVX512DQ-FCP-NEXT:    movb $-110, %al
10793; AVX512DQ-FCP-NEXT:    kmovw %eax, %k2
10794; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10795; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm16 {%k2}
10796; AVX512DQ-FCP-NEXT:    movb $36, %al
10797; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
10798; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, %zmm17 {%k1}
10799; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm31, %zmm23 {%k1}
10800; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm18, %zmm27 {%k2}
10801; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
10802; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm19 {%k1}
10803; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm25, %zmm5 {%k1}
10804; AVX512DQ-FCP-NEXT:    vmovdqa64 (%r8), %zmm0
10805; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15]
10806; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm2, %zmm16
10807; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm16, (%rsp) # 64-byte Spill
10808; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15]
10809; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm7, %zmm17
10810; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10811; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0]
10812; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm18, %zmm23
10813; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm25 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15]
10814; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm25, %zmm27
10815; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10816; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm17 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15]
10817; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm17, %zmm19
10818; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10819; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm31 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0]
10820; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm31, %zmm5
10821; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
10822; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, %zmm27 {%k2}
10823; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
10824; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, %zmm22 {%k1}
10825; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10826; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm30 {%k1}
10827; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
10828; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, %zmm16 {%k2}
10829; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
10830; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm10 {%k1}
10831; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10832; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
10833; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%r8), %zmm0
10834; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm2, %zmm27
10835; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm7, %zmm22
10836; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm18, %zmm30
10837; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm25, %zmm16
10838; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, %zmm19
10839; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm17, %zmm10
10840; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, %zmm14
10841; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, %zmm17
10842; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm31, %zmm21
10843; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
10844; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, %zmm16 {%k2}
10845; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
10846; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, %zmm15 {%k1}
10847; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10848; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k1}
10849; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, %zmm26 {%k2}
10850; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm28 {%k1}
10851; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10852; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k1}
10853; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%r8), %zmm0
10854; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm2, %zmm16
10855; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm7, %zmm15
10856; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm18, %zmm3
10857; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm25, %zmm26
10858; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm14, %zmm28
10859; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm31, %zmm13
10860; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
10861; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm20, %zmm6 {%k2}
10862; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%r8), %zmm0
10863; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm2, %zmm6
10864; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm9
10865; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
10866; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm24, %zmm2 {%k1}
10867; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm7, %zmm2
10868; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm10
10869; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
10870; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm29 {%k1}
10871; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm18, %zmm29
10872; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
10873; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k2}
10874; AVX512DQ-FCP-NEXT:    vmovdqa64 (%r9), %zmm2
10875; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm25, %zmm6
10876; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm12
10877; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15]
10878; AVX512DQ-FCP-NEXT:    vmovdqu64 (%rsp), %zmm18 # 64-byte Reload
10879; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm4, %zmm18
10880; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
10881; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, %zmm6 {%k1}
10882; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm11 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15]
10883; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
10884; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm11, %zmm20
10885; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm14, %zmm6
10886; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm14
10887; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm6 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23]
10888; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm6, %zmm23
10889; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
10890; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm1 {%k1}
10891; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15]
10892; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
10893; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm7, %zmm24
10894; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm31, %zmm1
10895; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15]
10896; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
10897; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm25
10898; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm8 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31]
10899; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm8, %zmm5
10900; AVX512DQ-FCP-NEXT:    vmovdqa64 64(%r9), %zmm2
10901; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm4, %zmm27
10902; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm11, %zmm22
10903; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm6, %zmm30
10904; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm7, %zmm19
10905; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm17
10906; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm8, %zmm21
10907; AVX512DQ-FCP-NEXT:    vmovdqa64 128(%r9), %zmm2
10908; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm4, %zmm16
10909; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm11, %zmm15
10910; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm6, %zmm3
10911; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm7, %zmm26
10912; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm28
10913; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm8, %zmm13
10914; AVX512DQ-FCP-NEXT:    vmovdqa64 192(%r9), %zmm2
10915; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm4, %zmm9
10916; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm11, %zmm10
10917; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm6, %zmm29
10918; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm7, %zmm12
10919; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm14
10920; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm8, %zmm1
10921; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
10922; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 1472(%rax)
10923; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, 1408(%rax)
10924; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, 1344(%rax)
10925; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm29, 1280(%rax)
10926; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, 1216(%rax)
10927; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, 1152(%rax)
10928; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, 1088(%rax)
10929; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm28, 1024(%rax)
10930; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm26, 960(%rax)
10931; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, 896(%rax)
10932; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, 832(%rax)
10933; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, 768(%rax)
10934; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, 704(%rax)
10935; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, 640(%rax)
10936; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, 576(%rax)
10937; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm30, 512(%rax)
10938; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, 448(%rax)
10939; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm27, 384(%rax)
10940; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, 320(%rax)
10941; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm25, 256(%rax)
10942; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm24, 192(%rax)
10943; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm23, 128(%rax)
10944; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm20, 64(%rax)
10945; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm18, (%rax)
10946; AVX512DQ-FCP-NEXT:    addq $1160, %rsp # imm = 0x488
10947; AVX512DQ-FCP-NEXT:    vzeroupper
10948; AVX512DQ-FCP-NEXT:    retq
10949;
10950; AVX512BW-LABEL: store_i32_stride6_vf64:
10951; AVX512BW:       # %bb.0:
10952; AVX512BW-NEXT:    subq $456, %rsp # imm = 0x1C8
10953; AVX512BW-NEXT:    vmovdqa64 128(%rdi), %zmm9
10954; AVX512BW-NEXT:    vmovdqa64 64(%rdi), %zmm30
10955; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm11
10956; AVX512BW-NEXT:    vmovdqa64 (%rsi), %zmm10
10957; AVX512BW-NEXT:    vmovdqa64 64(%rsi), %zmm8
10958; AVX512BW-NEXT:    vmovdqa64 128(%rsi), %zmm4
10959; AVX512BW-NEXT:    vmovdqa64 192(%rdx), %zmm15
10960; AVX512BW-NEXT:    vmovdqa64 128(%rdx), %zmm28
10961; AVX512BW-NEXT:    vmovdqa64 64(%rdx), %zmm17
10962; AVX512BW-NEXT:    vmovdqa64 (%rdx), %zmm24
10963; AVX512BW-NEXT:    vmovdqa64 (%rcx), %zmm22
10964; AVX512BW-NEXT:    vmovdqa64 64(%rcx), %zmm18
10965; AVX512BW-NEXT:    vmovdqa64 128(%rcx), %zmm16
10966; AVX512BW-NEXT:    vmovdqa64 192(%rcx), %zmm13
10967; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm29 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
10968; AVX512BW-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3]
10969; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm0
10970; AVX512BW-NEXT:    vpermt2d %zmm10, %zmm29, %zmm0
10971; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm3
10972; AVX512BW-NEXT:    vmovdqa (%rdx), %ymm14
10973; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
10974; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
10975; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm20
10976; AVX512BW-NEXT:    vpermt2d %zmm22, %zmm0, %zmm20
10977; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26]
10978; AVX512BW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
10979; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm23
10980; AVX512BW-NEXT:    vpermt2d %zmm22, %zmm6, %zmm23
10981; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28]
10982; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm5
10983; AVX512BW-NEXT:    vpermt2d %zmm22, %zmm2, %zmm5
10984; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10985; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm25
10986; AVX512BW-NEXT:    vpermt2d %zmm18, %zmm0, %zmm25
10987; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm26
10988; AVX512BW-NEXT:    vpermt2d %zmm18, %zmm6, %zmm26
10989; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm5
10990; AVX512BW-NEXT:    vpermt2d %zmm18, %zmm2, %zmm5
10991; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10992; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm5
10993; AVX512BW-NEXT:    vpermt2d %zmm16, %zmm0, %zmm5
10994; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10995; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm5
10996; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm27
10997; AVX512BW-NEXT:    vpermt2d %zmm16, %zmm6, %zmm27
10998; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm0
10999; AVX512BW-NEXT:    vpermt2d %zmm16, %zmm2, %zmm0
11000; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11001; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22]
11002; AVX512BW-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
11003; AVX512BW-NEXT:    vmovdqa64 %zmm24, %zmm19
11004; AVX512BW-NEXT:    vpermt2d %zmm22, %zmm21, %zmm19
11005; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30]
11006; AVX512BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
11007; AVX512BW-NEXT:    vpermt2d %zmm22, %zmm0, %zmm24
11008; AVX512BW-NEXT:    vmovdqa64 %zmm17, %zmm22
11009; AVX512BW-NEXT:    vpermt2d %zmm18, %zmm21, %zmm22
11010; AVX512BW-NEXT:    vpermt2d %zmm18, %zmm0, %zmm17
11011; AVX512BW-NEXT:    vmovdqa64 %zmm28, %zmm18
11012; AVX512BW-NEXT:    vpermt2d %zmm16, %zmm21, %zmm18
11013; AVX512BW-NEXT:    vpermt2d %zmm16, %zmm0, %zmm28
11014; AVX512BW-NEXT:    vmovdqa 64(%rdx), %ymm1
11015; AVX512BW-NEXT:    vpermi2d %zmm13, %zmm15, %zmm5
11016; AVX512BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11017; AVX512BW-NEXT:    vpermi2d %zmm13, %zmm15, %zmm6
11018; AVX512BW-NEXT:    vmovdqu64 %zmm6, (%rsp) # 64-byte Spill
11019; AVX512BW-NEXT:    vpermi2d %zmm13, %zmm15, %zmm2
11020; AVX512BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11021; AVX512BW-NEXT:    vpermi2d %zmm13, %zmm15, %zmm21
11022; AVX512BW-NEXT:    vpermt2d %zmm13, %zmm0, %zmm15
11023; AVX512BW-NEXT:    vmovdqa 128(%rdx), %ymm0
11024; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [3,11,0,8,7,15,4,12]
11025; AVX512BW-NEXT:    vpermt2d (%rcx), %ymm2, %ymm14
11026; AVX512BW-NEXT:    movb $36, %al
11027; AVX512BW-NEXT:    kmovd %eax, %k1
11028; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm3 {%k1} = zmm14[0,1,0,1,2,3,6,7]
11029; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm13
11030; AVX512BW-NEXT:    vpermt2d %zmm8, %zmm29, %zmm13
11031; AVX512BW-NEXT:    vpermt2d 64(%rcx), %ymm2, %ymm1
11032; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm13 {%k1} = zmm1[0,1,0,1,2,3,6,7]
11033; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm14
11034; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm1
11035; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm29, %zmm14
11036; AVX512BW-NEXT:    vpermt2d 128(%rcx), %ymm2, %ymm0
11037; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm0[0,1,0,1,2,3,6,7]
11038; AVX512BW-NEXT:    vmovdqa 192(%rdx), %ymm0
11039; AVX512BW-NEXT:    vpermt2d 192(%rcx), %ymm2, %ymm0
11040; AVX512BW-NEXT:    vmovdqa64 192(%rdi), %zmm12
11041; AVX512BW-NEXT:    vmovdqa64 192(%rsi), %zmm7
11042; AVX512BW-NEXT:    vpermi2d %zmm7, %zmm12, %zmm29
11043; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm0[0,1,0,1,2,3,6,7]
11044; AVX512BW-NEXT:    vmovdqa64 (%r8), %zmm4
11045; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15]
11046; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm0, %zmm3
11047; AVX512BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11048; AVX512BW-NEXT:    vmovdqa64 64(%r8), %zmm3
11049; AVX512BW-NEXT:    vpermt2d %zmm3, %zmm0, %zmm13
11050; AVX512BW-NEXT:    vmovdqa64 128(%r8), %zmm2
11051; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm14
11052; AVX512BW-NEXT:    vmovdqa64 192(%r8), %zmm6
11053; AVX512BW-NEXT:    vpermt2d %zmm6, %zmm0, %zmm29
11054; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm16
11055; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm31 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
11056; AVX512BW-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3]
11057; AVX512BW-NEXT:    vpermt2d %zmm10, %zmm31, %zmm16
11058; AVX512BW-NEXT:    movb $-110, %al
11059; AVX512BW-NEXT:    kmovd %eax, %k2
11060; AVX512BW-NEXT:    vmovdqa64 %zmm20, %zmm16 {%k2}
11061; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm20
11062; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
11063; AVX512BW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
11064; AVX512BW-NEXT:    vpermt2d %zmm10, %zmm5, %zmm20
11065; AVX512BW-NEXT:    vmovdqa64 %zmm23, %zmm20 {%k2}
11066; AVX512BW-NEXT:    vmovdqa64 %zmm11, %zmm23
11067; AVX512BW-NEXT:    vpunpckhdq {{.*#+}} zmm0 = zmm11[2],zmm10[2],zmm11[3],zmm10[3],zmm11[6],zmm10[6],zmm11[7],zmm10[7],zmm11[10],zmm10[10],zmm11[11],zmm10[11],zmm11[14],zmm10[14],zmm11[15],zmm10[15]
11068; AVX512BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11069; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
11070; AVX512BW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
11071; AVX512BW-NEXT:    vpermt2d %zmm10, %zmm11, %zmm23
11072; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11073; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm23 {%k1}
11074; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm10
11075; AVX512BW-NEXT:    vpermt2d %zmm8, %zmm31, %zmm10
11076; AVX512BW-NEXT:    vmovdqa64 %zmm25, %zmm10 {%k2}
11077; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm25
11078; AVX512BW-NEXT:    vpermt2d %zmm8, %zmm5, %zmm25
11079; AVX512BW-NEXT:    vmovdqa64 %zmm26, %zmm25 {%k2}
11080; AVX512BW-NEXT:    vmovdqa64 %zmm30, %zmm26
11081; AVX512BW-NEXT:    vpunpckhdq {{.*#+}} zmm30 = zmm30[2],zmm8[2],zmm30[3],zmm8[3],zmm30[6],zmm8[6],zmm30[7],zmm8[7],zmm30[10],zmm8[10],zmm30[11],zmm8[11],zmm30[14],zmm8[14],zmm30[15],zmm8[15]
11082; AVX512BW-NEXT:    vpermt2d %zmm8, %zmm11, %zmm26
11083; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11084; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k1}
11085; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm0
11086; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm8
11087; AVX512BW-NEXT:    vpermt2d %zmm1, %zmm31, %zmm8
11088; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
11089; AVX512BW-NEXT:    vmovdqa64 %zmm9, %zmm8 {%k2}
11090; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm9
11091; AVX512BW-NEXT:    vpermt2d %zmm1, %zmm5, %zmm9
11092; AVX512BW-NEXT:    vmovdqa64 %zmm27, %zmm9 {%k2}
11093; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm27
11094; AVX512BW-NEXT:    vpunpckhdq {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
11095; AVX512BW-NEXT:    vpermt2d %zmm1, %zmm11, %zmm27
11096; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
11097; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm27 {%k1}
11098; AVX512BW-NEXT:    vpermi2d %zmm7, %zmm12, %zmm31
11099; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
11100; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm31 {%k2}
11101; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15]
11102; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm1, %zmm16
11103; AVX512BW-NEXT:    vpermt2d %zmm3, %zmm1, %zmm10
11104; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm1, %zmm8
11105; AVX512BW-NEXT:    vpermt2d %zmm6, %zmm1, %zmm31
11106; AVX512BW-NEXT:    vpermi2d %zmm7, %zmm12, %zmm5
11107; AVX512BW-NEXT:    vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
11108; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm5 {%k2}
11109; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15]
11110; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm1, %zmm20
11111; AVX512BW-NEXT:    vpermt2d %zmm3, %zmm1, %zmm25
11112; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm1, %zmm9
11113; AVX512BW-NEXT:    vpermt2d %zmm6, %zmm1, %zmm5
11114; AVX512BW-NEXT:    vpermi2d %zmm7, %zmm12, %zmm11
11115; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
11116; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm11 {%k1}
11117; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15]
11118; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm1, %zmm23
11119; AVX512BW-NEXT:    vpermt2d %zmm3, %zmm1, %zmm26
11120; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm1, %zmm27
11121; AVX512BW-NEXT:    vpermt2d %zmm6, %zmm1, %zmm11
11122; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm1
11123; AVX512BW-NEXT:    vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
11124; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm1[2,3,2,3,2,3,2,3]
11125; AVX512BW-NEXT:    vmovdqa 64(%rdi), %ymm1
11126; AVX512BW-NEXT:    vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
11127; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm1[2,3,2,3,2,3,2,3]
11128; AVX512BW-NEXT:    vmovdqa 128(%rdi), %ymm1
11129; AVX512BW-NEXT:    vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
11130; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm1[2,3,2,3,2,3,2,3]
11131; AVX512BW-NEXT:    vmovdqa 192(%rdi), %ymm1
11132; AVX512BW-NEXT:    vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
11133; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm1[2,3,2,3,2,3,2,3]
11134; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0]
11135; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm1, %zmm19
11136; AVX512BW-NEXT:    vpermt2d %zmm3, %zmm1, %zmm22
11137; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm1, %zmm18
11138; AVX512BW-NEXT:    vpermt2d %zmm6, %zmm1, %zmm21
11139; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
11140; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm1[6,7,6,7,6,7,6,7]
11141; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0]
11142; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm1, %zmm24
11143; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm30[6,7,6,7,6,7,6,7]
11144; AVX512BW-NEXT:    vpermt2d %zmm3, %zmm1, %zmm17
11145; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm28 {%k1} = zmm0[6,7,6,7,6,7,6,7]
11146; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm1, %zmm28
11147; AVX512BW-NEXT:    vmovdqa64 (%r9), %zmm2
11148; AVX512BW-NEXT:    vpunpckhdq {{.*#+}} zmm3 = zmm12[2],zmm7[2],zmm12[3],zmm7[3],zmm12[6],zmm7[6],zmm12[7],zmm7[7],zmm12[10],zmm7[10],zmm12[11],zmm7[11],zmm12[14],zmm7[14],zmm12[15],zmm7[15]
11149; AVX512BW-NEXT:    vmovdqa64 64(%r9), %zmm4
11150; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm3[6,7,6,7,6,7,6,7]
11151; AVX512BW-NEXT:    vmovdqa64 128(%r9), %zmm3
11152; AVX512BW-NEXT:    vpermt2d %zmm6, %zmm1, %zmm15
11153; AVX512BW-NEXT:    vmovdqa64 192(%r9), %zmm1
11154; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15]
11155; AVX512BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
11156; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm6
11157; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm0, %zmm13
11158; AVX512BW-NEXT:    vpermt2d %zmm3, %zmm0, %zmm14
11159; AVX512BW-NEXT:    vpermt2d %zmm1, %zmm0, %zmm29
11160; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15]
11161; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm16
11162; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm0, %zmm10
11163; AVX512BW-NEXT:    vpermt2d %zmm3, %zmm0, %zmm8
11164; AVX512BW-NEXT:    vpermt2d %zmm1, %zmm0, %zmm31
11165; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15]
11166; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm20
11167; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm0, %zmm25
11168; AVX512BW-NEXT:    vpermt2d %zmm3, %zmm0, %zmm9
11169; AVX512BW-NEXT:    vpermt2d %zmm1, %zmm0, %zmm5
11170; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15]
11171; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm23
11172; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm0, %zmm26
11173; AVX512BW-NEXT:    vpermt2d %zmm3, %zmm0, %zmm27
11174; AVX512BW-NEXT:    vpermt2d %zmm1, %zmm0, %zmm11
11175; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23]
11176; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm19
11177; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm0, %zmm22
11178; AVX512BW-NEXT:    vpermt2d %zmm3, %zmm0, %zmm18
11179; AVX512BW-NEXT:    vpermt2d %zmm1, %zmm0, %zmm21
11180; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31]
11181; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm24
11182; AVX512BW-NEXT:    vpermt2d %zmm4, %zmm0, %zmm17
11183; AVX512BW-NEXT:    vpermt2d %zmm3, %zmm0, %zmm28
11184; AVX512BW-NEXT:    vpermt2d %zmm1, %zmm0, %zmm15
11185; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
11186; AVX512BW-NEXT:    vmovdqa64 %zmm15, 1472(%rax)
11187; AVX512BW-NEXT:    vmovdqa64 %zmm11, 1408(%rax)
11188; AVX512BW-NEXT:    vmovdqa64 %zmm5, 1344(%rax)
11189; AVX512BW-NEXT:    vmovdqa64 %zmm31, 1152(%rax)
11190; AVX512BW-NEXT:    vmovdqa64 %zmm28, 1088(%rax)
11191; AVX512BW-NEXT:    vmovdqa64 %zmm27, 1024(%rax)
11192; AVX512BW-NEXT:    vmovdqa64 %zmm9, 960(%rax)
11193; AVX512BW-NEXT:    vmovdqa64 %zmm8, 768(%rax)
11194; AVX512BW-NEXT:    vmovdqa64 %zmm17, 704(%rax)
11195; AVX512BW-NEXT:    vmovdqa64 %zmm26, 640(%rax)
11196; AVX512BW-NEXT:    vmovdqa64 %zmm25, 576(%rax)
11197; AVX512BW-NEXT:    vmovdqa64 %zmm10, 384(%rax)
11198; AVX512BW-NEXT:    vmovdqa64 %zmm24, 320(%rax)
11199; AVX512BW-NEXT:    vmovdqa64 %zmm23, 256(%rax)
11200; AVX512BW-NEXT:    vmovdqa64 %zmm20, 192(%rax)
11201; AVX512BW-NEXT:    vmovdqa64 %zmm16, (%rax)
11202; AVX512BW-NEXT:    vmovdqa64 %zmm21, 1280(%rax)
11203; AVX512BW-NEXT:    vmovdqa64 %zmm29, 1216(%rax)
11204; AVX512BW-NEXT:    vmovdqa64 %zmm18, 896(%rax)
11205; AVX512BW-NEXT:    vmovdqa64 %zmm14, 832(%rax)
11206; AVX512BW-NEXT:    vmovdqa64 %zmm22, 512(%rax)
11207; AVX512BW-NEXT:    vmovdqa64 %zmm13, 448(%rax)
11208; AVX512BW-NEXT:    vmovdqa64 %zmm19, 128(%rax)
11209; AVX512BW-NEXT:    vmovdqa64 %zmm6, 64(%rax)
11210; AVX512BW-NEXT:    addq $456, %rsp # imm = 0x1C8
11211; AVX512BW-NEXT:    vzeroupper
11212; AVX512BW-NEXT:    retq
11213;
; AVX512BW-FCP-LABEL: store_i32_stride6_vf64:
; AVX512BW-FCP:       # %bb.0:
; AVX512BW-FCP-NEXT:    subq $1160, %rsp # imm = 0x488
; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm8
; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm5
; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm3
; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm2
; AVX512BW-FCP-NEXT:    vmovdqa64 (%rsi), %zmm0
; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rsi), %zmm24
; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rsi), %zmm29
; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rsi), %zmm23
; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdx), %zmm4
; AVX512BW-FCP-NEXT:    vmovdqa64 (%rcx), %zmm21
; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
; AVX512BW-FCP-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm6
; AVX512BW-FCP-NEXT:    vpermt2d %zmm21, %zmm20, %zmm6
; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
; AVX512BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm7, %zmm1
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm16
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0]
; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
; AVX512BW-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm10, %zmm1
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm17
; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23]
; AVX512BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm31
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm25
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm1, %zmm31
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm8
; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
; AVX512BW-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm25, %zmm1
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm11, %zmm1
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm27
; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
; AVX512BW-FCP-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm25, %zmm1
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm12, %zmm1
; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm30 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31]
; AVX512BW-FCP-NEXT:    # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm30, %zmm25
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm0
; AVX512BW-FCP-NEXT:    vpermt2d %zmm24, %zmm7, %zmm0
; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm0
; AVX512BW-FCP-NEXT:    vpermt2d %zmm24, %zmm10, %zmm0
; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm0
; AVX512BW-FCP-NEXT:    vpermt2d %zmm24, %zmm8, %zmm0
; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm0
; AVX512BW-FCP-NEXT:    vpermt2d %zmm24, %zmm11, %zmm0
; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm0
; AVX512BW-FCP-NEXT:    vpermt2d %zmm24, %zmm12, %zmm0
; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT:    vpermt2d %zmm24, %zmm30, %zmm5
; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1
; AVX512BW-FCP-NEXT:    vpermt2d %zmm29, %zmm7, %zmm1
; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1
; AVX512BW-FCP-NEXT:    vpermt2d %zmm29, %zmm10, %zmm1
; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1
; AVX512BW-FCP-NEXT:    vpermt2d %zmm29, %zmm8, %zmm1
; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm26
; AVX512BW-FCP-NEXT:    vpermt2d %zmm29, %zmm11, %zmm26
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm28
; AVX512BW-FCP-NEXT:    vpermt2d %zmm29, %zmm12, %zmm28
; AVX512BW-FCP-NEXT:    vpermt2d %zmm29, %zmm30, %zmm0
; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm19
; AVX512BW-FCP-NEXT:    vpermt2d %zmm21, %zmm9, %zmm19
; AVX512BW-FCP-NEXT:    vpermi2d %zmm23, %zmm2, %zmm7
; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT:    vpermi2d %zmm23, %zmm2, %zmm10
; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT:    vpermi2d %zmm23, %zmm2, %zmm8
; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT:    vpermi2d %zmm23, %zmm2, %zmm11
; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT:    vpermi2d %zmm23, %zmm2, %zmm12
; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT:    vpermt2d %zmm23, %zmm30, %zmm2
; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm29 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22]
; AVX512BW-FCP-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm5
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm23
; AVX512BW-FCP-NEXT:    vpermt2d %zmm21, %zmm29, %zmm23
; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26]
; AVX512BW-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm18
; AVX512BW-FCP-NEXT:    vpermt2d %zmm21, %zmm4, %zmm18
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm8 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28]
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm7
; AVX512BW-FCP-NEXT:    vpermt2d %zmm21, %zmm8, %zmm7
; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30]
; AVX512BW-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT:    vpermt2d %zmm21, %zmm2, %zmm5
; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdx), %zmm21
; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rcx), %zmm0
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm21, %zmm22
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm20, %zmm22
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm21, %zmm14
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm24
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm9, %zmm14
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm21, %zmm30
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm29, %zmm30
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm21, %zmm10
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm4, %zmm10
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm21, %zmm6
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm8, %zmm6
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm2, %zmm21
; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rdx), %zmm13
; AVX512BW-FCP-NEXT:    vmovdqa64 128(%rcx), %zmm0
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm15
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm20, %zmm15
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm12
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm9, %zmm12
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm3
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm29, %zmm3
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm11
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm4, %zmm11
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm9
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm8, %zmm9
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm2, %zmm13
; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdx), %zmm1
; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rcx), %zmm0
; AVX512BW-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm20
; AVX512BW-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm24
; AVX512BW-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm29
; AVX512BW-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm4
; AVX512BW-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm8
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm2, %zmm1
; AVX512BW-FCP-NEXT:    movb $-110, %al
; AVX512BW-FCP-NEXT:    kmovd %eax, %k2
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm16 {%k2}
; AVX512BW-FCP-NEXT:    movb $36, %al
; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm19, %zmm17 {%k1}
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm31, %zmm23 {%k1}
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm18, %zmm27 {%k2}
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm19 {%k1}
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm25, %zmm5 {%k1}
; AVX512BW-FCP-NEXT:    vmovdqa64 (%r8), %zmm0
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15]
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm2, %zmm16
; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm16, (%rsp) # 64-byte Spill
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15]
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm7, %zmm17
; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0]
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm18, %zmm23
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm25 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15]
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm25, %zmm27
; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm17 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15]
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm17, %zmm19
; AVX512BW-FCP-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm31 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0]
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm31, %zmm5
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm27 {%k2}
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm22 {%k1}
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm30 {%k1}
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm16 {%k2}
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm10 {%k1}
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
; AVX512BW-FCP-NEXT:    vmovdqa64 64(%r8), %zmm0
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm2, %zmm27
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm7, %zmm22
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm18, %zmm30
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm25, %zmm16
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm16, %zmm19
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm17, %zmm10
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm14
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm17
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm31, %zmm21
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm16 {%k2}
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm15 {%k1}
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k1}
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm26 {%k2}
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm28 {%k1}
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k1}
; AVX512BW-FCP-NEXT:    vmovdqa64 128(%r8), %zmm0
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm2, %zmm16
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm7, %zmm15
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm18, %zmm3
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm25, %zmm26
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm14, %zmm28
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm31, %zmm13
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm20, %zmm6 {%k2}
; AVX512BW-FCP-NEXT:    vmovdqa64 192(%r8), %zmm0
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm2, %zmm6
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm9
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm24, %zmm2 {%k1}
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm7, %zmm2
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm10
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm29 {%k1}
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm18, %zmm29
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k2}
; AVX512BW-FCP-NEXT:    vmovdqa64 (%r9), %zmm2
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm25, %zmm6
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm12
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15]
; AVX512BW-FCP-NEXT:    vmovdqu64 (%rsp), %zmm18 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm4, %zmm18
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm6 {%k1}
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm11 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15]
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm11, %zmm20
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm14, %zmm6
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm14
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm6 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23]
; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm6, %zmm23
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm1 {%k1}
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15]
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm7, %zmm24
; AVX512BW-FCP-NEXT:    vpermt2d %zmm0, %zmm31, %zmm1
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15]
; AVX512BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm25
; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm8 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31]
; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm8, %zmm5
; AVX512BW-FCP-NEXT:    vmovdqa64 64(%r9), %zmm2
; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm4, %zmm27
; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm11, %zmm22
; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm6, %zmm30
; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm7, %zmm19
; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm17
; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm8, %zmm21
; AVX512BW-FCP-NEXT:    vmovdqa64 128(%r9), %zmm2
; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm4, %zmm16
; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm11, %zmm15
; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm6, %zmm3
; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm7, %zmm26
; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm28
; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm8, %zmm13
; AVX512BW-FCP-NEXT:    vmovdqa64 192(%r9), %zmm2
; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm4, %zmm9
; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm11, %zmm10
; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm6, %zmm29
; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm7, %zmm12
; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm14
; AVX512BW-FCP-NEXT:    vpermt2d %zmm2, %zmm8, %zmm1
; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, 1472(%rax)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, 1408(%rax)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm12, 1344(%rax)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm29, 1280(%rax)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, 1216(%rax)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, 1152(%rax)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm13, 1088(%rax)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm28, 1024(%rax)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm26, 960(%rax)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, 896(%rax)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, 832(%rax)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm16, 768(%rax)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm21, 704(%rax)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, 640(%rax)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm19, 576(%rax)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm30, 512(%rax)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm22, 448(%rax)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm27, 384(%rax)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, 320(%rax)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm25, 256(%rax)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm24, 192(%rax)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm23, 128(%rax)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm20, 64(%rax)
; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm18, (%rax)
; AVX512BW-FCP-NEXT:    addq $1160, %rsp # imm = 0x488
; AVX512BW-FCP-NEXT:    vzeroupper
; AVX512BW-FCP-NEXT:    retq
;
; AVX512DQ-BW-LABEL: store_i32_stride6_vf64:
; AVX512DQ-BW:       # %bb.0:
; AVX512DQ-BW-NEXT:    subq $456, %rsp # imm = 0x1C8
; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rdi), %zmm9
; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdi), %zmm30
; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %zmm11
; AVX512DQ-BW-NEXT:    vmovdqa64 (%rsi), %zmm10
; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rsi), %zmm8
; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rsi), %zmm4
; AVX512DQ-BW-NEXT:    vmovdqa64 192(%rdx), %zmm15
; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rdx), %zmm28
; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rdx), %zmm17
; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdx), %zmm24
; AVX512DQ-BW-NEXT:    vmovdqa64 (%rcx), %zmm22
; AVX512DQ-BW-NEXT:    vmovdqa64 64(%rcx), %zmm18
; AVX512DQ-BW-NEXT:    vmovdqa64 128(%rcx), %zmm16
; AVX512DQ-BW-NEXT:    vmovdqa64 192(%rcx), %zmm13
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm29 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
; AVX512DQ-BW-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm11, %zmm0
; AVX512DQ-BW-NEXT:    vpermt2d %zmm10, %zmm29, %zmm0
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm3
; AVX512DQ-BW-NEXT:    vmovdqa (%rdx), %ymm14
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
; AVX512DQ-BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm24, %zmm20
; AVX512DQ-BW-NEXT:    vpermt2d %zmm22, %zmm0, %zmm20
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26]
; AVX512DQ-BW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm24, %zmm23
; AVX512DQ-BW-NEXT:    vpermt2d %zmm22, %zmm6, %zmm23
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28]
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm24, %zmm5
; AVX512DQ-BW-NEXT:    vpermt2d %zmm22, %zmm2, %zmm5
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm17, %zmm25
; AVX512DQ-BW-NEXT:    vpermt2d %zmm18, %zmm0, %zmm25
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm17, %zmm26
; AVX512DQ-BW-NEXT:    vpermt2d %zmm18, %zmm6, %zmm26
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm17, %zmm5
; AVX512DQ-BW-NEXT:    vpermt2d %zmm18, %zmm2, %zmm5
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm28, %zmm5
; AVX512DQ-BW-NEXT:    vpermt2d %zmm16, %zmm0, %zmm5
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm5
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm28, %zmm27
; AVX512DQ-BW-NEXT:    vpermt2d %zmm16, %zmm6, %zmm27
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm28, %zmm0
; AVX512DQ-BW-NEXT:    vpermt2d %zmm16, %zmm2, %zmm0
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm21 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22]
; AVX512DQ-BW-NEXT:    # zmm21 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm24, %zmm19
; AVX512DQ-BW-NEXT:    vpermt2d %zmm22, %zmm21, %zmm19
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30]
; AVX512DQ-BW-NEXT:    # zmm0 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vpermt2d %zmm22, %zmm0, %zmm24
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm17, %zmm22
; AVX512DQ-BW-NEXT:    vpermt2d %zmm18, %zmm21, %zmm22
; AVX512DQ-BW-NEXT:    vpermt2d %zmm18, %zmm0, %zmm17
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm28, %zmm18
; AVX512DQ-BW-NEXT:    vpermt2d %zmm16, %zmm21, %zmm18
; AVX512DQ-BW-NEXT:    vpermt2d %zmm16, %zmm0, %zmm28
; AVX512DQ-BW-NEXT:    vmovdqa 64(%rdx), %ymm1
; AVX512DQ-BW-NEXT:    vpermi2d %zmm13, %zmm15, %zmm5
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vpermi2d %zmm13, %zmm15, %zmm6
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm6, (%rsp) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vpermi2d %zmm13, %zmm15, %zmm2
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vpermi2d %zmm13, %zmm15, %zmm21
; AVX512DQ-BW-NEXT:    vpermt2d %zmm13, %zmm0, %zmm15
; AVX512DQ-BW-NEXT:    vmovdqa 128(%rdx), %ymm0
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [3,11,0,8,7,15,4,12]
; AVX512DQ-BW-NEXT:    vpermt2d (%rcx), %ymm2, %ymm14
; AVX512DQ-BW-NEXT:    movb $36, %al
; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm3 {%k1} = zmm14[0,1,0,1,2,3,6,7]
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm30, %zmm13
; AVX512DQ-BW-NEXT:    vpermt2d %zmm8, %zmm29, %zmm13
; AVX512DQ-BW-NEXT:    vpermt2d 64(%rcx), %ymm2, %ymm1
; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm13 {%k1} = zmm1[0,1,0,1,2,3,6,7]
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, %zmm14
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm4, %zmm1
; AVX512DQ-BW-NEXT:    vpermt2d %zmm4, %zmm29, %zmm14
; AVX512DQ-BW-NEXT:    vpermt2d 128(%rcx), %ymm2, %ymm0
; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm0[0,1,0,1,2,3,6,7]
; AVX512DQ-BW-NEXT:    vmovdqa 192(%rdx), %ymm0
; AVX512DQ-BW-NEXT:    vpermt2d 192(%rcx), %ymm2, %ymm0
; AVX512DQ-BW-NEXT:    vmovdqa64 192(%rdi), %zmm12
; AVX512DQ-BW-NEXT:    vmovdqa64 192(%rsi), %zmm7
; AVX512DQ-BW-NEXT:    vpermi2d %zmm7, %zmm12, %zmm29
; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm0[0,1,0,1,2,3,6,7]
; AVX512DQ-BW-NEXT:    vmovdqa64 (%r8), %zmm4
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15]
; AVX512DQ-BW-NEXT:    vpermt2d %zmm4, %zmm0, %zmm3
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vmovdqa64 64(%r8), %zmm3
; AVX512DQ-BW-NEXT:    vpermt2d %zmm3, %zmm0, %zmm13
; AVX512DQ-BW-NEXT:    vmovdqa64 128(%r8), %zmm2
; AVX512DQ-BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm14
; AVX512DQ-BW-NEXT:    vmovdqa64 192(%r8), %zmm6
; AVX512DQ-BW-NEXT:    vpermt2d %zmm6, %zmm0, %zmm29
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm11, %zmm16
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm31 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
; AVX512DQ-BW-NEXT:    # zmm31 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vpermt2d %zmm10, %zmm31, %zmm16
; AVX512DQ-BW-NEXT:    movb $-110, %al
; AVX512DQ-BW-NEXT:    kmovd %eax, %k2
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm20, %zmm16 {%k2}
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm11, %zmm20
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
; AVX512DQ-BW-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vpermt2d %zmm10, %zmm5, %zmm20
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm23, %zmm20 {%k2}
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm11, %zmm23
; AVX512DQ-BW-NEXT:    vpunpckhdq {{.*#+}} zmm0 = zmm11[2],zmm10[2],zmm11[3],zmm10[3],zmm11[6],zmm10[6],zmm11[7],zmm10[7],zmm11[10],zmm10[10],zmm11[11],zmm10[11],zmm11[14],zmm10[14],zmm11[15],zmm10[15]
; AVX512DQ-BW-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
; AVX512DQ-BW-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT:    vpermt2d %zmm10, %zmm11, %zmm23
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm23 {%k1}
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm30, %zmm10
; AVX512DQ-BW-NEXT:    vpermt2d %zmm8, %zmm31, %zmm10
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm25, %zmm10 {%k2}
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm30, %zmm25
; AVX512DQ-BW-NEXT:    vpermt2d %zmm8, %zmm5, %zmm25
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm26, %zmm25 {%k2}
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm30, %zmm26
; AVX512DQ-BW-NEXT:    vpunpckhdq {{.*#+}} zmm30 = zmm30[2],zmm8[2],zmm30[3],zmm8[3],zmm30[6],zmm8[6],zmm30[7],zmm8[7],zmm30[10],zmm8[10],zmm30[11],zmm8[11],zmm30[14],zmm8[14],zmm30[15],zmm8[15]
; AVX512DQ-BW-NEXT:    vpermt2d %zmm8, %zmm11, %zmm26
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm26 {%k1}
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, %zmm0
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, %zmm8
; AVX512DQ-BW-NEXT:    vpermt2d %zmm1, %zmm31, %zmm8
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, %zmm8 {%k2}
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm9
; AVX512DQ-BW-NEXT:    vpermt2d %zmm1, %zmm5, %zmm9
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm27, %zmm9 {%k2}
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, %zmm27
; AVX512DQ-BW-NEXT:    vpunpckhdq {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; AVX512DQ-BW-NEXT:    vpermt2d %zmm1, %zmm11, %zmm27
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm27 {%k1}
; AVX512DQ-BW-NEXT:    vpermi2d %zmm7, %zmm12, %zmm31
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm31 {%k2}
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15]
; AVX512DQ-BW-NEXT:    vpermt2d %zmm4, %zmm1, %zmm16
; AVX512DQ-BW-NEXT:    vpermt2d %zmm3, %zmm1, %zmm10
; AVX512DQ-BW-NEXT:    vpermt2d %zmm2, %zmm1, %zmm8
; AVX512DQ-BW-NEXT:    vpermt2d %zmm6, %zmm1, %zmm31
; AVX512DQ-BW-NEXT:    vpermi2d %zmm7, %zmm12, %zmm5
; AVX512DQ-BW-NEXT:    vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm5 {%k2}
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15]
; AVX512DQ-BW-NEXT:    vpermt2d %zmm4, %zmm1, %zmm20
; AVX512DQ-BW-NEXT:    vpermt2d %zmm3, %zmm1, %zmm25
; AVX512DQ-BW-NEXT:    vpermt2d %zmm2, %zmm1, %zmm9
; AVX512DQ-BW-NEXT:    vpermt2d %zmm6, %zmm1, %zmm5
; AVX512DQ-BW-NEXT:    vpermi2d %zmm7, %zmm12, %zmm11
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm1, %zmm11 {%k1}
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15]
; AVX512DQ-BW-NEXT:    vpermt2d %zmm4, %zmm1, %zmm23
; AVX512DQ-BW-NEXT:    vpermt2d %zmm3, %zmm1, %zmm26
; AVX512DQ-BW-NEXT:    vpermt2d %zmm2, %zmm1, %zmm27
; AVX512DQ-BW-NEXT:    vpermt2d %zmm6, %zmm1, %zmm11
; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %ymm1
; AVX512DQ-BW-NEXT:    vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm1[2,3,2,3,2,3,2,3]
; AVX512DQ-BW-NEXT:    vmovdqa 64(%rdi), %ymm1
; AVX512DQ-BW-NEXT:    vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm1[2,3,2,3,2,3,2,3]
; AVX512DQ-BW-NEXT:    vmovdqa 128(%rdi), %ymm1
; AVX512DQ-BW-NEXT:    vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm1[2,3,2,3,2,3,2,3]
; AVX512DQ-BW-NEXT:    vmovdqa 192(%rdi), %ymm1
; AVX512DQ-BW-NEXT:    vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm1[2,3,2,3,2,3,2,3]
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0]
; AVX512DQ-BW-NEXT:    vpermt2d %zmm4, %zmm1, %zmm19
; AVX512DQ-BW-NEXT:    vpermt2d %zmm3, %zmm1, %zmm22
; AVX512DQ-BW-NEXT:    vpermt2d %zmm2, %zmm1, %zmm18
; AVX512DQ-BW-NEXT:    vpermt2d %zmm6, %zmm1, %zmm21
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm1[6,7,6,7,6,7,6,7]
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0]
; AVX512DQ-BW-NEXT:    vpermt2d %zmm4, %zmm1, %zmm24
; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm30[6,7,6,7,6,7,6,7]
; AVX512DQ-BW-NEXT:    vpermt2d %zmm3, %zmm1, %zmm17
; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm28 {%k1} = zmm0[6,7,6,7,6,7,6,7]
; AVX512DQ-BW-NEXT:    vpermt2d %zmm2, %zmm1, %zmm28
; AVX512DQ-BW-NEXT:    vmovdqa64 (%r9), %zmm2
; AVX512DQ-BW-NEXT:    vpunpckhdq {{.*#+}} zmm3 = zmm12[2],zmm7[2],zmm12[3],zmm7[3],zmm12[6],zmm7[6],zmm12[7],zmm7[7],zmm12[10],zmm7[10],zmm12[11],zmm7[11],zmm12[14],zmm7[14],zmm12[15],zmm7[15]
; AVX512DQ-BW-NEXT:    vmovdqa64 64(%r9), %zmm4
; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm3[6,7,6,7,6,7,6,7]
; AVX512DQ-BW-NEXT:    vmovdqa64 128(%r9), %zmm3
; AVX512DQ-BW-NEXT:    vpermt2d %zmm6, %zmm1, %zmm15
; AVX512DQ-BW-NEXT:    vmovdqa64 192(%r9), %zmm1
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15]
; AVX512DQ-BW-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512DQ-BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm6
; AVX512DQ-BW-NEXT:    vpermt2d %zmm4, %zmm0, %zmm13
; AVX512DQ-BW-NEXT:    vpermt2d %zmm3, %zmm0, %zmm14
; AVX512DQ-BW-NEXT:    vpermt2d %zmm1, %zmm0, %zmm29
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15]
; AVX512DQ-BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm16
; AVX512DQ-BW-NEXT:    vpermt2d %zmm4, %zmm0, %zmm10
; AVX512DQ-BW-NEXT:    vpermt2d %zmm3, %zmm0, %zmm8
; AVX512DQ-BW-NEXT:    vpermt2d %zmm1, %zmm0, %zmm31
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15]
; AVX512DQ-BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm20
; AVX512DQ-BW-NEXT:    vpermt2d %zmm4, %zmm0, %zmm25
; AVX512DQ-BW-NEXT:    vpermt2d %zmm3, %zmm0, %zmm9
; AVX512DQ-BW-NEXT:    vpermt2d %zmm1, %zmm0, %zmm5
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15]
; AVX512DQ-BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm23
; AVX512DQ-BW-NEXT:    vpermt2d %zmm4, %zmm0, %zmm26
; AVX512DQ-BW-NEXT:    vpermt2d %zmm3, %zmm0, %zmm27
; AVX512DQ-BW-NEXT:    vpermt2d %zmm1, %zmm0, %zmm11
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23]
; AVX512DQ-BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm19
; AVX512DQ-BW-NEXT:    vpermt2d %zmm4, %zmm0, %zmm22
; AVX512DQ-BW-NEXT:    vpermt2d %zmm3, %zmm0, %zmm18
; AVX512DQ-BW-NEXT:    vpermt2d %zmm1, %zmm0, %zmm21
; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31]
; AVX512DQ-BW-NEXT:    vpermt2d %zmm2, %zmm0, %zmm24
; AVX512DQ-BW-NEXT:    vpermt2d %zmm4, %zmm0, %zmm17
; AVX512DQ-BW-NEXT:    vpermt2d %zmm3, %zmm0, %zmm28
; AVX512DQ-BW-NEXT:    vpermt2d %zmm1, %zmm0, %zmm15
; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm15, 1472(%rax)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm11, 1408(%rax)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, 1344(%rax)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm31, 1152(%rax)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm28, 1088(%rax)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm27, 1024(%rax)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, 960(%rax)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm8, 768(%rax)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm17, 704(%rax)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm26, 640(%rax)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm25, 576(%rax)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm10, 384(%rax)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm24, 320(%rax)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm23, 256(%rax)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm20, 192(%rax)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm16, (%rax)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm21, 1280(%rax)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm29, 1216(%rax)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm18, 896(%rax)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm14, 832(%rax)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm22, 512(%rax)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm13, 448(%rax)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm19, 128(%rax)
; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, 64(%rax)
; AVX512DQ-BW-NEXT:    addq $456, %rsp # imm = 0x1C8
; AVX512DQ-BW-NEXT:    vzeroupper
; AVX512DQ-BW-NEXT:    retq
;
; AVX512DQ-BW-FCP-LABEL: store_i32_stride6_vf64:
; AVX512DQ-BW-FCP:       # %bb.0:
; AVX512DQ-BW-FCP-NEXT:    subq $1160, %rsp # imm = 0x488
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %zmm8
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm5
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdi), %zmm3
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %zmm2
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rsi), %zmm0
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rsi), %zmm24
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rsi), %zmm29
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rsi), %zmm23
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdx), %zmm4
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rcx), %zmm21
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm20 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
; AVX512DQ-BW-FCP-NEXT:    # zmm20 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm6
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm21, %zmm20, %zmm6
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
; AVX512DQ-BW-FCP-NEXT:    # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm7, %zmm1
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm16
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm10 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
; AVX512DQ-BW-FCP-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm1
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm10, %zmm1
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm17
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23]
; AVX512DQ-BW-FCP-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm31
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm25
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm1, %zmm31
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm8
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
; AVX512DQ-BW-FCP-NEXT:    # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm25, %zmm1
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm11, %zmm1
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, %zmm27
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
; AVX512DQ-BW-FCP-NEXT:    # zmm12 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm25, %zmm1
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm12, %zmm1
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm30 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31]
; AVX512DQ-BW-FCP-NEXT:    # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm30, %zmm25
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm0
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm24, %zmm7, %zmm0
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm0
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm24, %zmm10, %zmm0
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm0
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm24, %zmm8, %zmm0
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm0
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm24, %zmm11, %zmm0
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm0
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm24, %zmm12, %zmm0
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm24, %zmm30, %zmm5
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm29, %zmm7, %zmm1
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm29, %zmm10, %zmm1
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm29, %zmm8, %zmm1
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm26
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm29, %zmm11, %zmm26
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, %zmm28
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm29, %zmm12, %zmm28
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm29, %zmm30, %zmm0
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm19
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm21, %zmm9, %zmm19
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm23, %zmm2, %zmm7
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm23, %zmm2, %zmm10
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm23, %zmm2, %zmm8
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm23, %zmm2, %zmm11
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm23, %zmm2, %zmm12
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm23, %zmm30, %zmm2
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm29 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22]
; AVX512DQ-BW-FCP-NEXT:    # zmm29 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm5
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm23
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm21, %zmm29, %zmm23
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26]
; AVX512DQ-BW-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm18
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm21, %zmm4, %zmm18
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm8 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28]
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, %zmm7
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm21, %zmm8, %zmm7
; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30]
; AVX512DQ-BW-FCP-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm21, %zmm2, %zmm5
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdx), %zmm21
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rcx), %zmm0
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm21, %zmm22
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm20, %zmm22
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm21, %zmm14
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm24
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm9, %zmm14
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm21, %zmm30
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm29, %zmm30
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm21, %zmm10
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm4, %zmm10
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm21, %zmm6
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm8, %zmm6
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm2, %zmm21
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rdx), %zmm13
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%rcx), %zmm0
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm15
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm20, %zmm15
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm12
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm9, %zmm12
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm3
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm29, %zmm3
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm11
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm4, %zmm11
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm13, %zmm9
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm8, %zmm9
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm2, %zmm13
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdx), %zmm1
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rcx), %zmm0
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm20
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm24
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm29
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm4
; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm0, %zmm1, %zmm8
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm2, %zmm1
; AVX512DQ-BW-FCP-NEXT:    movb $-110, %al
; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k2
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm16 {%k2}
; AVX512DQ-BW-FCP-NEXT:    movb $36, %al
; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm19, %zmm17 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm31, %zmm23 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm18, %zmm27 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm19 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm25, %zmm5 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%r8), %zmm0
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,1,2,3,16,0,6,7,8,9,17,0,12,13,14,15]
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm2, %zmm16
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm16, (%rsp) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [18,0,2,3,4,5,19,0,8,9,10,11,20,0,14,15]
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm7, %zmm17
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [0,1,21,0,4,5,6,7,22,0,10,11,12,13,23,0]
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm18, %zmm23
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm25 = [0,1,2,3,24,0,6,7,8,9,25,0,12,13,14,15]
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm25, %zmm27
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm17 = [26,0,2,3,4,5,27,0,8,9,10,11,28,0,14,15]
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm17, %zmm19
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm31 = [0,1,29,0,4,5,6,7,30,0,10,11,12,13,31,0]
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm31, %zmm5
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm22, %zmm27 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, %zmm22 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm30 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm16 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm10 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm21 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%r8), %zmm0
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm2, %zmm27
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm7, %zmm22
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm18, %zmm30
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm25, %zmm16
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm16, %zmm19
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm17, %zmm10
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, %zmm14
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, %zmm17
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm31, %zmm21
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, %zmm16 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm12, %zmm15 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm3 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm11, %zmm26 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, %zmm28 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%r8), %zmm0
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm2, %zmm16
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm7, %zmm15
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm18, %zmm3
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm25, %zmm26
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm14, %zmm28
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm31, %zmm13
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm20, %zmm6 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%r8), %zmm0
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm2, %zmm6
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm9
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm24, %zmm2 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm7, %zmm2
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm10
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, %zmm29 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm18, %zmm29
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, %zmm6 {%k2}
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%r9), %zmm2
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm25, %zmm6
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm12
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15]
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 (%rsp), %zmm18 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm4, %zmm18
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, %zmm6 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm11 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15]
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm11, %zmm20
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm14, %zmm6
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, %zmm14
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm6 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23]
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm6, %zmm23
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, %zmm1 {%k1}
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15]
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm7, %zmm24
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm0, %zmm31, %zmm1
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15]
; AVX512DQ-BW-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm25
; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm8 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31]
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm8, %zmm5
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%r9), %zmm2
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm4, %zmm27
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm11, %zmm22
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm6, %zmm30
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm7, %zmm19
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm17
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm8, %zmm21
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 128(%r9), %zmm2
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm4, %zmm16
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm11, %zmm15
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm6, %zmm3
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm7, %zmm26
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm28
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm8, %zmm13
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%r9), %zmm2
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm4, %zmm9
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm11, %zmm10
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm6, %zmm29
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm7, %zmm12
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm0, %zmm14
; AVX512DQ-BW-FCP-NEXT:    vpermt2d %zmm2, %zmm8, %zmm1
; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, 1472(%rax)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, 1408(%rax)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm12, 1344(%rax)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm29, 1280(%rax)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, 1216(%rax)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, 1152(%rax)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm13, 1088(%rax)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm28, 1024(%rax)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm26, 960(%rax)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, 896(%rax)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, 832(%rax)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm16, 768(%rax)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm21, 704(%rax)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, 640(%rax)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm19, 576(%rax)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm30, 512(%rax)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm22, 448(%rax)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm27, 384(%rax)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, 320(%rax)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm25, 256(%rax)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm24, 192(%rax)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm23, 128(%rax)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm20, 64(%rax)
; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm18, (%rax)
; AVX512DQ-BW-FCP-NEXT:    addq $1160, %rsp # imm = 0x488
; AVX512DQ-BW-FCP-NEXT:    vzeroupper
; AVX512DQ-BW-FCP-NEXT:    retq
12083  %in.vec0 = load <64 x i32>, ptr %in.vecptr0, align 64
12084  %in.vec1 = load <64 x i32>, ptr %in.vecptr1, align 64
12085  %in.vec2 = load <64 x i32>, ptr %in.vecptr2, align 64
12086  %in.vec3 = load <64 x i32>, ptr %in.vecptr3, align 64
12087  %in.vec4 = load <64 x i32>, ptr %in.vecptr4, align 64
12088  %in.vec5 = load <64 x i32>, ptr %in.vecptr5, align 64
%1 = shufflevector <64 x i32> %in.vec0, <64 x i32> %in.vec1, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
%2 = shufflevector <64 x i32> %in.vec2, <64 x i32> %in.vec3, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
%3 = shufflevector <64 x i32> %in.vec4, <64 x i32> %in.vec5, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
%4 = shufflevector <128 x i32> %1, <128 x i32> %2, <256 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255>
%5 = shufflevector <128 x i32> %3, <128 x i32> poison, <256 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%6 = shufflevector <256 x i32> %4, <256 x i32> %5, <384 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255, i32 256, i32 257, i32 258, i32 259, i32 260, i32 261, i32 262, i32 263, i32 264, i32 265, i32 266, i32 267, i32 268, i32 269, i32 270, i32 271, i32 272, i32 273, i32 274, i32 275, i32 276, i32 277, i32 278, i32 279, i32 280, i32 281, i32 282, i32 283, i32 284, i32 285, i32 286, i32 287, i32 288, i32 289, i32 290, i32 291, i32 292, i32 293, i32 294, i32 295, i32 296, i32 297, i32 298, i32 299, i32 300, i32 301, i32 302, i32 303, i32 304, i32 305, i32 306, i32 307, i32 308, i32 309, i32 310, i32 311, i32 312, i32 313, i32 314, i32 315, i32 316, i32 317, i32 318, i32 319, i32 320, i32 321, i32 322, i32 323, i32 324, i32 325, i32 326, i32 327, i32 328, i32 329, i32 330, i32 331, i32 332, i32 333, i32 334, i32 335, i32 336, i32 337, i32 338, i32 339, i32 340, i32 341, i32 342, i32 343, i32 344, i32 345, i32 346, i32 347, i32 348, i32 349, i32 350, i32 351, i32 352, i32 353, i32 354, i32 355, i32 356, i32 357, i32 358, i32 359, i32 360, i32 361, i32 362, i32 363, i32 364, i32 365, i32 366, i32 367, i32 368, i32 369, i32 370, i32 371, i32 372, i32 373, i32 374, i32 375, i32 376, i32 377, i32 378, i32 379, i32 380, i32 381, i32 382, i32 383>
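; Interleave with stride 6: result element 6*k+j is element k of input vector j.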
%interleaved.vec = shufflevector <384 x i32> %6, <384 x i32> poison, <384 x i32> <i32 0, i32 64, i32 128, i32 192, i32 256, i32 320, i32 1, i32 65, i32 129, i32 193, i32 257, i32 321, i32 2, i32 66, i32 130, i32 194, i32 258, i32 322, i32 3, i32 67, i32 131, i32 195, i32 259, i32 323, i32 4, i32 68, i32 132, i32 196, i32 260, i32 324, i32 5, i32 69, i32 133, i32 197, i32 261, i32 325, i32 6, i32 70, i32 134, i32 198, i32 262, i32 326, i32 7, i32 71, i32 135, i32 199, i32 263, i32 327, i32 8, i32 72, i32 136, i32 200, i32 264, i32 328, i32 9, i32 73, i32 137, i32 201, i32 265, i32 329, i32 10, i32 74, i32 138, i32 202, i32 266, i32 330, i32 11, i32 75, i32 139, i32 203, i32 267, i32 331, i32 12, i32 76, i32 140, i32 204, i32 268, i32 332, i32 13, i32 77, i32 141, i32 205, i32 269, i32 333, i32 14, i32 78, i32 142, i32 206, i32 270, i32 334, i32 15, i32 79, i32 143, i32 207, i32 271, i32 335, i32 16, i32 80, i32 144, i32 208, i32 272, i32 336, i32 17, i32 81, i32 145, i32 209, i32 273, i32 337, i32 18, i32 82, i32 146, i32 210, i32 274, i32 338, i32 19, i32 83, i32 147, i32 211, i32 275, i32 339, i32 20, i32 84, i32 148, i32 212, i32 276, i32 340, i32 21, i32 85, i32 149, i32 213, i32 277, i32 341, i32 22, i32 86, i32 150, i32 214, i32 278, i32 342, i32 23, i32 87, i32 151, i32 215, i32 279, i32 343, i32 24, i32 88, i32 152, i32 216, i32 280, i32 344, i32 25, i32 89, i32 153, i32 217, i32 281, i32 345, i32 26, i32 90, i32 154, i32 218, i32 282, i32 346, i32 27, i32 91, i32 155, i32 219, i32 283, i32 347, i32 28, i32 92, i32 156, i32 220, i32 284, i32 348, i32 29, i32 93, i32 157, i32 221, i32 285, i32 349, i32 30, i32 94, i32 158, i32 222, i32 286, i32 350, i32 31, i32 95, i32 159, i32 223, i32 287, i32 351, i32 32, i32 96, i32 160, i32 224, i32 288, i32 352, i32 33, i32 97, i32 161, i32 225, i32 289, i32 353, i32 34, i32 98, i32 162, i32 226, i32 290, i32 354, i32 35, i32 99, i32 163, i32 227, i32 291, i32 355, i32 36, i32 100, i32 164, i32 228, i32 292, i32 356, i32 37, i32 101, i32 165, i32 229, i32 293, i32 357, i32 38, i32 102, i32 166, i32 230, i32 294, i32 358, i32 39, i32 103, i32 167, i32 231, i32 295, i32 359, i32 40, i32 104, i32 168, i32 232, i32 296, i32 360, i32 41, i32 105, i32 169, i32 233, i32 297, i32 361, i32 42, i32 106, i32 170, i32 234, i32 298, i32 362, i32 43, i32 107, i32 171, i32 235, i32 299, i32 363, i32 44, i32 108, i32 172, i32 236, i32 300, i32 364, i32 45, i32 109, i32 173, i32 237, i32 301, i32 365, i32 46, i32 110, i32 174, i32 238, i32 302, i32 366, i32 47, i32 111, i32 175, i32 239, i32 303, i32 367, i32 48, i32 112, i32 176, i32 240, i32 304, i32 368, i32 49, i32 113, i32 177, i32 241, i32 305, i32 369, i32 50, i32 114, i32 178, i32 242, i32 306, i32 370, i32 51, i32 115, i32 179, i32 243, i32 307, i32 371, i32 52, i32 116, i32 180, i32 244, i32 308, i32 372, i32 53, i32 117, i32 181, i32 245, i32 309, i32 373, i32 54, i32 118, i32 182, i32 246, i32 310, i32 374, i32 55, i32 119, i32 183, i32 247, i32 311, i32 375, i32 56, i32 120, i32 184, i32 248, i32 312, i32 376, i32 57, i32 121, i32 185, i32 249, i32 313, i32 377, i32 58, i32 122, i32 186, i32 250, i32 314, i32 378, i32 59, i32 123, i32 187, i32 251, i32 315, i32 379, i32 60, i32 124, i32 188, i32 252, i32 316, i32 380, i32 61, i32 125, i32 189, i32 253, i32 317, i32 381, i32 62, i32 126, i32 190, i32 254, i32 318, i32 382, i32 63, i32 127, i32 191, i32 255, i32 319, i32 383>
store <384 x i32> %interleaved.vec, ptr %out.vec, align 64
ret void
}